zfs_vnops.c revision 331017
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29/* Portions Copyright 2007 Jeremy Teo */
30/* Portions Copyright 2010 Robert Milkowski */
31
32#include <sys/types.h>
33#include <sys/param.h>
34#include <sys/time.h>
35#include <sys/systm.h>
36#include <sys/sysmacros.h>
37#include <sys/resource.h>
38#include <sys/vfs.h>
39#include <sys/vm.h>
40#include <sys/vnode.h>
41#include <sys/file.h>
42#include <sys/stat.h>
43#include <sys/kmem.h>
44#include <sys/taskq.h>
45#include <sys/uio.h>
46#include <sys/atomic.h>
47#include <sys/namei.h>
48#include <sys/mman.h>
49#include <sys/cmn_err.h>
50#include <sys/errno.h>
51#include <sys/unistd.h>
52#include <sys/zfs_dir.h>
53#include <sys/zfs_ioctl.h>
54#include <sys/fs/zfs.h>
55#include <sys/dmu.h>
56#include <sys/dmu_objset.h>
57#include <sys/spa.h>
58#include <sys/txg.h>
59#include <sys/dbuf.h>
60#include <sys/zap.h>
61#include <sys/sa.h>
62#include <sys/dirent.h>
63#include <sys/policy.h>
64#include <sys/sunddi.h>
65#include <sys/filio.h>
66#include <sys/sid.h>
67#include <sys/zfs_ctldir.h>
68#include <sys/zfs_fuid.h>
69#include <sys/zfs_sa.h>
70#include <sys/zfs_rlock.h>
71#include <sys/extdirent.h>
72#include <sys/kidmap.h>
73#include <sys/bio.h>
74#include <sys/buf.h>
75#include <sys/sched.h>
76#include <sys/acl.h>
77#include <sys/vmmeter.h>
78#include <vm/vm_param.h>
79#include <sys/zil.h>
80
81/*
82 * Programming rules.
83 *
84 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
85 * properly lock its in-core state, create a DMU transaction, do the work,
86 * record this work in the intent log (ZIL), commit the DMU transaction,
87 * and wait for the intent log to commit if it is a synchronous operation.
88 * Moreover, the vnode ops must work in both normal and log replay context.
89 * The ordering of events is important to avoid deadlocks and references
90 * to freed memory.  The example below illustrates the following Big Rules:
91 *
92 *  (1)	A check must be made in each zfs thread for a mounted file system.
93 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
94 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
95 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
96 *	can return EIO from the calling function.
97 *
98 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
99 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
100 *	First, if it's the last reference, the vnode/znode
101 *	can be freed, so the zp may point to freed memory.  Second, the last
102 *	reference will call zfs_zinactive(), which may induce a lot of work --
103 *	pushing cached pages (which acquires range locks) and syncing out
104 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
105 *	which could deadlock the system if you were already holding one.
106 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
107 *
108 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
109 *	as they can span dmu_tx_assign() calls.
110 *
111 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
112 *      dmu_tx_assign().  This is critical because we don't want to block
113 *      while holding locks.
114 *
115 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
116 *	reduces lock contention and CPU usage when we must wait (note that if
117 *	throughput is constrained by the storage, nearly every transaction
118 *	must wait).
119 *
120 *      Note, in particular, that if a lock is sometimes acquired before
121 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
122 *      to use a non-blocking assign can deadlock the system.  The scenario:
123 *
124 *	Thread A has grabbed a lock before calling dmu_tx_assign().
125 *	Thread B is in an already-assigned tx, and blocks for this lock.
126 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
127 *	forever, because the previous txg can't quiesce until B's tx commits.
128 *
129 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
130 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
131 *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
132 *	to indicate that this operation has already called dmu_tx_wait().
133 *	This will ensure that we don't retry forever, waiting a short bit
134 *	each time.
135 *
136 *  (5)	If the operation succeeded, generate the intent log entry for it
137 *	before dropping locks.  This ensures that the ordering of events
138 *	in the intent log matches the order in which they actually occurred.
139 *	During ZIL replay the zfs_log_* functions will update the sequence
140 *	number to indicate the zil transaction has replayed.
141 *
142 *  (6)	At the end of each vnode op, the DMU tx must always commit,
143 *	regardless of whether there were any errors.
144 *
145 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
146 *	to ensure that synchronous semantics are provided when necessary.
147 *
148 * In general, this is how things should be ordered in each vnode op:
149 *
150 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
151 * top:
152 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
153 *	rw_enter(...);			// grab any other locks you need
154 *	tx = dmu_tx_create(...);	// get DMU tx
155 *	dmu_tx_hold_*();		// hold each object you might modify
156 *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
157 *	if (error) {
158 *		rw_exit(...);		// drop locks
159 *		zfs_dirent_unlock(dl);	// unlock directory entry
160 *		VN_RELE(...);		// release held vnodes
161 *		if (error == ERESTART) {
162 *			waited = B_TRUE;
163 *			dmu_tx_wait(tx);
164 *			dmu_tx_abort(tx);
165 *			goto top;
166 *		}
167 *		dmu_tx_abort(tx);	// abort DMU tx
168 *		ZFS_EXIT(zfsvfs);	// finished in zfs
169 *		return (error);		// really out of space
170 *	}
171 *	error = do_real_work();		// do whatever this VOP does
172 *	if (error == 0)
173 *		zfs_log_*(...);		// on success, make ZIL entry
174 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
175 *	rw_exit(...);			// drop locks
176 *	zfs_dirent_unlock(dl);		// unlock directory entry
177 *	VN_RELE(...);			// release held vnodes
178 *	zil_commit(zilog, foid);	// synchronous when necessary
179 *	ZFS_EXIT(zfsvfs);		// finished in zfs
180 *	return (error);			// done, report error
181 */
182
183/* ARGSUSED */
184static int
185zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
186{
187	znode_t	*zp = VTOZ(*vpp);
188	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
189
190	ZFS_ENTER(zfsvfs);
191	ZFS_VERIFY_ZP(zp);
192
193	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
194	    ((flag & FAPPEND) == 0)) {
195		ZFS_EXIT(zfsvfs);
196		return (SET_ERROR(EPERM));
197	}
198
199	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
200	    ZTOV(zp)->v_type == VREG &&
201	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
202		if (fs_vscan(*vpp, cr, 0) != 0) {
203			ZFS_EXIT(zfsvfs);
204			return (SET_ERROR(EACCES));
205		}
206	}
207
208	/* Keep a count of the synchronous opens in the znode */
209	if (flag & (FSYNC | FDSYNC))
210		atomic_inc_32(&zp->z_sync_cnt);
211
212	ZFS_EXIT(zfsvfs);
213	return (0);
214}
215
216/* ARGSUSED */
217static int
218zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
219    caller_context_t *ct)
220{
221	znode_t	*zp = VTOZ(vp);
222	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
223
224	/*
225	 * Clean up any locks held by this process on the vp.
226	 */
227	cleanlocks(vp, ddi_get_pid(), 0);
228	cleanshares(vp, ddi_get_pid());
229
230	ZFS_ENTER(zfsvfs);
231	ZFS_VERIFY_ZP(zp);
232
233	/* Decrement the synchronous opens in the znode */
234	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
235		atomic_dec_32(&zp->z_sync_cnt);
236
237	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
238	    ZTOV(zp)->v_type == VREG &&
239	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
240		VERIFY(fs_vscan(vp, cr, 1) == 0);
241
242	ZFS_EXIT(zfsvfs);
243	return (0);
244}
245
246/*
247 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
248 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
249 */
250static int
251zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
252{
253	znode_t	*zp = VTOZ(vp);
254	uint64_t noff = (uint64_t)*off; /* new offset */
255	uint64_t file_sz;
256	int error;
257	boolean_t hole;
258
259	file_sz = zp->z_size;
260	if (noff >= file_sz)  {
261		return (SET_ERROR(ENXIO));
262	}
263
264	if (cmd == _FIO_SEEK_HOLE)
265		hole = B_TRUE;
266	else
267		hole = B_FALSE;
268
269	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
270
271	if (error == ESRCH)
272		return (SET_ERROR(ENXIO));
273
274	/*
275	 * We could find a hole that begins after the logical end-of-file,
276	 * because dmu_offset_next() only works on whole blocks.  If the
277	 * EOF falls mid-block, then indicate that the "virtual hole"
278	 * at the end of the file begins at the logical EOF, rather than
279	 * at the end of the last block.
280	 */
281	if (noff > file_sz) {
282		ASSERT(hole);
283		noff = file_sz;
284	}
285
286	if (noff < *off)
287		return (error);
288	*off = noff;
289	return (error);
290}
291
/*
 * Handle the small set of ioctls ZFS supports on regular vnodes:
 * legacy bfu no-ops, SEEK_HOLE/SEEK_DATA, and (illumos only) the
 * filled-block count query.  Unknown commands return ENOTTY.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		/* File-system flush: nothing to do, report success. */
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
		/*
		 * On illumos "data" is a userland pointer and must be
		 * copied in/out; on FreeBSD the ioctl layer has already
		 * copied the argument into kernel space.
		 */
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}
391
/*
 * Look up the resident page of vp's VM object that backs byte offset
 * "start" and shared-busy it, sleeping out any exclusive busy holder.
 * Returns NULL if there is no valid resident page at that index.  On
 * success the object gains a paging-in-progress reference (released in
 * page_unbusy()), the page is write-protected in the pmap, and the
 * DEV_BSIZE-aligned interior of [off, off + nbytes) is marked clean.
 * Caller must hold the object write-locked; the lock may be dropped and
 * reacquired while sleeping on a busy page.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp != NULL) {
			/* Resident but invalid page: treat as not found. */
			ASSERT(!pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}
448
/*
 * Undo page_busy(): drop the shared busy on the page and release the
 * paging-in-progress reference taken on its object.
 */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}
456
/*
 * Look up the resident page of vp's VM object backing byte offset
 * "start" and take a hold reference on it (released in page_unhold()),
 * sleeping out any exclusive busy holder.  Returns NULL if there is no
 * valid resident page at that index.  Caller must hold the object
 * write-locked; the lock may be dropped and reacquired while sleeping
 * on a busy page.
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}
494
/*
 * Undo page_hold(): drop the hold reference taken on the page.
 */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}
503
504/*
505 * When a file is memory mapped, we must keep the IO data synchronized
506 * between the DMU cache and the memory mapped pages.  What this means:
507 *
508 * On Write:	If we find a memory mapped page, we write to *both*
509 *		the page and the dmu buffer.
510 */
511static void
512update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
513    int segflg, dmu_tx_t *tx)
514{
515	vm_object_t obj;
516	struct sf_buf *sf;
517	caddr_t va;
518	int off;
519
520	ASSERT(segflg != UIO_NOCOPY);
521	ASSERT(vp->v_mount != NULL);
522	obj = vp->v_object;
523	ASSERT(obj != NULL);
524
525	off = start & PAGEOFFSET;
526	zfs_vmobject_wlock(obj);
527	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
528		vm_page_t pp;
529		int nbytes = imin(PAGESIZE - off, len);
530
531		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
532			zfs_vmobject_wunlock(obj);
533
534			va = zfs_map_page(pp, &sf);
535			(void) dmu_read(os, oid, start+off, nbytes,
536			    va+off, DMU_READ_PREFETCH);;
537			zfs_unmap_page(sf);
538
539			zfs_vmobject_wlock(obj);
540			page_unbusy(pp);
541		}
542		len -= nbytes;
543		off = 0;
544	}
545	vm_object_pip_wakeupn(obj, 0);
546	zfs_vmobject_wunlock(obj);
547}
548
549/*
550 * Read with UIO_NOCOPY flag means that sendfile(2) requests
551 * ZFS to populate a range of page cache pages with data.
552 *
553 * NOTE: this function could be optimized to pre-allocate
554 * all pages in advance, drain exclusive busy on all of them,
555 * map them into contiguous KVA region and populate them
556 * in one single dmu_read() call.
557 */
558static int
559mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
560{
561	znode_t *zp = VTOZ(vp);
562	objset_t *os = zp->z_zfsvfs->z_os;
563	struct sf_buf *sf;
564	vm_object_t obj;
565	vm_page_t pp;
566	int64_t start;
567	caddr_t va;
568	int len = nbytes;
569	int off;
570	int error = 0;
571
572	ASSERT(uio->uio_segflg == UIO_NOCOPY);
573	ASSERT(vp->v_mount != NULL);
574	obj = vp->v_object;
575	ASSERT(obj != NULL);
576	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
577
578	zfs_vmobject_wlock(obj);
579	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
580		int bytes = MIN(PAGESIZE, len);
581
582		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
583		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
584		if (pp->valid == 0) {
585			zfs_vmobject_wunlock(obj);
586			va = zfs_map_page(pp, &sf);
587			error = dmu_read(os, zp->z_id, start, bytes, va,
588			    DMU_READ_PREFETCH);
589			if (bytes != PAGESIZE && error == 0)
590				bzero(va + bytes, PAGESIZE - bytes);
591			zfs_unmap_page(sf);
592			zfs_vmobject_wlock(obj);
593			vm_page_sunbusy(pp);
594			vm_page_lock(pp);
595			if (error) {
596				if (pp->wire_count == 0 && pp->valid == 0 &&
597				    !vm_page_busied(pp))
598					vm_page_free(pp);
599			} else {
600				pp->valid = VM_PAGE_BITS_ALL;
601				vm_page_activate(pp);
602			}
603			vm_page_unlock(pp);
604		} else {
605			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
606			vm_page_sunbusy(pp);
607		}
608		if (error)
609			break;
610		uio->uio_resid -= bytes;
611		uio->uio_offset += bytes;
612		len -= bytes;
613	}
614	zfs_vmobject_wunlock(obj);
615	return (error);
616}
617
618/*
619 * When a file is memory mapped, we must keep the IO data synchronized
620 * between the DMU cache and the memory mapped pages.  What this means:
621 *
622 * On Read:	We "read" preferentially from memory mapped pages,
623 *		else we default from the dmu buffer.
624 *
625 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
626 *	 the file is memory mapped.
627 */
628static int
629mappedread(vnode_t *vp, int nbytes, uio_t *uio)
630{
631	znode_t *zp = VTOZ(vp);
632	vm_object_t obj;
633	int64_t start;
634	caddr_t va;
635	int len = nbytes;
636	int off;
637	int error = 0;
638
639	ASSERT(vp->v_mount != NULL);
640	obj = vp->v_object;
641	ASSERT(obj != NULL);
642
643	start = uio->uio_loffset;
644	off = start & PAGEOFFSET;
645	zfs_vmobject_wlock(obj);
646	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
647		vm_page_t pp;
648		uint64_t bytes = MIN(PAGESIZE - off, len);
649
650		if (pp = page_hold(vp, start)) {
651			struct sf_buf *sf;
652			caddr_t va;
653
654			zfs_vmobject_wunlock(obj);
655			va = zfs_map_page(pp, &sf);
656#ifdef illumos
657			error = uiomove(va + off, bytes, UIO_READ, uio);
658#else
659			error = vn_io_fault_uiomove(va + off, bytes, uio);
660#endif
661			zfs_unmap_page(sf);
662			zfs_vmobject_wlock(obj);
663			page_unhold(pp);
664		} else {
665			zfs_vmobject_wunlock(obj);
666			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
667			    uio, bytes);
668			zfs_vmobject_wlock(obj);
669		}
670		len -= bytes;
671		off = 0;
672		if (error)
673			break;
674	}
675	zfs_vmobject_wunlock(obj);
676	return (error);
677}
678
/* Maximum bytes moved per transaction chunk in zfs_read(). */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Anti-virus quarantined files may not be read. */
	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	/* Zero-copy (xuio) read: pre-load ARC buffers for each block. */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	/*
	 * Copy data in zfs_read_chunk_size chunks, never crossing a
	 * chunk boundary, preferring cached pages when present.
	 */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}
830
831/*
832 * Write the bytes to a file.
833 *
834 *	IN:	vp	- vnode of file to be written to.
835 *		uio	- structure supplying write location, range info,
836 *			  and data buffer.
837 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
838 *			  set if in append mode.
839 *		cr	- credentials of caller.
840 *		ct	- caller context (NFS/CIFS fem monitor only)
841 *
842 *	OUT:	uio	- updated offset and range.
843 *
844 *	RETURN:	0 on success, error code on failure.
845 *
846 * Timestamps:
847 *	vp - ctime|mtime updated if byte count > 0
848 */
849
850/* ARGSUSED */
851static int
852zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
853{
854	znode_t		*zp = VTOZ(vp);
855	rlim64_t	limit = MAXOFFSET_T;
856	ssize_t		start_resid = uio->uio_resid;
857	ssize_t		tx_bytes;
858	uint64_t	end_size;
859	dmu_tx_t	*tx;
860	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
861	zilog_t		*zilog;
862	offset_t	woff;
863	ssize_t		n, nbytes;
864	rl_t		*rl;
865	int		max_blksz = zfsvfs->z_max_blksz;
866	int		error = 0;
867	arc_buf_t	*abuf;
868	iovec_t		*aiov = NULL;
869	xuio_t		*xuio = NULL;
870	int		i_iov = 0;
871	int		iovcnt = uio->uio_iovcnt;
872	iovec_t		*iovp = uio->uio_iov;
873	int		write_eof;
874	int		count = 0;
875	sa_bulk_attr_t	bulk[4];
876	uint64_t	mtime[2], ctime[2];
877
878	/*
879	 * Fasttrack empty write
880	 */
881	n = start_resid;
882	if (n == 0)
883		return (0);
884
885	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
886		limit = MAXOFFSET_T;
887
888	ZFS_ENTER(zfsvfs);
889	ZFS_VERIFY_ZP(zp);
890
891	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
892	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
893	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
894	    &zp->z_size, 8);
895	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
896	    &zp->z_pflags, 8);
897
898	/*
899	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
900	 * callers might not be able to detect properly that we are read-only,
901	 * so check it explicitly here.
902	 */
903	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
904		ZFS_EXIT(zfsvfs);
905		return (SET_ERROR(EROFS));
906	}
907
908	/*
909	 * If immutable or not appending then return EPERM.
910	 * Intentionally allow ZFS_READONLY through here.
911	 * See zfs_zaccess_common()
912	 */
913	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
914	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
915	    (uio->uio_loffset < zp->z_size))) {
916		ZFS_EXIT(zfsvfs);
917		return (SET_ERROR(EPERM));
918	}
919
920	zilog = zfsvfs->z_log;
921
922	/*
923	 * Validate file offset
924	 */
925	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
926	if (woff < 0) {
927		ZFS_EXIT(zfsvfs);
928		return (SET_ERROR(EINVAL));
929	}
930
931	/*
932	 * Check for mandatory locks before calling zfs_range_lock()
933	 * in order to prevent a deadlock with locks set via fcntl().
934	 */
935	if (MANDMODE((mode_t)zp->z_mode) &&
936	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
937		ZFS_EXIT(zfsvfs);
938		return (error);
939	}
940
941#ifdef illumos
942	/*
943	 * Pre-fault the pages to ensure slow (eg NFS) pages
944	 * don't hold up txg.
945	 * Skip this if uio contains loaned arc_buf.
946	 */
947	if ((uio->uio_extflg == UIO_XUIO) &&
948	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
949		xuio = (xuio_t *)uio;
950	else
951		uio_prefaultpages(MIN(n, max_blksz), uio);
952#endif
953
954	/*
955	 * If in append mode, set the io offset pointer to eof.
956	 */
957	if (ioflag & FAPPEND) {
958		/*
959		 * Obtain an appending range lock to guarantee file append
960		 * semantics.  We reset the write offset once we have the lock.
961		 */
962		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
963		woff = rl->r_off;
964		if (rl->r_len == UINT64_MAX) {
965			/*
966			 * We overlocked the file because this write will cause
967			 * the file block size to increase.
968			 * Note that zp_size cannot change with this lock held.
969			 */
970			woff = zp->z_size;
971		}
972		uio->uio_loffset = woff;
973	} else {
974		/*
975		 * Note that if the file block size will change as a result of
976		 * this write, then this range lock will lock the entire file
977		 * so that we can re-write the block safely.
978		 */
979		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
980	}
981
982	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
983		zfs_range_unlock(rl);
984		ZFS_EXIT(zfsvfs);
985		return (EFBIG);
986	}
987
988	if (woff >= limit) {
989		zfs_range_unlock(rl);
990		ZFS_EXIT(zfsvfs);
991		return (SET_ERROR(EFBIG));
992	}
993
994	if ((woff + n) > limit || woff > (limit - n))
995		n = limit - woff;
996
997	/* Will this write extend the file length? */
998	write_eof = (woff + n > zp->z_size);
999
1000	end_size = MAX(zp->z_size, woff + n);
1001
1002	/*
1003	 * Write the file in reasonable size chunks.  Each chunk is written
1004	 * in a separate transaction; this keeps the intent log records small
1005	 * and allows us to do more fine-grained space accounting.
1006	 */
1007	while (n > 0) {
1008		abuf = NULL;
1009		woff = uio->uio_loffset;
1010		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1011		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1012			if (abuf != NULL)
1013				dmu_return_arcbuf(abuf);
1014			error = SET_ERROR(EDQUOT);
1015			break;
1016		}
1017
1018		if (xuio && abuf == NULL) {
1019			ASSERT(i_iov < iovcnt);
1020			aiov = &iovp[i_iov];
1021			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1022			dmu_xuio_clear(xuio, i_iov);
1023			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1024			    iovec_t *, aiov, arc_buf_t *, abuf);
1025			ASSERT((aiov->iov_base == abuf->b_data) ||
1026			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1027			    aiov->iov_len == arc_buf_size(abuf)));
1028			i_iov++;
1029		} else if (abuf == NULL && n >= max_blksz &&
1030		    woff >= zp->z_size &&
1031		    P2PHASE(woff, max_blksz) == 0 &&
1032		    zp->z_blksz == max_blksz) {
1033			/*
1034			 * This write covers a full block.  "Borrow" a buffer
1035			 * from the dmu so that we can fill it before we enter
1036			 * a transaction.  This avoids the possibility of
1037			 * holding up the transaction if the data copy hangs
1038			 * up on a pagefault (e.g., from an NFS server mapping).
1039			 */
1040			size_t cbytes;
1041
1042			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1043			    max_blksz);
1044			ASSERT(abuf != NULL);
1045			ASSERT(arc_buf_size(abuf) == max_blksz);
1046			if (error = uiocopy(abuf->b_data, max_blksz,
1047			    UIO_WRITE, uio, &cbytes)) {
1048				dmu_return_arcbuf(abuf);
1049				break;
1050			}
1051			ASSERT(cbytes == max_blksz);
1052		}
1053
1054		/*
1055		 * Start a transaction.
1056		 */
1057		tx = dmu_tx_create(zfsvfs->z_os);
1058		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1059		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1060		zfs_sa_upgrade_txholds(tx, zp);
1061		error = dmu_tx_assign(tx, TXG_WAIT);
1062		if (error) {
1063			dmu_tx_abort(tx);
1064			if (abuf != NULL)
1065				dmu_return_arcbuf(abuf);
1066			break;
1067		}
1068
1069		/*
1070		 * If zfs_range_lock() over-locked we grow the blocksize
1071		 * and then reduce the lock range.  This will only happen
1072		 * on the first iteration since zfs_range_reduce() will
1073		 * shrink down r_len to the appropriate size.
1074		 */
1075		if (rl->r_len == UINT64_MAX) {
1076			uint64_t new_blksz;
1077
1078			if (zp->z_blksz > max_blksz) {
1079				/*
1080				 * File's blocksize is already larger than the
1081				 * "recordsize" property.  Only let it grow to
1082				 * the next power of 2.
1083				 */
1084				ASSERT(!ISP2(zp->z_blksz));
1085				new_blksz = MIN(end_size,
1086				    1 << highbit64(zp->z_blksz));
1087			} else {
1088				new_blksz = MIN(end_size, max_blksz);
1089			}
1090			zfs_grow_blocksize(zp, new_blksz, tx);
1091			zfs_range_reduce(rl, woff, n);
1092		}
1093
1094		/*
1095		 * XXX - should we really limit each write to z_max_blksz?
1096		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1097		 */
1098		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1099
1100		if (woff + nbytes > zp->z_size)
1101			vnode_pager_setsize(vp, woff + nbytes);
1102
1103		if (abuf == NULL) {
1104			tx_bytes = uio->uio_resid;
1105			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1106			    uio, nbytes, tx);
1107			tx_bytes -= uio->uio_resid;
1108		} else {
1109			tx_bytes = nbytes;
1110			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1111			/*
1112			 * If this is not a full block write, but we are
1113			 * extending the file past EOF and this data starts
1114			 * block-aligned, use assign_arcbuf().  Otherwise,
1115			 * write via dmu_write().
1116			 */
1117			if (tx_bytes < max_blksz && (!write_eof ||
1118			    aiov->iov_base != abuf->b_data)) {
1119				ASSERT(xuio);
1120				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1121				    aiov->iov_len, aiov->iov_base, tx);
1122				dmu_return_arcbuf(abuf);
1123				xuio_stat_wbuf_copied();
1124			} else {
1125				ASSERT(xuio || tx_bytes == max_blksz);
1126				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1127				    woff, abuf, tx);
1128			}
1129			ASSERT(tx_bytes <= uio->uio_resid);
1130			uioskip(uio, tx_bytes);
1131		}
1132		if (tx_bytes && vn_has_cached_data(vp)) {
1133			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1134			    zp->z_id, uio->uio_segflg, tx);
1135		}
1136
1137		/*
1138		 * If we made no progress, we're done.  If we made even
1139		 * partial progress, update the znode and ZIL accordingly.
1140		 */
1141		if (tx_bytes == 0) {
1142			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1143			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1144			dmu_tx_commit(tx);
1145			ASSERT(error != 0);
1146			break;
1147		}
1148
1149		/*
1150		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
1152		 *
		 * It would be nice to do this after all writes have
1154		 * been done, but that would still expose the ISUID/ISGID
1155		 * to another app after the partial write is committed.
1156		 *
1157		 * Note: we don't call zfs_fuid_map_id() here because
1158		 * user 0 is not an ephemeral uid.
1159		 */
1160		mutex_enter(&zp->z_acl_lock);
1161		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1162		    (S_IXUSR >> 6))) != 0 &&
1163		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1164		    secpolicy_vnode_setid_retain(vp, cr,
1165		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1166			uint64_t newmode;
1167			zp->z_mode &= ~(S_ISUID | S_ISGID);
1168			newmode = zp->z_mode;
1169			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1170			    (void *)&newmode, sizeof (uint64_t), tx);
1171		}
1172		mutex_exit(&zp->z_acl_lock);
1173
1174		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1175		    B_TRUE);
1176
1177		/*
1178		 * Update the file size (zp_size) if it has changed;
1179		 * account for possible concurrent updates.
1180		 */
1181		while ((end_size = zp->z_size) < uio->uio_loffset) {
1182			(void) atomic_cas_64(&zp->z_size, end_size,
1183			    uio->uio_loffset);
1184#ifdef illumos
1185			ASSERT(error == 0);
1186#else
1187			ASSERT(error == 0 || error == EFAULT);
1188#endif
1189		}
1190		/*
1191		 * If we are replaying and eof is non zero then force
1192		 * the file size to the specified eof. Note, there's no
1193		 * concurrency during replay.
1194		 */
1195		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1196			zp->z_size = zfsvfs->z_replay_eof;
1197
1198		if (error == 0)
1199			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1200		else
1201			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1202
1203		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1204		dmu_tx_commit(tx);
1205
1206		if (error != 0)
1207			break;
1208		ASSERT(tx_bytes == nbytes);
1209		n -= nbytes;
1210
1211#ifdef illumos
1212		if (!xuio && n > 0)
1213			uio_prefaultpages(MIN(n, max_blksz), uio);
1214#endif
1215	}
1216
1217	zfs_range_unlock(rl);
1218
1219	/*
1220	 * If we're in replay mode, or we made no progress, return error.
1221	 * Otherwise, it's at least a partial write, so it's successful.
1222	 */
1223	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1224		ZFS_EXIT(zfsvfs);
1225		return (error);
1226	}
1227
1228#ifdef __FreeBSD__
1229	/*
1230	 * EFAULT means that at least one page of the source buffer was not
1231	 * available.  VFS will re-try remaining I/O upon this error.
1232	 */
1233	if (error == EFAULT) {
1234		ZFS_EXIT(zfsvfs);
1235		return (error);
1236	}
1237#endif
1238
1239	if (ioflag & (FSYNC | FDSYNC) ||
1240	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1241		zil_commit(zilog, zp->z_id);
1242
1243	ZFS_EXIT(zfsvfs);
1244	return (0);
1245}
1246
/*
 * Completion callback for a TX_WRITE "get data" operation started by
 * zfs_get_data().  Releases everything zfs_get_data() acquired: the
 * dbuf hold (if any), the range lock, the znode's vnode reference,
 * and finally the zgd itself.  On success, the written block is
 * recorded in the log write block (lwb).
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	/* Drop the dbuf hold taken on the indirect-write path. */
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	/* On success, let the ZIL track the block just written. */
	if (error == 0 && zgd->zgd_bp)
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}
1269
#ifdef DEBUG
/*
 * Fault-injection knob for testing: when set non-zero, the next
 * indirect write in zfs_get_data() fails with EIO, and the knob
 * resets itself to 0.
 */
static int zil_fault_io = 0;
#endif
1273
1274/*
1275 * Get data to generate a TX_WRITE intent log record.
1276 */
1277int
1278zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1279{
1280	zfsvfs_t *zfsvfs = arg;
1281	objset_t *os = zfsvfs->z_os;
1282	znode_t *zp;
1283	uint64_t object = lr->lr_foid;
1284	uint64_t offset = lr->lr_offset;
1285	uint64_t size = lr->lr_length;
1286	dmu_buf_t *db;
1287	zgd_t *zgd;
1288	int error = 0;
1289
1290	ASSERT3P(lwb, !=, NULL);
1291	ASSERT3P(zio, !=, NULL);
1292	ASSERT3U(size, !=, 0);
1293
1294	/*
1295	 * Nothing to do if the file has been removed
1296	 */
1297	if (zfs_zget(zfsvfs, object, &zp) != 0)
1298		return (SET_ERROR(ENOENT));
1299	if (zp->z_unlinked) {
1300		/*
1301		 * Release the vnode asynchronously as we currently have the
1302		 * txg stopped from syncing.
1303		 */
1304		VN_RELE_ASYNC(ZTOV(zp),
1305		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1306		return (SET_ERROR(ENOENT));
1307	}
1308
1309	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1310	zgd->zgd_lwb = lwb;
1311	zgd->zgd_private = zp;
1312
1313	/*
1314	 * Write records come in two flavors: immediate and indirect.
1315	 * For small writes it's cheaper to store the data with the
1316	 * log record (immediate); for large writes it's cheaper to
1317	 * sync the data and get a pointer to it (indirect) so that
1318	 * we don't have to write the data twice.
1319	 */
1320	if (buf != NULL) { /* immediate write */
1321		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1322		/* test for truncation needs to be done while range locked */
1323		if (offset >= zp->z_size) {
1324			error = SET_ERROR(ENOENT);
1325		} else {
1326			error = dmu_read(os, object, offset, size, buf,
1327			    DMU_READ_NO_PREFETCH);
1328		}
1329		ASSERT(error == 0 || error == ENOENT);
1330	} else { /* indirect write */
1331		/*
1332		 * Have to lock the whole block to ensure when it's
1333		 * written out and its checksum is being calculated
1334		 * that no one can change the data. We need to re-check
1335		 * blocksize after we get the lock in case it's changed!
1336		 */
1337		for (;;) {
1338			uint64_t blkoff;
1339			size = zp->z_blksz;
1340			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1341			offset -= blkoff;
1342			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1343			    RL_READER);
1344			if (zp->z_blksz == size)
1345				break;
1346			offset += blkoff;
1347			zfs_range_unlock(zgd->zgd_rl);
1348		}
1349		/* test for truncation needs to be done while range locked */
1350		if (lr->lr_offset >= zp->z_size)
1351			error = SET_ERROR(ENOENT);
1352#ifdef DEBUG
1353		if (zil_fault_io) {
1354			error = SET_ERROR(EIO);
1355			zil_fault_io = 0;
1356		}
1357#endif
1358		if (error == 0)
1359			error = dmu_buf_hold(os, object, offset, zgd, &db,
1360			    DMU_READ_NO_PREFETCH);
1361
1362		if (error == 0) {
1363			blkptr_t *bp = &lr->lr_blkptr;
1364
1365			zgd->zgd_db = db;
1366			zgd->zgd_bp = bp;
1367
1368			ASSERT(db->db_offset == offset);
1369			ASSERT(db->db_size == size);
1370
1371			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1372			    zfs_get_done, zgd);
1373			ASSERT(error || lr->lr_length <= size);
1374
1375			/*
1376			 * On success, we need to wait for the write I/O
1377			 * initiated by dmu_sync() to complete before we can
1378			 * release this dbuf.  We will finish everything up
1379			 * in the zfs_get_done() callback.
1380			 */
1381			if (error == 0)
1382				return (0);
1383
1384			if (error == EALREADY) {
1385				lr->lr_common.lrc_txtype = TX_WRITE2;
1386				error = 0;
1387			}
1388		}
1389	}
1390
1391	zfs_get_done(zgd, error);
1392
1393	return (error);
1394}
1395
1396/*ARGSUSED*/
1397static int
1398zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1399    caller_context_t *ct)
1400{
1401	znode_t *zp = VTOZ(vp);
1402	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1403	int error;
1404
1405	ZFS_ENTER(zfsvfs);
1406	ZFS_VERIFY_ZP(zp);
1407
1408	if (flag & V_ACE_MASK)
1409		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1410	else
1411		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1412
1413	ZFS_EXIT(zfsvfs);
1414	return (error);
1415}
1416
/*
 * vn_vget_ino_gen() helper used for ".." lookups: treat "arg" as the
 * vnode to return, lock it with the requested flags, and drop the
 * reference on failure.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	struct vnode *vp = arg;
	int rc;

	*vpp = vp;
	rc = vn_lock(vp, lkflags);
	if (rc != 0) {
		vrele(vp);
	}
	return (rc);
}
1428
/*
 * Lock the vnode produced by a directory lookup, handling the "." and
 * ".." special cases where lock ordering between parent and child
 * matters.  On failure the reference on the looked-up vnode is
 * dropped; on success the vnode is returned locked as requested.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	/* Teardown lock must not be held here, except for xattr dirs. */
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		/* Lookup of "" or "." — the result is dvp itself. */
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			/* Convert dvp's lock to the requested type. */
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		/* Ordinary child entry: simply lock it. */
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}
1493
1494/*
1495 * Lookup an entry in a directory, or an extended attribute directory.
1496 * If it exists, return a held vnode reference for it.
1497 *
1498 *	IN:	dvp	- vnode of directory to search.
1499 *		nm	- name of entry to lookup.
1500 *		pnp	- full pathname to lookup [UNUSED].
1501 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1502 *		rdir	- root directory vnode [UNUSED].
1503 *		cr	- credentials of caller.
1504 *		ct	- caller context
1505 *
1506 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1507 *
1508 *	RETURN:	0 on success, error code on failure.
1509 *
1510 * Timestamps:
1511 *	NA
1512 */
1513/* ARGSUSED */
1514static int
1515zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1516    int nameiop, cred_t *cr, kthread_t *td, int flags)
1517{
1518	znode_t *zdp = VTOZ(dvp);
1519	znode_t *zp;
1520	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1521	int	error = 0;
1522
1523	/*
1524	 * Fast path lookup, however we must skip DNLC lookup
1525	 * for case folding or normalizing lookups because the
1526	 * DNLC code only stores the passed in name.  This means
1527	 * creating 'a' and removing 'A' on a case insensitive
1528	 * file system would work, but DNLC still thinks 'a'
1529	 * exists and won't let you create it again on the next
1530	 * pass through fast path.
1531	 */
1532	if (!(flags & LOOKUP_XATTR)) {
1533		if (dvp->v_type != VDIR) {
1534			return (SET_ERROR(ENOTDIR));
1535		} else if (zdp->z_sa_hdl == NULL) {
1536			return (SET_ERROR(EIO));
1537		}
1538	}
1539
1540	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1541
1542	ZFS_ENTER(zfsvfs);
1543	ZFS_VERIFY_ZP(zdp);
1544
1545	*vpp = NULL;
1546
1547	if (flags & LOOKUP_XATTR) {
1548#ifdef TODO
1549		/*
1550		 * If the xattr property is off, refuse the lookup request.
1551		 */
1552		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1553			ZFS_EXIT(zfsvfs);
1554			return (SET_ERROR(EINVAL));
1555		}
1556#endif
1557
1558		/*
1559		 * We don't allow recursive attributes..
1560		 * Maybe someday we will.
1561		 */
1562		if (zdp->z_pflags & ZFS_XATTR) {
1563			ZFS_EXIT(zfsvfs);
1564			return (SET_ERROR(EINVAL));
1565		}
1566
1567		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1568			ZFS_EXIT(zfsvfs);
1569			return (error);
1570		}
1571
1572		/*
1573		 * Do we have permission to get into attribute directory?
1574		 */
1575		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1576		    B_FALSE, cr)) {
1577			vrele(*vpp);
1578			*vpp = NULL;
1579		}
1580
1581		ZFS_EXIT(zfsvfs);
1582		return (error);
1583	}
1584
1585	/*
1586	 * Check accessibility of directory.
1587	 */
1588	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1589		ZFS_EXIT(zfsvfs);
1590		return (error);
1591	}
1592
1593	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1594	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1595		ZFS_EXIT(zfsvfs);
1596		return (SET_ERROR(EILSEQ));
1597	}
1598
1599
1600	/*
1601	 * First handle the special cases.
1602	 */
1603	if ((cnp->cn_flags & ISDOTDOT) != 0) {
1604		/*
1605		 * If we are a snapshot mounted under .zfs, return
1606		 * the vp for the snapshot directory.
1607		 */
1608		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1609			struct componentname cn;
1610			vnode_t *zfsctl_vp;
1611			int ltype;
1612
1613			ZFS_EXIT(zfsvfs);
1614			ltype = VOP_ISLOCKED(dvp);
1615			VOP_UNLOCK(dvp, 0);
1616			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1617			    &zfsctl_vp);
1618			if (error == 0) {
1619				cn.cn_nameptr = "snapshot";
1620				cn.cn_namelen = strlen(cn.cn_nameptr);
1621				cn.cn_nameiop = cnp->cn_nameiop;
1622				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
1623				cn.cn_lkflags = cnp->cn_lkflags;
1624				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1625				vput(zfsctl_vp);
1626			}
1627			vn_lock(dvp, ltype | LK_RETRY);
1628			return (error);
1629		}
1630	}
1631	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1632		ZFS_EXIT(zfsvfs);
1633		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1634			return (SET_ERROR(ENOTSUP));
1635		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1636		return (error);
1637	}
1638
1639	/*
1640	 * The loop is retry the lookup if the parent-child relationship
1641	 * changes during the dot-dot locking complexities.
1642	 */
1643	for (;;) {
1644		uint64_t parent;
1645
1646		error = zfs_dirlook(zdp, nm, &zp);
1647		if (error == 0)
1648			*vpp = ZTOV(zp);
1649
1650		ZFS_EXIT(zfsvfs);
1651		if (error != 0)
1652			break;
1653
1654		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1655		if (error != 0) {
1656			/*
1657			 * If we've got a locking error, then the vnode
1658			 * got reclaimed because of a force unmount.
1659			 * We never enter doomed vnodes into the name cache.
1660			 */
1661			*vpp = NULL;
1662			return (error);
1663		}
1664
1665		if ((cnp->cn_flags & ISDOTDOT) == 0)
1666			break;
1667
1668		ZFS_ENTER(zfsvfs);
1669		if (zdp->z_sa_hdl == NULL) {
1670			error = SET_ERROR(EIO);
1671		} else {
1672			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1673			    &parent, sizeof (parent));
1674		}
1675		if (error != 0) {
1676			ZFS_EXIT(zfsvfs);
1677			vput(ZTOV(zp));
1678			break;
1679		}
1680		if (zp->z_id == parent) {
1681			ZFS_EXIT(zfsvfs);
1682			break;
1683		}
1684		vput(ZTOV(zp));
1685	}
1686
1687out:
1688	if (error != 0)
1689		*vpp = NULL;
1690
1691	/* Translate errors and add SAVENAME when needed. */
1692	if (cnp->cn_flags & ISLASTCN) {
1693		switch (nameiop) {
1694		case CREATE:
1695		case RENAME:
1696			if (error == ENOENT) {
1697				error = EJUSTRETURN;
1698				cnp->cn_flags |= SAVENAME;
1699				break;
1700			}
1701			/* FALLTHROUGH */
1702		case DELETE:
1703			if (error == 0)
1704				cnp->cn_flags |= SAVENAME;
1705			break;
1706		}
1707	}
1708
1709	/* Insert name into cache (as non-existent) if appropriate. */
1710	if (zfsvfs->z_use_namecache &&
1711	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1712		cache_enter(dvp, NULL, cnp);
1713
1714	/* Insert name into cache if appropriate. */
1715	if (zfsvfs->z_use_namecache &&
1716	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1717		if (!(cnp->cn_flags & ISLASTCN) ||
1718		    (nameiop != DELETE && nameiop != RENAME)) {
1719			cache_enter(dvp, *vpp, cnp);
1720		}
1721	}
1722
1723	return (error);
1724}
1725
1726/*
1727 * Attempt to create a new entry in a directory.  If the entry
1728 * already exists, truncate the file if permissible, else return
1729 * an error.  Return the vp of the created or trunc'd file.
1730 *
1731 *	IN:	dvp	- vnode of directory to put new file entry in.
1732 *		name	- name of new file entry.
1733 *		vap	- attributes of new file.
1734 *		excl	- flag indicating exclusive or non-exclusive mode.
1735 *		mode	- mode to open file with.
1736 *		cr	- credentials of caller.
1737 *		flag	- large file flag [UNUSED].
1738 *		ct	- caller context
1739 *		vsecp	- ACL to be set
1740 *
1741 *	OUT:	vpp	- vnode of created or trunc'd entry.
1742 *
1743 *	RETURN:	0 on success, error code on failure.
1744 *
1745 * Timestamps:
1746 *	dvp - ctime|mtime updated if new entry created
1747 *	 vp - ctime|mtime always, atime if new
1748 */
1749
1750/* ARGSUSED */
1751static int
1752zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1753    vnode_t **vpp, cred_t *cr, kthread_t *td)
1754{
1755	znode_t		*zp, *dzp = VTOZ(dvp);
1756	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1757	zilog_t		*zilog;
1758	objset_t	*os;
1759	dmu_tx_t	*tx;
1760	int		error;
1761	ksid_t		*ksid;
1762	uid_t		uid;
1763	gid_t		gid = crgetgid(cr);
1764	zfs_acl_ids_t   acl_ids;
1765	boolean_t	fuid_dirtied;
1766	void		*vsecp = NULL;
1767	int		flag = 0;
1768	uint64_t	txtype;
1769
1770	/*
1771	 * If we have an ephemeral id, ACL, or XVATTR then
1772	 * make sure file system is at proper version
1773	 */
1774
1775	ksid = crgetsid(cr, KSID_OWNER);
1776	if (ksid)
1777		uid = ksid_getid(ksid);
1778	else
1779		uid = crgetuid(cr);
1780
1781	if (zfsvfs->z_use_fuids == B_FALSE &&
1782	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1783	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1784		return (SET_ERROR(EINVAL));
1785
1786	ZFS_ENTER(zfsvfs);
1787	ZFS_VERIFY_ZP(dzp);
1788	os = zfsvfs->z_os;
1789	zilog = zfsvfs->z_log;
1790
1791	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1792	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1793		ZFS_EXIT(zfsvfs);
1794		return (SET_ERROR(EILSEQ));
1795	}
1796
1797	if (vap->va_mask & AT_XVATTR) {
1798		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1799		    crgetuid(cr), cr, vap->va_type)) != 0) {
1800			ZFS_EXIT(zfsvfs);
1801			return (error);
1802		}
1803	}
1804
1805	*vpp = NULL;
1806
1807	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1808		vap->va_mode &= ~S_ISVTX;
1809
1810	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1811	if (error) {
1812		ZFS_EXIT(zfsvfs);
1813		return (error);
1814	}
1815	ASSERT3P(zp, ==, NULL);
1816
1817	/*
1818	 * Create a new file object and update the directory
1819	 * to reference it.
1820	 */
1821	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1822		goto out;
1823	}
1824
1825	/*
1826	 * We only support the creation of regular files in
1827	 * extended attribute directories.
1828	 */
1829
1830	if ((dzp->z_pflags & ZFS_XATTR) &&
1831	    (vap->va_type != VREG)) {
1832		error = SET_ERROR(EINVAL);
1833		goto out;
1834	}
1835
1836	if ((error = zfs_acl_ids_create(dzp, 0, vap,
1837	    cr, vsecp, &acl_ids)) != 0)
1838		goto out;
1839
1840	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1841		zfs_acl_ids_free(&acl_ids);
1842		error = SET_ERROR(EDQUOT);
1843		goto out;
1844	}
1845
1846	getnewvnode_reserve(1);
1847
1848	tx = dmu_tx_create(os);
1849
1850	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1851	    ZFS_SA_BASE_ATTR_SIZE);
1852
1853	fuid_dirtied = zfsvfs->z_fuid_dirty;
1854	if (fuid_dirtied)
1855		zfs_fuid_txhold(zfsvfs, tx);
1856	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1857	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1858	if (!zfsvfs->z_use_sa &&
1859	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1860		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1861		    0, acl_ids.z_aclp->z_acl_bytes);
1862	}
1863	error = dmu_tx_assign(tx, TXG_WAIT);
1864	if (error) {
1865		zfs_acl_ids_free(&acl_ids);
1866		dmu_tx_abort(tx);
1867		getnewvnode_drop_reserve();
1868		ZFS_EXIT(zfsvfs);
1869		return (error);
1870	}
1871	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1872
1873	if (fuid_dirtied)
1874		zfs_fuid_sync(zfsvfs, tx);
1875
1876	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1877	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1878	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1879	    vsecp, acl_ids.z_fuidp, vap);
1880	zfs_acl_ids_free(&acl_ids);
1881	dmu_tx_commit(tx);
1882
1883	getnewvnode_drop_reserve();
1884
1885out:
1886	if (error == 0) {
1887		*vpp = ZTOV(zp);
1888	}
1889
1890	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1891		zil_commit(zilog, 0);
1892
1893	ZFS_EXIT(zfsvfs);
1894	return (error);
1895}
1896
1897/*
1898 * Remove an entry from a directory.
1899 *
1900 *	IN:	dvp	- vnode of directory to remove entry from.
1901 *		name	- name of entry to remove.
1902 *		cr	- credentials of caller.
1903 *		ct	- caller context
1904 *		flags	- case flags
1905 *
1906 *	RETURN:	0 on success, error code on failure.
1907 *
1908 * Timestamps:
1909 *	dvp - ctime|mtime
1910 *	 vp - ctime (if nlink > 0)
1911 */
1912
1913/*ARGSUSED*/
1914static int
1915zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1916{
1917	znode_t		*dzp = VTOZ(dvp);
1918	znode_t		*zp = VTOZ(vp);
1919	znode_t		*xzp;
1920	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1921	zilog_t		*zilog;
1922	uint64_t	acl_obj, xattr_obj;
1923	uint64_t	obj = 0;
1924	dmu_tx_t	*tx;
1925	boolean_t	unlinked, toobig = FALSE;
1926	uint64_t	txtype;
1927	int		error;
1928
1929	ZFS_ENTER(zfsvfs);
1930	ZFS_VERIFY_ZP(dzp);
1931	ZFS_VERIFY_ZP(zp);
1932	zilog = zfsvfs->z_log;
1933	zp = VTOZ(vp);
1934
1935	xattr_obj = 0;
1936	xzp = NULL;
1937
1938	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1939		goto out;
1940	}
1941
1942	/*
1943	 * Need to use rmdir for removing directories.
1944	 */
1945	if (vp->v_type == VDIR) {
1946		error = SET_ERROR(EPERM);
1947		goto out;
1948	}
1949
1950	vnevent_remove(vp, dvp, name, ct);
1951
1952	obj = zp->z_id;
1953
1954	/* are there any extended attributes? */
1955	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1956	    &xattr_obj, sizeof (xattr_obj));
1957	if (error == 0 && xattr_obj) {
1958		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1959		ASSERT0(error);
1960	}
1961
1962	/*
1963	 * We may delete the znode now, or we may put it in the unlinked set;
1964	 * it depends on whether we're the last link, and on whether there are
1965	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1966	 * allow for either case.
1967	 */
1968	tx = dmu_tx_create(zfsvfs->z_os);
1969	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1970	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1971	zfs_sa_upgrade_txholds(tx, zp);
1972	zfs_sa_upgrade_txholds(tx, dzp);
1973
1974	if (xzp) {
1975		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1976		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1977	}
1978
1979	/* charge as an update -- would be nice not to charge at all */
1980	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1981
1982	/*
1983	 * Mark this transaction as typically resulting in a net free of space
1984	 */
1985	dmu_tx_mark_netfree(tx);
1986
1987	error = dmu_tx_assign(tx, TXG_WAIT);
1988	if (error) {
1989		dmu_tx_abort(tx);
1990		ZFS_EXIT(zfsvfs);
1991		return (error);
1992	}
1993
1994	/*
1995	 * Remove the directory entry.
1996	 */
1997	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
1998
1999	if (error) {
2000		dmu_tx_commit(tx);
2001		goto out;
2002	}
2003
2004	if (unlinked) {
2005		zfs_unlinked_add(zp, tx);
2006		vp->v_vflag |= VV_NOSYNC;
2007	}
2008
2009	txtype = TX_REMOVE;
2010	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2011
2012	dmu_tx_commit(tx);
2013out:
2014
2015	if (xzp)
2016		vrele(ZTOV(xzp));
2017
2018	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2019		zil_commit(zilog, 0);
2020
2021	ZFS_EXIT(zfsvfs);
2022	return (error);
2023}
2024
2025/*
2026 * Create a new directory and insert it into dvp using the name
2027 * provided.  Return a pointer to the inserted directory.
2028 *
2029 *	IN:	dvp	- vnode of directory to add subdir to.
2030 *		dirname	- name of new directory.
2031 *		vap	- attributes of new directory.
2032 *		cr	- credentials of caller.
2033 *		ct	- caller context
2034 *		flags	- case flags
2035 *		vsecp	- ACL to be set
2036 *
2037 *	OUT:	vpp	- vnode of created directory.
2038 *
2039 *	RETURN:	0 on success, error code on failure.
2040 *
2041 * Timestamps:
2042 *	dvp - ctime|mtime updated
2043 *	 vp - ctime|mtime|atime updated
2044 */
2045/*ARGSUSED*/
2046static int
2047zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2048{
2049	znode_t		*zp, *dzp = VTOZ(dvp);
2050	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2051	zilog_t		*zilog;
2052	uint64_t	txtype;
2053	dmu_tx_t	*tx;
2054	int		error;
2055	ksid_t		*ksid;
2056	uid_t		uid;
2057	gid_t		gid = crgetgid(cr);
2058	zfs_acl_ids_t   acl_ids;
2059	boolean_t	fuid_dirtied;
2060
2061	ASSERT(vap->va_type == VDIR);
2062
2063	/*
2064	 * If we have an ephemeral id, ACL, or XVATTR then
2065	 * make sure file system is at proper version
2066	 */
2067
2068	ksid = crgetsid(cr, KSID_OWNER);
2069	if (ksid)
2070		uid = ksid_getid(ksid);
2071	else
2072		uid = crgetuid(cr);
2073	if (zfsvfs->z_use_fuids == B_FALSE &&
2074	    ((vap->va_mask & AT_XVATTR) ||
2075	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2076		return (SET_ERROR(EINVAL));
2077
2078	ZFS_ENTER(zfsvfs);
2079	ZFS_VERIFY_ZP(dzp);
2080	zilog = zfsvfs->z_log;
2081
2082	if (dzp->z_pflags & ZFS_XATTR) {
2083		ZFS_EXIT(zfsvfs);
2084		return (SET_ERROR(EINVAL));
2085	}
2086
2087	if (zfsvfs->z_utf8 && u8_validate(dirname,
2088	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2089		ZFS_EXIT(zfsvfs);
2090		return (SET_ERROR(EILSEQ));
2091	}
2092
2093	if (vap->va_mask & AT_XVATTR) {
2094		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2095		    crgetuid(cr), cr, vap->va_type)) != 0) {
2096			ZFS_EXIT(zfsvfs);
2097			return (error);
2098		}
2099	}
2100
2101	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2102	    NULL, &acl_ids)) != 0) {
2103		ZFS_EXIT(zfsvfs);
2104		return (error);
2105	}
2106
2107	/*
2108	 * First make sure the new directory doesn't exist.
2109	 *
2110	 * Existence is checked first to make sure we don't return
2111	 * EACCES instead of EEXIST which can cause some applications
2112	 * to fail.
2113	 */
2114	*vpp = NULL;
2115
2116	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2117		zfs_acl_ids_free(&acl_ids);
2118		ZFS_EXIT(zfsvfs);
2119		return (error);
2120	}
2121	ASSERT3P(zp, ==, NULL);
2122
2123	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2124		zfs_acl_ids_free(&acl_ids);
2125		ZFS_EXIT(zfsvfs);
2126		return (error);
2127	}
2128
2129	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2130		zfs_acl_ids_free(&acl_ids);
2131		ZFS_EXIT(zfsvfs);
2132		return (SET_ERROR(EDQUOT));
2133	}
2134
2135	/*
2136	 * Add a new entry to the directory.
2137	 */
2138	getnewvnode_reserve(1);
2139	tx = dmu_tx_create(zfsvfs->z_os);
2140	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2141	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2142	fuid_dirtied = zfsvfs->z_fuid_dirty;
2143	if (fuid_dirtied)
2144		zfs_fuid_txhold(zfsvfs, tx);
2145	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2146		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2147		    acl_ids.z_aclp->z_acl_bytes);
2148	}
2149
2150	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2151	    ZFS_SA_BASE_ATTR_SIZE);
2152
2153	error = dmu_tx_assign(tx, TXG_WAIT);
2154	if (error) {
2155		zfs_acl_ids_free(&acl_ids);
2156		dmu_tx_abort(tx);
2157		getnewvnode_drop_reserve();
2158		ZFS_EXIT(zfsvfs);
2159		return (error);
2160	}
2161
2162	/*
2163	 * Create new node.
2164	 */
2165	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2166
2167	if (fuid_dirtied)
2168		zfs_fuid_sync(zfsvfs, tx);
2169
2170	/*
2171	 * Now put new name in parent dir.
2172	 */
2173	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2174
2175	*vpp = ZTOV(zp);
2176
2177	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2178	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2179	    acl_ids.z_fuidp, vap);
2180
2181	zfs_acl_ids_free(&acl_ids);
2182
2183	dmu_tx_commit(tx);
2184
2185	getnewvnode_drop_reserve();
2186
2187	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2188		zil_commit(zilog, 0);
2189
2190	ZFS_EXIT(zfsvfs);
2191	return (0);
2192}
2193
2194/*
2195 * Remove a directory subdir entry.  If the current working
2196 * directory is the same as the subdir to be removed, the
2197 * remove will fail.
2198 *
2199 *	IN:	dvp	- vnode of directory to remove from.
2200 *		name	- name of directory to be removed.
2201 *		cwd	- vnode of current working directory.
2202 *		cr	- credentials of caller.
2203 *		ct	- caller context
2204 *		flags	- case flags
2205 *
2206 *	RETURN:	0 on success, error code on failure.
2207 *
2208 * Timestamps:
2209 *	dvp - ctime|mtime updated
2210 */
2211/*ARGSUSED*/
2212static int
2213zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2214{
2215	znode_t		*dzp = VTOZ(dvp);
2216	znode_t		*zp = VTOZ(vp);
2217	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2218	zilog_t		*zilog;
2219	dmu_tx_t	*tx;
2220	int		error;
2221
2222	ZFS_ENTER(zfsvfs);
2223	ZFS_VERIFY_ZP(dzp);
2224	ZFS_VERIFY_ZP(zp);
2225	zilog = zfsvfs->z_log;
2226
2227
2228	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2229		goto out;
2230	}
2231
2232	if (vp->v_type != VDIR) {
2233		error = SET_ERROR(ENOTDIR);
2234		goto out;
2235	}
2236
2237	vnevent_rmdir(vp, dvp, name, ct);
2238
2239	tx = dmu_tx_create(zfsvfs->z_os);
2240	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2241	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2242	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2243	zfs_sa_upgrade_txholds(tx, zp);
2244	zfs_sa_upgrade_txholds(tx, dzp);
2245	dmu_tx_mark_netfree(tx);
2246	error = dmu_tx_assign(tx, TXG_WAIT);
2247	if (error) {
2248		dmu_tx_abort(tx);
2249		ZFS_EXIT(zfsvfs);
2250		return (error);
2251	}
2252
2253	cache_purge(dvp);
2254
2255	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2256
2257	if (error == 0) {
2258		uint64_t txtype = TX_RMDIR;
2259		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2260	}
2261
2262	dmu_tx_commit(tx);
2263
2264	cache_purge(vp);
2265out:
2266	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2267		zil_commit(zilog, 0);
2268
2269	ZFS_EXIT(zfsvfs);
2270	return (error);
2271}
2272
2273/*
2274 * Read as many directory entries as will fit into the provided
2275 * buffer from the given directory cursor position (specified in
2276 * the uio structure).
2277 *
2278 *	IN:	vp	- vnode of directory to read.
2279 *		uio	- structure supplying read location, range info,
2280 *			  and return buffer.
 *		cr	- credentials of caller.
 *		ncookies - request for NFS directory cookies (may be NULL).
 *		cookies	- NFS directory cookie output location (may be NULL).
2284 *
2285 *	OUT:	uio	- updated offset and range, buffer filled.
2286 *		eofp	- set to true if end-of-file detected.
2287 *
2288 *	RETURN:	0 on success, error code on failure.
2289 *
2290 * Timestamps:
2291 *	vp - atime updated
2292 *
2293 * Note that the low 4 bits of the cookie returned by zap is always zero.
2294 * This allows us to use the low range for "special" directory entries:
2295 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2296 * we use the offset 2 for the '.zfs' directory.
2297 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;		/* output cursor: extended-flag entries */
	dirent64_t	*odp;		/* output cursor: normal dirent64 entries */
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;		/* bounce buffer unless uio is 1-iov SYSSPACE */
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	uint64_t	parent;
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;
	uint8_t		type;
	int		ncooks;		/* remaining NFS cookie slots */
	u_long		*cooks = NULL;	/* NFS cookie array, if requested */
	int		flags = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Fetch the parent object number; needed for the ".." entry. */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.  Offsets 0..3 are the synthetic
	 * '.', '..', '.zfs' slots (see the block comment above); anything
	 * larger is a serialized ZAP cursor position.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * If the uio is a single kernel-space iovec we can fill it directly;
	 * otherwise build the entries in a bounce buffer and uiomove() later.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 * Size the cookie array for the worst case (all-minimal entries).
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}
	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.  ENOENT from the cursor means we
			 * ran off the end of the directory (EOF, not error).
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			/* Directory ZAP values must be single 64-bit ints. */
			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information
			 */
			znode_t	*ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				vrele(ZTOV(ezp));
				goto skip_entry;
			}
			vrele(ZTOV(ezp));
		}

		/* Record length depends on which entry format is in use. */
		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
			odp->d_type = type;
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0, 0,
			    ZIO_PRIORITY_SYNC_READ);

	skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 * Synthetic slots (., .., .zfs) advance by one; real entries
		 * advance the ZAP cursor and re-serialize it.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Each cookie is the offset of the entry *after* this one. */
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		/* Entries were written in place; just advance the uio. */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	/* ENOENT here means clean EOF, not failure. */
	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On failure, release the cookie array we handed to the caller. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}
2600
/*
 * Value stored into per-thread data (zfs_fsyncer_key) by zfs_fsync().
 * NOTE(review): the consumer is outside this file — presumably the ZIL
 * uses it to recognize fsync-heavy threads; confirm before relying on it.
 */
ulong_t zfs_fsync_sync_cnt = 4;
2602
2603static int
2604zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2605{
2606	znode_t	*zp = VTOZ(vp);
2607	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2608
2609	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2610
2611	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2612		ZFS_ENTER(zfsvfs);
2613		ZFS_VERIFY_ZP(zp);
2614		zil_commit(zfsvfs->z_log, zp->z_id);
2615		ZFS_EXIT(zfsvfs);
2616	}
2617	return (0);
2618}
2619
2620
2621/*
2622 * Get the requested file attributes and place them in the provided
2623 * vattr structure.
2624 *
2625 *	IN:	vp	- vnode of file.
2626 *		vap	- va_mask identifies requested attributes.
2627 *			  If AT_XVATTR set, then optional attrs are requested
2628 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2629 *		cr	- credentials of caller.
2630 *		ct	- caller context
2631 *
2632 *	OUT:	vap	- attribute values.
2633 *
2634 *	RETURN:	0 (always succeeds).
2635 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	uint64_t mtime[2], ctime[2], crtime[2], rdev;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	sa_bulk_attr_t bulk[4];			/* mtime, ctime, crtime, rdev */
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Map on-disk FUIDs to the uid/gid visible to this caller. */
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/* Fetch timestamps (and rdev for devices) in one SA bulk lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
		    &rdev, 8);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(zp->z_mode);
	vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
#else
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
#endif
	vap->va_nodeid = zp->z_id;
	/* The root gains an extra link when the '.zfs' ctldir is visible. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = zp->z_links + 1;
	else
		links = zp->z_links;
	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
	vap->va_size = zp->z_size;
#ifdef illumos
	vap->va_rdev = vp->v_rdev;
#else
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		vap->va_rdev = zfs_cmpldev(rdev);
#endif
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_filerev = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		/* Scanstamps only exist for regular files. */
		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}
	}

	/* Decode the on-disk [seconds, nanoseconds] pairs into timespecs. */
	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);


	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
2832
2833/*
2834 * Set the file attributes to the values contained in the
2835 * vattr structure.
2836 *
2837 *	IN:	vp	- vnode of file to be modified.
2838 *		vap	- new attribute values.
2839 *			  If AT_XVATTR set, then optional attrs are being set
2840 *		flags	- ATTR_UTIME set if non-default time values provided.
2841 *			- ATTR_NOACLCHECK (CIFS context only).
2842 *		cr	- credentials of caller.
2843 *		ct	- caller context
2844 *
2845 *	RETURN:	0 on success, error code on failure.
2846 *
2847 * Timestamps:
2848 *	vp - ctime updated, mtime updated if size changed.
2849 */
2850/* ARGSUSED */
2851static int
2852zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2853    caller_context_t *ct)
2854{
2855	znode_t		*zp = VTOZ(vp);
2856	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2857	zilog_t		*zilog;
2858	dmu_tx_t	*tx;
2859	vattr_t		oldva;
2860	xvattr_t	tmpxvattr;
2861	uint_t		mask = vap->va_mask;
2862	uint_t		saved_mask = 0;
2863	uint64_t	saved_mode;
2864	int		trim_mask = 0;
2865	uint64_t	new_mode;
2866	uint64_t	new_uid, new_gid;
2867	uint64_t	xattr_obj;
2868	uint64_t	mtime[2], ctime[2];
2869	znode_t		*attrzp;
2870	int		need_policy = FALSE;
2871	int		err, err2;
2872	zfs_fuid_info_t *fuidp = NULL;
2873	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2874	xoptattr_t	*xoap;
2875	zfs_acl_t	*aclp;
2876	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2877	boolean_t	fuid_dirtied = B_FALSE;
2878	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2879	int		count = 0, xattr_count = 0;
2880
2881	if (mask == 0)
2882		return (0);
2883
2884	if (mask & AT_NOSET)
2885		return (SET_ERROR(EINVAL));
2886
2887	ZFS_ENTER(zfsvfs);
2888	ZFS_VERIFY_ZP(zp);
2889
2890	zilog = zfsvfs->z_log;
2891
2892	/*
2893	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2894	 * that file system is at proper version level
2895	 */
2896
2897	if (zfsvfs->z_use_fuids == B_FALSE &&
2898	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2899	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2900	    (mask & AT_XVATTR))) {
2901		ZFS_EXIT(zfsvfs);
2902		return (SET_ERROR(EINVAL));
2903	}
2904
2905	if (mask & AT_SIZE && vp->v_type == VDIR) {
2906		ZFS_EXIT(zfsvfs);
2907		return (SET_ERROR(EISDIR));
2908	}
2909
2910	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2911		ZFS_EXIT(zfsvfs);
2912		return (SET_ERROR(EINVAL));
2913	}
2914
2915	/*
2916	 * If this is an xvattr_t, then get a pointer to the structure of
2917	 * optional attributes.  If this is NULL, then we have a vattr_t.
2918	 */
2919	xoap = xva_getxoptattr(xvap);
2920
2921	xva_init(&tmpxvattr);
2922
2923	/*
2924	 * Immutable files can only alter immutable bit and atime
2925	 */
2926	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2927	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2928	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2929		ZFS_EXIT(zfsvfs);
2930		return (SET_ERROR(EPERM));
2931	}
2932
2933	/*
2934	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2935	 */
2936
2937	/*
2938	 * Verify timestamps doesn't overflow 32 bits.
2939	 * ZFS can handle large timestamps, but 32bit syscalls can't
2940	 * handle times greater than 2039.  This check should be removed
2941	 * once large timestamps are fully supported.
2942	 */
2943	if (mask & (AT_ATIME | AT_MTIME)) {
2944		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2945		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2946			ZFS_EXIT(zfsvfs);
2947			return (SET_ERROR(EOVERFLOW));
2948		}
2949	}
2950	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
2951	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
2952		ZFS_EXIT(zfsvfs);
2953		return (SET_ERROR(EOVERFLOW));
2954	}
2955
2956	attrzp = NULL;
2957	aclp = NULL;
2958
2959	/* Can this be moved to before the top label? */
2960	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2961		ZFS_EXIT(zfsvfs);
2962		return (SET_ERROR(EROFS));
2963	}
2964
2965	/*
2966	 * First validate permissions
2967	 */
2968
2969	if (mask & AT_SIZE) {
2970		/*
2971		 * XXX - Note, we are not providing any open
2972		 * mode flags here (like FNDELAY), so we may
2973		 * block if there are locks present... this
2974		 * should be addressed in openat().
2975		 */
2976		/* XXX - would it be OK to generate a log record here? */
2977		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2978		if (err) {
2979			ZFS_EXIT(zfsvfs);
2980			return (err);
2981		}
2982	}
2983
2984	if (mask & (AT_ATIME|AT_MTIME) ||
2985	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2986	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2987	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2988	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2989	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2990	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2991	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2992		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2993		    skipaclchk, cr);
2994	}
2995
2996	if (mask & (AT_UID|AT_GID)) {
2997		int	idmask = (mask & (AT_UID|AT_GID));
2998		int	take_owner;
2999		int	take_group;
3000
3001		/*
3002		 * NOTE: even if a new mode is being set,
3003		 * we may clear S_ISUID/S_ISGID bits.
3004		 */
3005
3006		if (!(mask & AT_MODE))
3007			vap->va_mode = zp->z_mode;
3008
3009		/*
3010		 * Take ownership or chgrp to group we are a member of
3011		 */
3012
3013		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3014		take_group = (mask & AT_GID) &&
3015		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3016
3017		/*
3018		 * If both AT_UID and AT_GID are set then take_owner and
3019		 * take_group must both be set in order to allow taking
3020		 * ownership.
3021		 *
3022		 * Otherwise, send the check through secpolicy_vnode_setattr()
3023		 *
3024		 */
3025
3026		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3027		    ((idmask == AT_UID) && take_owner) ||
3028		    ((idmask == AT_GID) && take_group)) {
3029			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3030			    skipaclchk, cr) == 0) {
3031				/*
3032				 * Remove setuid/setgid for non-privileged users
3033				 */
3034				secpolicy_setid_clear(vap, vp, cr);
3035				trim_mask = (mask & (AT_UID|AT_GID));
3036			} else {
3037				need_policy =  TRUE;
3038			}
3039		} else {
3040			need_policy =  TRUE;
3041		}
3042	}
3043
3044	oldva.va_mode = zp->z_mode;
3045	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3046	if (mask & AT_XVATTR) {
3047		/*
3048		 * Update xvattr mask to include only those attributes
3049		 * that are actually changing.
3050		 *
3051		 * the bits will be restored prior to actually setting
3052		 * the attributes so the caller thinks they were set.
3053		 */
3054		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3055			if (xoap->xoa_appendonly !=
3056			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3057				need_policy = TRUE;
3058			} else {
3059				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3060				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3061			}
3062		}
3063
3064		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3065			if (xoap->xoa_nounlink !=
3066			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3067				need_policy = TRUE;
3068			} else {
3069				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3070				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3071			}
3072		}
3073
3074		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3075			if (xoap->xoa_immutable !=
3076			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3077				need_policy = TRUE;
3078			} else {
3079				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3080				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3081			}
3082		}
3083
3084		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3085			if (xoap->xoa_nodump !=
3086			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3087				need_policy = TRUE;
3088			} else {
3089				XVA_CLR_REQ(xvap, XAT_NODUMP);
3090				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3091			}
3092		}
3093
3094		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3095			if (xoap->xoa_av_modified !=
3096			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3097				need_policy = TRUE;
3098			} else {
3099				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3100				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3101			}
3102		}
3103
3104		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3105			if ((vp->v_type != VREG &&
3106			    xoap->xoa_av_quarantined) ||
3107			    xoap->xoa_av_quarantined !=
3108			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3109				need_policy = TRUE;
3110			} else {
3111				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3112				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3113			}
3114		}
3115
3116		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3117			ZFS_EXIT(zfsvfs);
3118			return (SET_ERROR(EPERM));
3119		}
3120
3121		if (need_policy == FALSE &&
3122		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3123		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3124			need_policy = TRUE;
3125		}
3126	}
3127
3128	if (mask & AT_MODE) {
3129		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3130			err = secpolicy_setid_setsticky_clear(vp, vap,
3131			    &oldva, cr);
3132			if (err) {
3133				ZFS_EXIT(zfsvfs);
3134				return (err);
3135			}
3136			trim_mask |= AT_MODE;
3137		} else {
3138			need_policy = TRUE;
3139		}
3140	}
3141
3142	if (need_policy) {
3143		/*
3144		 * If trim_mask is set then take ownership
3145		 * has been granted or write_acl is present and user
3146		 * has the ability to modify mode.  In that case remove
3147		 * UID|GID and or MODE from mask so that
3148		 * secpolicy_vnode_setattr() doesn't revoke it.
3149		 */
3150
3151		if (trim_mask) {
3152			saved_mask = vap->va_mask;
3153			vap->va_mask &= ~trim_mask;
3154			if (trim_mask & AT_MODE) {
3155				/*
3156				 * Save the mode, as secpolicy_vnode_setattr()
3157				 * will overwrite it with ova.va_mode.
3158				 */
3159				saved_mode = vap->va_mode;
3160			}
3161		}
3162		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3163		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3164		if (err) {
3165			ZFS_EXIT(zfsvfs);
3166			return (err);
3167		}
3168
3169		if (trim_mask) {
3170			vap->va_mask |= saved_mask;
3171			if (trim_mask & AT_MODE) {
3172				/*
3173				 * Recover the mode after
3174				 * secpolicy_vnode_setattr().
3175				 */
3176				vap->va_mode = saved_mode;
3177			}
3178		}
3179	}
3180
3181	/*
3182	 * secpolicy_vnode_setattr, or take ownership may have
3183	 * changed va_mask
3184	 */
3185	mask = vap->va_mask;
3186
3187	if ((mask & (AT_UID | AT_GID))) {
3188		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3189		    &xattr_obj, sizeof (xattr_obj));
3190
3191		if (err == 0 && xattr_obj) {
3192			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3193			if (err == 0) {
3194				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3195				if (err != 0)
3196					vrele(ZTOV(attrzp));
3197			}
3198			if (err)
3199				goto out2;
3200		}
3201		if (mask & AT_UID) {
3202			new_uid = zfs_fuid_create(zfsvfs,
3203			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3204			if (new_uid != zp->z_uid &&
3205			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3206				if (attrzp)
3207					vput(ZTOV(attrzp));
3208				err = SET_ERROR(EDQUOT);
3209				goto out2;
3210			}
3211		}
3212
3213		if (mask & AT_GID) {
3214			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3215			    cr, ZFS_GROUP, &fuidp);
3216			if (new_gid != zp->z_gid &&
3217			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3218				if (attrzp)
3219					vput(ZTOV(attrzp));
3220				err = SET_ERROR(EDQUOT);
3221				goto out2;
3222			}
3223		}
3224	}
3225	tx = dmu_tx_create(zfsvfs->z_os);
3226
3227	if (mask & AT_MODE) {
3228		uint64_t pmode = zp->z_mode;
3229		uint64_t acl_obj;
3230		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3231
3232		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3233		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3234			err = SET_ERROR(EPERM);
3235			goto out;
3236		}
3237
3238		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3239			goto out;
3240
3241		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3242			/*
3243			 * Are we upgrading ACL from old V0 format
3244			 * to V1 format?
3245			 */
3246			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3247			    zfs_znode_acl_version(zp) ==
3248			    ZFS_ACL_VERSION_INITIAL) {
3249				dmu_tx_hold_free(tx, acl_obj, 0,
3250				    DMU_OBJECT_END);
3251				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3252				    0, aclp->z_acl_bytes);
3253			} else {
3254				dmu_tx_hold_write(tx, acl_obj, 0,
3255				    aclp->z_acl_bytes);
3256			}
3257		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3258			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3259			    0, aclp->z_acl_bytes);
3260		}
3261		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3262	} else {
3263		if ((mask & AT_XVATTR) &&
3264		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3265			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3266		else
3267			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3268	}
3269
3270	if (attrzp) {
3271		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3272	}
3273
3274	fuid_dirtied = zfsvfs->z_fuid_dirty;
3275	if (fuid_dirtied)
3276		zfs_fuid_txhold(zfsvfs, tx);
3277
3278	zfs_sa_upgrade_txholds(tx, zp);
3279
3280	err = dmu_tx_assign(tx, TXG_WAIT);
3281	if (err)
3282		goto out;
3283
3284	count = 0;
3285	/*
3286	 * Set each attribute requested.
3287	 * We group settings according to the locks they need to acquire.
3288	 *
3289	 * Note: you cannot set ctime directly, although it will be
3290	 * updated as a side-effect of calling this function.
3291	 */
3292
3293	if (mask & (AT_UID|AT_GID|AT_MODE))
3294		mutex_enter(&zp->z_acl_lock);
3295
3296	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3297	    &zp->z_pflags, sizeof (zp->z_pflags));
3298
3299	if (attrzp) {
3300		if (mask & (AT_UID|AT_GID|AT_MODE))
3301			mutex_enter(&attrzp->z_acl_lock);
3302		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3303		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3304		    sizeof (attrzp->z_pflags));
3305	}
3306
3307	if (mask & (AT_UID|AT_GID)) {
3308
3309		if (mask & AT_UID) {
3310			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3311			    &new_uid, sizeof (new_uid));
3312			zp->z_uid = new_uid;
3313			if (attrzp) {
3314				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3315				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3316				    sizeof (new_uid));
3317				attrzp->z_uid = new_uid;
3318			}
3319		}
3320
3321		if (mask & AT_GID) {
3322			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3323			    NULL, &new_gid, sizeof (new_gid));
3324			zp->z_gid = new_gid;
3325			if (attrzp) {
3326				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3327				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3328				    sizeof (new_gid));
3329				attrzp->z_gid = new_gid;
3330			}
3331		}
3332		if (!(mask & AT_MODE)) {
3333			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3334			    NULL, &new_mode, sizeof (new_mode));
3335			new_mode = zp->z_mode;
3336		}
3337		err = zfs_acl_chown_setattr(zp);
3338		ASSERT(err == 0);
3339		if (attrzp) {
3340			err = zfs_acl_chown_setattr(attrzp);
3341			ASSERT(err == 0);
3342		}
3343	}
3344
3345	if (mask & AT_MODE) {
3346		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3347		    &new_mode, sizeof (new_mode));
3348		zp->z_mode = new_mode;
3349		ASSERT3U((uintptr_t)aclp, !=, 0);
3350		err = zfs_aclset_common(zp, aclp, cr, tx);
3351		ASSERT0(err);
3352		if (zp->z_acl_cached)
3353			zfs_acl_free(zp->z_acl_cached);
3354		zp->z_acl_cached = aclp;
3355		aclp = NULL;
3356	}
3357
3358
3359	if (mask & AT_ATIME) {
3360		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3361		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3362		    &zp->z_atime, sizeof (zp->z_atime));
3363	}
3364
3365	if (mask & AT_MTIME) {
3366		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3367		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3368		    mtime, sizeof (mtime));
3369	}
3370
3371	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3372	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3373		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3374		    NULL, mtime, sizeof (mtime));
3375		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3376		    &ctime, sizeof (ctime));
3377		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3378		    B_TRUE);
3379	} else if (mask != 0) {
3380		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3381		    &ctime, sizeof (ctime));
3382		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3383		    B_TRUE);
3384		if (attrzp) {
3385			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3386			    SA_ZPL_CTIME(zfsvfs), NULL,
3387			    &ctime, sizeof (ctime));
3388			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3389			    mtime, ctime, B_TRUE);
3390		}
3391	}
3392	/*
3393	 * Do this after setting timestamps to prevent timestamp
3394	 * update from toggling bit
3395	 */
3396
3397	if (xoap && (mask & AT_XVATTR)) {
3398
3399		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
3400			xoap->xoa_createtime = vap->va_birthtime;
3401		/*
3402		 * restore trimmed off masks
3403		 * so that return masks can be set for caller.
3404		 */
3405
3406		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3407			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3408		}
3409		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3410			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3411		}
3412		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3413			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3414		}
3415		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3416			XVA_SET_REQ(xvap, XAT_NODUMP);
3417		}
3418		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3419			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3420		}
3421		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3422			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3423		}
3424
3425		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3426			ASSERT(vp->v_type == VREG);
3427
3428		zfs_xvattr_set(zp, xvap, tx);
3429	}
3430
3431	if (fuid_dirtied)
3432		zfs_fuid_sync(zfsvfs, tx);
3433
3434	if (mask != 0)
3435		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3436
3437	if (mask & (AT_UID|AT_GID|AT_MODE))
3438		mutex_exit(&zp->z_acl_lock);
3439
3440	if (attrzp) {
3441		if (mask & (AT_UID|AT_GID|AT_MODE))
3442			mutex_exit(&attrzp->z_acl_lock);
3443	}
3444out:
3445	if (err == 0 && attrzp) {
3446		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3447		    xattr_count, tx);
3448		ASSERT(err2 == 0);
3449	}
3450
3451	if (attrzp)
3452		vput(ZTOV(attrzp));
3453
3454	if (aclp)
3455		zfs_acl_free(aclp);
3456
3457	if (fuidp) {
3458		zfs_fuid_info_free(fuidp);
3459		fuidp = NULL;
3460	}
3461
3462	if (err) {
3463		dmu_tx_abort(tx);
3464	} else {
3465		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3466		dmu_tx_commit(tx);
3467	}
3468
3469out2:
3470	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3471		zil_commit(zilog, 0);
3472
3473	ZFS_EXIT(zfsvfs);
3474	return (err);
3475}
3476
3477/*
3478 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3479 * fail to acquire any lock in the path we will drop all held locks,
3480 * acquire the new lock in a blocking fashion, and then release it and
3481 * restart the rename.  This acquire/release step ensures that we do not
3482 * spin on a lock waiting for release.  On error release all vnode locks
3483 * and decrement references the way tmpfs_rename() would do.
3484 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/*
	 * The caller enters with tdvp (and *tvpp, when distinct) locked;
	 * drop those locks so the relock loop below can always take the
	 * locks in the same (sdvp-first) order.
	 */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	/* sdvp is the only vnode locked with a blocking acquisition. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	/*
	 * Try tdvp without blocking.  On EBUSY, drop sdvp, wait for
	 * tdvp with a blocking lock, release it immediately, and
	 * restart — this avoids spinning while preserving lock order.
	 */
	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		/* "." and ".." sources are rejected with EINVAL. */
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		/* Drop everything; only EBUSY merits a blocking retry. */
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		/* Remember the freshly resolved source for the retry. */
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	/* Hand the re-resolved, locked source vnode back to the caller. */
	vrele(*svpp);
	*svpp = nvp;

	/* Replace any stale *tvpp reference with the re-resolved tvp. */
	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp, 0);
			VOP_UNLOCK(tdvp, 0);
			VOP_UNLOCK(*svpp, 0);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			/* Drop both the lock and the hold, then restart. */
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	/* Success: sdvp, tdvp, *svpp and *tvpp (if any) are all locked. */
	return (0);

out:
	return (error);
}
3654
3655/*
3656 * Note that we must use VRELE_ASYNC in this function as it walks
3657 * up the directory tree and vrele may need to acquire an exclusive
3658 * lock if a last reference to a vnode is dropped.
3659 */
static int
zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*zp, *zp1;
	uint64_t	parent;
	int		error;

	zfsvfs = tdzp->z_zfsvfs;
	/* Renaming a directory into itself is never legal. */
	if (tdzp == szp)
		return (SET_ERROR(EINVAL));
	/*
	 * Fast paths: a rename within the same directory, or into the
	 * filesystem root, can not create a cycle.
	 */
	if (tdzp == sdzp)
		return (0);
	if (tdzp->z_id == zfsvfs->z_root)
		return (0);
	zp = tdzp;
	/*
	 * Walk up from the target directory via the stored PARENT
	 * attribute; encountering szp means the rename would make the
	 * source directory an ancestor of itself (e.g. /usr/a/b ->
	 * /usr/a/b/c/d), which must be rejected.
	 */
	for (;;) {
		ASSERT(!zp->z_unlinked);
		if ((error = sa_lookup(zp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			break;

		if (parent == szp->z_id) {
			/* The target tree lives under the source. */
			error = SET_ERROR(EINVAL);
			break;
		}
		/* Reaching the root or the source's parent proves safety. */
		if (parent == zfsvfs->z_root)
			break;
		if (parent == sdzp->z_id)
			break;

		error = zfs_zget(zfsvfs, parent, &zp1);
		if (error != 0)
			break;

		/* Release the intermediate hold before stepping up. */
		if (zp != tdzp)
			VN_RELE_ASYNC(ZTOV(zp),
			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
		zp = zp1;
	}

	if (error == ENOTDIR)
		panic("checkpath: .. not a directory\n");
	/* Drop the hold on the last znode fetched by the walk, if any. */
	if (zp != tdzp)
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
	return (error);
}
3708
3709/*
3710 * Move an entry from the provided source directory to the target
3711 * directory.  Change the entry name as indicated.
3712 *
3713 *	IN:	sdvp	- Source directory containing the "old entry".
3714 *		snm	- Old entry name.
3715 *		tdvp	- Target directory to contain the "new entry".
3716 *		tnm	- New entry name.
3717 *		cr	- credentials of caller.
3718 *		ct	- caller context
3719 *		flags	- case flags
3720 *
3721 *	RETURN:	0 on success, error code on failure.
3722 *
3723 * Timestamps:
3724 *	sdvp,tdvp - ctime|mtime updated
3725 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	zilog_t		*zilog = NULL;
	dmu_tx_t	*tx;
	char		*snm = scnp->cn_nameptr;
	char		*tnm = tcnp->cn_nameptr;
	int		error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/* The .zfs control tree can not be a rename target. */
	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	 * After we re-enter ZFS_ENTER() we will have to revalidate all
	 * znodes involved.
	 */
	ZFS_ENTER(zfsvfs);

	/* The target name must be valid UTF-8 when the fs enforces it. */
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto unlockout;
	}

	/* Refuse to move a mount point, or to rename over one. */
	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto unlockout;
	}

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/* Revalidate the source and (optional) target znodes as well. */
	szp = VTOZ(*svpp);
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto unlockout;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto unlockout;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			error = EINVAL;
			goto unlockout;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_check(szp, sdzp, tdzp))
			goto unlockout;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto unlockout;
			} else {
				/* Replacing a directory; purge name caches. */
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto unlockout;
			}
		}
	}

	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Assemble a single transaction covering both directories, the
	 * renamed znode and (when overwriting) the displaced target.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	/* The unlinked set may gain an entry if tzp is destroyed below. */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto unlockout;
	}


	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
		if (error == 0) {
			/* Invalidate name-cache entries made stale above. */
			cache_purge(*svpp);
			if (*tvpp != NULL)
				cache_purge(*tvpp);
			cache_purge_negative(tdvp);
		}
	}

	dmu_tx_commit(tx);

unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

out:				/* original two vnodes are locked */
	/*
	 * NOTE: zfsvfs may be uninitialized when jumping here from the
	 * early EXDEV checks; the error == 0 short-circuit guarantees it
	 * is never dereferenced in that case.
	 */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp, 0);
	return (error);
}
3968
3969/*
3970 * Insert the indicated symbolic reference entry into the directory.
3971 *
3972 *	IN:	dvp	- Directory to contain new symbolic link.
3973 *		link	- Name for new symlink entry.
3974 *		vap	- Attributes of new entry.
3975 *		cr	- credentials of caller.
3976 *		ct	- caller context
3977 *		flags	- case flags
3978 *
3979 *	RETURN:	0 on success, error code on failure.
3980 *
3981 * Timestamps:
3982 *	dvp - ctime|mtime updated
3983 */
3984/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	int		flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Reject names that are not valid UTF-8 when the fs enforces it. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/* The link target itself is bounded by MAXPATHLEN. */
	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENAMETOOLONG));
	}

	/* Work out ACL/ownership identifiers for the new object up front. */
	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Caller must be allowed to add entries to the directory. */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/* Reserve a vnode before starting the transaction. */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	/* Large ACLs that do not fit in SA space spill into a data write. */
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datsets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Store the link target either as an SA or in the data object. */
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);

	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	*vpp = ZTOV(zp);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	/* Honor sync=always by committing the ZIL before returning. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
4106
4107/*
4108 * Return, in the buffer contained in the provided uio structure,
4109 * the symbolic path referred to by vp.
4110 *
4111 *	IN:	vp	- vnode of symbolic link.
4112 *		uio	- structure to contain the link path.
4113 *		cr	- credentials of caller.
4114 *		ct	- caller context
4115 *
4116 *	OUT:	uio	- structure containing the link path.
4117 *
4118 *	RETURN:	0 on success, error code on failure.
4119 *
4120 * Timestamps:
4121 *	vp - atime updated
4122 */
4123/* ARGSUSED */
4124static int
4125zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4126{
4127	znode_t		*zp = VTOZ(vp);
4128	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4129	int		error;
4130
4131	ZFS_ENTER(zfsvfs);
4132	ZFS_VERIFY_ZP(zp);
4133
4134	if (zp->z_is_sa)
4135		error = sa_lookup_uio(zp->z_sa_hdl,
4136		    SA_ZPL_SYMLINK(zfsvfs), uio);
4137	else
4138		error = zfs_sa_readlink(zp, uio);
4139
4140	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4141
4142	ZFS_EXIT(zfsvfs);
4143	return (error);
4144}
4145
4146/*
4147 * Insert a new entry into directory tdvp referencing svp.
4148 *
4149 *	IN:	tdvp	- Directory to contain new entry.
4150 *		svp	- vnode of new entry.
4151 *		name	- name of new entry.
4152 *		cr	- credentials of caller.
4153 *		ct	- caller context
4154 *
4155 *	RETURN:	0 on success, error code on failure.
4156 *
4157 * Timestamps:
4158 *	tdvp - ctime|mtime updated
4159 *	 svp - ctime updated
4160 */
4161/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;
	uint64_t	parent;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	/* Immutable/append-only/read-only files can not gain links. */
	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* The new name must be valid UTF-8 when the fs enforces it. */
	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}


	/* Non-owners need link privilege per the basic security policy. */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Caller must be allowed to add entries to the target directory. */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* One transaction covers the directory entry and both znodes. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, dzp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dzp, name, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	/* Honor sync=always by committing the ZIL before returning. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
4279
4280
4281/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	if (zp->z_unlinked) {
		/*
		 * Fast path to recycle a vnode of a removed file.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	/*
	 * Push a pending atime update to disk before the vnode goes
	 * inactive.  NOTE(review): z_unlinked was already tested above,
	 * so the z_unlinked == 0 recheck looks redundant; it is kept as
	 * written — confirm no concurrent unlink can flip it before
	 * removing.
	 */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			dmu_tx_commit(tx);
		}
	}
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
4326
4327
/*
 * Both fid flavors must fit inside the generic struct fid, since
 * zfs_fid() casts the caller-supplied fid_t and writes through it.
 */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4330
4331/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	gen64;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i, error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Fetch the generation number stored with the znode. */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	gen = (uint32_t)gen64;

	/*
	 * A long fid is used when this fs is not its own parent —
	 * presumably a dataset reached under another (e.g. via .zfs);
	 * TODO confirm.
	 */
	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;

#ifdef illumos
	if (fidp->fid_len < size) {
		/* Tell the caller how much space is actually needed. */
		fidp->fid_len = size;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}
#else
	fidp->fid_len = size;
#endif

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Pack the object number byte-by-byte, least significant first. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		/* Long fids additionally identify the objset. */
		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
4396
/*
 * Report configurable pathname variables (pathconf-style queries) for
 * the given vnode; names not handled here get EOPNOTSUPP.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp, *xzp;
	zfsvfs_t	*zfsvfs;
	int		error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);
#ifdef illumos
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		/* Probe the hidden extended-attribute directory. */
		error = zfs_dirent_lookup(zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED);
		if (error == 0) {
			if (!zfs_dirempty(xzp))
				*valp = 1;
			vrele(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACCESS_FILTERING:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
		    vp->v_type == VDIR;
		return (0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACE_ENABLED;
		return (0);
#endif	/* illumos */
	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);
#ifdef illumos
	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		return (0);
#endif
	case _PC_ACL_EXTENDED:
		*valp = 0;
		return (0);

	/* ZFS exposes NFSv4-style ACLs, not POSIX.1e extended ACLs. */
	case _PC_ACL_NFS4:
		*valp = 1;
		return (0);

	case _PC_ACL_PATH_MAX:
		*valp = ACL_MAX_ENTRIES;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}
4476
4477/*ARGSUSED*/
4478static int
4479zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4480    caller_context_t *ct)
4481{
4482	znode_t *zp = VTOZ(vp);
4483	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4484	int error;
4485	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4486
4487	ZFS_ENTER(zfsvfs);
4488	ZFS_VERIFY_ZP(zp);
4489	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4490	ZFS_EXIT(zfsvfs);
4491
4492	return (error);
4493}
4494
4495/*ARGSUSED*/
4496int
4497zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4498    caller_context_t *ct)
4499{
4500	znode_t *zp = VTOZ(vp);
4501	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4502	int error;
4503	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4504	zilog_t	*zilog = zfsvfs->z_log;
4505
4506	ZFS_ENTER(zfsvfs);
4507	ZFS_VERIFY_ZP(zp);
4508
4509	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4510
4511	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4512		zil_commit(zilog, 0);
4513
4514	ZFS_EXIT(zfsvfs);
4515	return (error);
4516}
4517
/*
 * Page-in handler backing VOP_GETPAGES.
 *
 * Reads the 'count' pages in 'ma' (plus up to *rbehind pages before and
 * *rahead pages after, when those pointers are non-NULL) from the DMU.
 * On success *rbehind/*rahead are updated to the number of optional pages
 * actually read.  Returns one of the zfs_vm_pagerret_* codes.
 */
static int
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
    int *rahead)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zp->z_zfsvfs->z_os;
	rl_t *rl;
	vm_object_t object;
	off_t start, end, obj_size;
	uint_t blksz;
	int pgsin_b, pgsin_a;	/* optional pages read behind / ahead */
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Byte range covered by the required pages. */
	start = IDX_TO_OFF(ma[0]->pindex);
	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);

	/*
	 * Lock a range covering all required and optional pages.
	 * Note that we need to handle the case of the block size growing.
	 */
	for (;;) {
		blksz = zp->z_blksz;
		rl = zfs_range_lock(zp, rounddown(start, blksz),
		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
		/* Retry if the block size changed while we were blocked. */
		if (blksz == zp->z_blksz)
			break;
		zfs_range_unlock(rl);
	}

	object = ma[0]->object;
	zfs_vmobject_wlock(object);
	obj_size = object->un_pager.vnp.vnp_size;
	zfs_vmobject_wunlock(object);
	/* The last required page must lie within the file. */
	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_bad);
	}

	/* Read-behind: pages from the block boundary up to 'start'. */
	pgsin_b = 0;
	if (rbehind != NULL) {
		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
		pgsin_b = MIN(*rbehind, pgsin_b);
	}

	/* Read-ahead: pages from 'end' to the block boundary, clipped to EOF. */
	pgsin_a = 0;
	if (rahead != NULL) {
		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
		pgsin_a = MIN(*rahead, pgsin_a);
	}

	/*
	 * NB: we need to pass the exact byte size of the data that we expect
	 * to read after accounting for the file size.  This is required because
	 * ZFS will panic if we request DMU to read beyond the end of the last
	 * allocated block.
	 */
	error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
	    MIN(end, obj_size) - (end - PAGE_SIZE));

	zfs_range_unlock(rl);
	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);

	if (error != 0)
		return (zfs_vm_pagerret_error);

	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a);
	if (rbehind != NULL)
		*rbehind = pgsin_b;
	if (rahead != NULL)
		*rahead = pgsin_a;
	return (zfs_vm_pagerret_ok);
}
4599
4600static int
4601zfs_freebsd_getpages(ap)
4602	struct vop_getpages_args /* {
4603		struct vnode *a_vp;
4604		vm_page_t *a_m;
4605		int a_count;
4606		int *a_rbehind;
4607		int *a_rahead;
4608	} */ *ap;
4609{
4610
4611	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
4612	    ap->a_rahead));
4613}
4614
/*
 * Page-out handler backing VOP_PUTPAGES.
 *
 * Writes the dirty pages in 'ma' (covering 'len' bytes) to the DMU inside
 * a single transaction, clips the request at EOF, updates mtime/ctime and
 * logs the write.  Per-page results are returned through 'rtvals'; the
 * function's return value is rtvals[0].
 */
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
    int *rtvals)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	rl_t		*rl;
	dmu_tx_t	*tx;
	struct sf_buf	*sf;
	vm_object_t	object;
	vm_page_t	m;
	caddr_t		va;
	size_t		tocopy;
	size_t		lo_len;
	vm_ooffset_t	lo_off;
	vm_ooffset_t	off;
	uint_t		blksz;
	int		ncount;		/* pages remaining after EOF clip */
	int		pcount;		/* pages originally requested */
	int		err;
	int		i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	object = vp->v_object;
	pcount = btoc(len);
	ncount = pcount;

	KASSERT(ma[0]->object == object, ("mismatching object"));
	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

	/* Assume failure; entries are overwritten on success below. */
	for (i = 0; i < pcount; i++)
		rtvals[i] = zfs_vm_pagerret_error;

	/* Take the range lock on a block-aligned span covering all pages. */
	off = IDX_TO_OFF(ma[0]->pindex);
	blksz = zp->z_blksz;
	lo_off = rounddown(off, blksz);
	lo_len = roundup(len + (off - lo_off), blksz);
	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);

	/* Clip the request so it does not extend past the file size. */
	zfs_vmobject_wlock(object);
	if (len + off > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > off) {
			int pgoff;

			len = object->un_pager.vnp.vnp_size - off;
			ncount = btoc(len);
			if ((pgoff = (int)len & PAGE_MASK) != 0) {
				/*
				 * If the object is locked and the following
				 * conditions hold, then the page's dirty
				 * field cannot be concurrently changed by a
				 * pmap operation.
				 */
				m = ma[ncount - 1];
				vm_page_assert_sbusied(m);
				KASSERT(!pmap_page_is_write_mapped(m),
				    ("zfs_putpages: page %p is not read-only", m));
				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
				    pgoff);
			}
		} else {
			/* Entire request lies beyond EOF. */
			len = 0;
			ncount = 0;
		}
		if (ncount < pcount) {
			for (i = ncount; i < pcount; i++) {
				rtvals[i] = zfs_vm_pagerret_bad;
			}
		}
	}
	zfs_vmobject_wunlock(object);

	if (ncount == 0)
		goto out;

	/* Refuse the write if the owner or group is over quota. */
	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		goto out;
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	if (zp->z_blksz < PAGE_SIZE) {
		/* Sub-page blocks: copy each page through a transient mapping. */
		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
			va = zfs_map_page(ma[i], &sf);
			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
			zfs_unmap_page(sf);
		}
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
	}

	if (err == 0) {
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		/* Update timestamps/flags and log the write in the same tx. */
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);

		/* Mark the written pages clean and report success. */
		zfs_vmobject_wlock(object);
		for (i = 0; i < ncount; i++) {
			rtvals[i] = zfs_vm_pagerret_ok;
			vm_page_undirty(ma[i]);
		}
		zfs_vmobject_wunlock(object);
		PCPU_INC(cnt.v_vnodeout);
		PCPU_ADD(cnt.v_vnodepgsout, ncount);
	}
	dmu_tx_commit(tx);

out:
	zfs_range_unlock(rl);
	/* Synchronous or invalidating page-outs must hit stable storage. */
	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (rtvals[0]);
}
4755
4756int
4757zfs_freebsd_putpages(ap)
4758	struct vop_putpages_args /* {
4759		struct vnode *a_vp;
4760		vm_page_t *a_m;
4761		int a_count;
4762		int a_sync;
4763		int *a_rtvals;
4764	} */ *ap;
4765{
4766
4767	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4768	    ap->a_rtvals));
4769}
4770
4771static int
4772zfs_freebsd_bmap(ap)
4773	struct vop_bmap_args /* {
4774		struct vnode *a_vp;
4775		daddr_t  a_bn;
4776		struct bufobj **a_bop;
4777		daddr_t *a_bnp;
4778		int *a_runp;
4779		int *a_runb;
4780	} */ *ap;
4781{
4782
4783	if (ap->a_bop != NULL)
4784		*ap->a_bop = &ap->a_vp->v_bufobj;
4785	if (ap->a_bnp != NULL)
4786		*ap->a_bnp = ap->a_bn;
4787	if (ap->a_runp != NULL)
4788		*ap->a_runp = 0;
4789	if (ap->a_runb != NULL)
4790		*ap->a_runb = 0;
4791
4792	return (0);
4793}
4794
4795static int
4796zfs_freebsd_open(ap)
4797	struct vop_open_args /* {
4798		struct vnode *a_vp;
4799		int a_mode;
4800		struct ucred *a_cred;
4801		struct thread *a_td;
4802	} */ *ap;
4803{
4804	vnode_t	*vp = ap->a_vp;
4805	znode_t *zp = VTOZ(vp);
4806	int error;
4807
4808	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4809	if (error == 0)
4810		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4811	return (error);
4812}
4813
4814static int
4815zfs_freebsd_close(ap)
4816	struct vop_close_args /* {
4817		struct vnode *a_vp;
4818		int  a_fflag;
4819		struct ucred *a_cred;
4820		struct thread *a_td;
4821	} */ *ap;
4822{
4823
4824	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4825}
4826
4827static int
4828zfs_freebsd_ioctl(ap)
4829	struct vop_ioctl_args /* {
4830		struct vnode *a_vp;
4831		u_long a_command;
4832		caddr_t a_data;
4833		int a_fflag;
4834		struct ucred *cred;
4835		struct thread *td;
4836	} */ *ap;
4837{
4838
4839	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4840	    ap->a_fflag, ap->a_cred, NULL, NULL));
4841}
4842
4843static int
4844ioflags(int ioflags)
4845{
4846	int flags = 0;
4847
4848	if (ioflags & IO_APPEND)
4849		flags |= FAPPEND;
4850	if (ioflags & IO_NDELAY)
4851		flags |= FNONBLOCK;
4852	if (ioflags & IO_SYNC)
4853		flags |= (FSYNC | FDSYNC | FRSYNC);
4854
4855	return (flags);
4856}
4857
4858static int
4859zfs_freebsd_read(ap)
4860	struct vop_read_args /* {
4861		struct vnode *a_vp;
4862		struct uio *a_uio;
4863		int a_ioflag;
4864		struct ucred *a_cred;
4865	} */ *ap;
4866{
4867
4868	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4869	    ap->a_cred, NULL));
4870}
4871
4872static int
4873zfs_freebsd_write(ap)
4874	struct vop_write_args /* {
4875		struct vnode *a_vp;
4876		struct uio *a_uio;
4877		int a_ioflag;
4878		struct ucred *a_cred;
4879	} */ *ap;
4880{
4881
4882	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4883	    ap->a_cred, NULL));
4884}
4885
4886static int
4887zfs_freebsd_access(ap)
4888	struct vop_access_args /* {
4889		struct vnode *a_vp;
4890		accmode_t a_accmode;
4891		struct ucred *a_cred;
4892		struct thread *a_td;
4893	} */ *ap;
4894{
4895	vnode_t *vp = ap->a_vp;
4896	znode_t *zp = VTOZ(vp);
4897	accmode_t accmode;
4898	int error = 0;
4899
4900	/*
4901	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4902	 */
4903	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4904	if (accmode != 0)
4905		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4906
4907	/*
4908	 * VADMIN has to be handled by vaccess().
4909	 */
4910	if (error == 0) {
4911		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4912		if (accmode != 0) {
4913			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4914			    zp->z_gid, accmode, ap->a_cred, NULL);
4915		}
4916	}
4917
4918	/*
4919	 * For VEXEC, ensure that at least one execute bit is set for
4920	 * non-directories.
4921	 */
4922	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4923	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4924		error = EACCES;
4925	}
4926
4927	return (error);
4928}
4929
4930static int
4931zfs_freebsd_lookup(ap)
4932	struct vop_lookup_args /* {
4933		struct vnode *a_dvp;
4934		struct vnode **a_vpp;
4935		struct componentname *a_cnp;
4936	} */ *ap;
4937{
4938	struct componentname *cnp = ap->a_cnp;
4939	char nm[NAME_MAX + 1];
4940
4941	ASSERT(cnp->cn_namelen < sizeof(nm));
4942	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4943
4944	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4945	    cnp->cn_cred, cnp->cn_thread, 0));
4946}
4947
4948static int
4949zfs_cache_lookup(ap)
4950	struct vop_lookup_args /* {
4951		struct vnode *a_dvp;
4952		struct vnode **a_vpp;
4953		struct componentname *a_cnp;
4954	} */ *ap;
4955{
4956	zfsvfs_t *zfsvfs;
4957
4958	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4959	if (zfsvfs->z_use_namecache)
4960		return (vfs_cache_lookup(ap));
4961	else
4962		return (zfs_freebsd_lookup(ap));
4963}
4964
4965static int
4966zfs_freebsd_create(ap)
4967	struct vop_create_args /* {
4968		struct vnode *a_dvp;
4969		struct vnode **a_vpp;
4970		struct componentname *a_cnp;
4971		struct vattr *a_vap;
4972	} */ *ap;
4973{
4974	zfsvfs_t *zfsvfs;
4975	struct componentname *cnp = ap->a_cnp;
4976	vattr_t *vap = ap->a_vap;
4977	int error, mode;
4978
4979	ASSERT(cnp->cn_flags & SAVENAME);
4980
4981	vattr_init_mask(vap);
4982	mode = vap->va_mode & ALLPERMS;
4983	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4984
4985	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
4986	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
4987	if (zfsvfs->z_use_namecache &&
4988	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
4989		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
4990	return (error);
4991}
4992
4993static int
4994zfs_freebsd_remove(ap)
4995	struct vop_remove_args /* {
4996		struct vnode *a_dvp;
4997		struct vnode *a_vp;
4998		struct componentname *a_cnp;
4999	} */ *ap;
5000{
5001
5002	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5003
5004	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5005	    ap->a_cnp->cn_cred));
5006}
5007
5008static int
5009zfs_freebsd_mkdir(ap)
5010	struct vop_mkdir_args /* {
5011		struct vnode *a_dvp;
5012		struct vnode **a_vpp;
5013		struct componentname *a_cnp;
5014		struct vattr *a_vap;
5015	} */ *ap;
5016{
5017	vattr_t *vap = ap->a_vap;
5018
5019	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5020
5021	vattr_init_mask(vap);
5022
5023	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5024	    ap->a_cnp->cn_cred));
5025}
5026
5027static int
5028zfs_freebsd_rmdir(ap)
5029	struct vop_rmdir_args /* {
5030		struct vnode *a_dvp;
5031		struct vnode *a_vp;
5032		struct componentname *a_cnp;
5033	} */ *ap;
5034{
5035	struct componentname *cnp = ap->a_cnp;
5036
5037	ASSERT(cnp->cn_flags & SAVENAME);
5038
5039	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5040}
5041
5042static int
5043zfs_freebsd_readdir(ap)
5044	struct vop_readdir_args /* {
5045		struct vnode *a_vp;
5046		struct uio *a_uio;
5047		struct ucred *a_cred;
5048		int *a_eofflag;
5049		int *a_ncookies;
5050		u_long **a_cookies;
5051	} */ *ap;
5052{
5053
5054	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5055	    ap->a_ncookies, ap->a_cookies));
5056}
5057
5058static int
5059zfs_freebsd_fsync(ap)
5060	struct vop_fsync_args /* {
5061		struct vnode *a_vp;
5062		int a_waitfor;
5063		struct thread *a_td;
5064	} */ *ap;
5065{
5066
5067	vop_stdfsync(ap);
5068	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5069}
5070
/*
 * VOP_GETATTR: fetch the attributes via zfs_getattr() with the extended
 * (xvattr) set requested, then translate the ZFS extended system
 * attributes into FreeBSD chflags(2) flags in va_flags.
 */
static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	/* Wrap the caller's vattr in an xvattr and ask for XVATTR data too. */
	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	XVA_SET_REQ(&xvap, XAT_READONLY);
	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
	XVA_SET_REQ(&xvap, XAT_SYSTEM);
	XVA_SET_REQ(&xvap, XAT_HIDDEN);
	XVA_SET_REQ(&xvap, XAT_REPARSE);
	XVA_SET_REQ(&xvap, XAT_OFFLINE);
	XVA_SET_REQ(&xvap, XAT_SPARSE);

	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
	if (error != 0)
		return (error);

	/* Convert ZFS xattr into chflags. */
#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
		fflags |= (fflag);					\
} while (0)
	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
	    xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
	    xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
	    xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
	    xvap.xva_xoptattrs.xoa_archive);
	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
	    xvap.xva_xoptattrs.xoa_nodump);
	FLAG_CHECK(UF_READONLY, XAT_READONLY,
	    xvap.xva_xoptattrs.xoa_readonly);
	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
	    xvap.xva_xoptattrs.xoa_system);
	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
	    xvap.xva_xoptattrs.xoa_hidden);
	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
	    xvap.xva_xoptattrs.xoa_reparse);
	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
	    xvap.xva_xoptattrs.xoa_offline);
	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
	    xvap.xva_xoptattrs.xoa_sparse);

#undef	FLAG_CHECK
	/* Hand the (possibly updated) attributes back to the caller. */
	*vap = xvap.xva_vattr;
	vap->va_flags = fflags;
	return (0);
}
5139
5140static int
5141zfs_freebsd_setattr(ap)
5142	struct vop_setattr_args /* {
5143		struct vnode *a_vp;
5144		struct vattr *a_vap;
5145		struct ucred *a_cred;
5146	} */ *ap;
5147{
5148	vnode_t *vp = ap->a_vp;
5149	vattr_t *vap = ap->a_vap;
5150	cred_t *cred = ap->a_cred;
5151	xvattr_t xvap;
5152	u_long fflags;
5153	uint64_t zflags;
5154
5155	vattr_init_mask(vap);
5156	vap->va_mask &= ~AT_NOSET;
5157
5158	xva_init(&xvap);
5159	xvap.xva_vattr = *vap;
5160
5161	zflags = VTOZ(vp)->z_pflags;
5162
5163	if (vap->va_flags != VNOVAL) {
5164		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5165		int error;
5166
5167		if (zfsvfs->z_use_fuids == B_FALSE)
5168			return (EOPNOTSUPP);
5169
5170		fflags = vap->va_flags;
5171		/*
5172		 * XXX KDM
5173		 * We need to figure out whether it makes sense to allow
5174		 * UF_REPARSE through, since we don't really have other
5175		 * facilities to handle reparse points and zfs_setattr()
5176		 * doesn't currently allow setting that attribute anyway.
5177		 */
5178		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5179		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5180		     UF_OFFLINE|UF_SPARSE)) != 0)
5181			return (EOPNOTSUPP);
5182		/*
5183		 * Unprivileged processes are not permitted to unset system
5184		 * flags, or modify flags if any system flags are set.
5185		 * Privileged non-jail processes may not modify system flags
5186		 * if securelevel > 0 and any existing system flags are set.
5187		 * Privileged jail processes behave like privileged non-jail
5188		 * processes if the security.jail.chflags_allowed sysctl is
5189		 * is non-zero; otherwise, they behave like unprivileged
5190		 * processes.
5191		 */
5192		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5193		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5194			if (zflags &
5195			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5196				error = securelevel_gt(cred, 0);
5197				if (error != 0)
5198					return (error);
5199			}
5200		} else {
5201			/*
5202			 * Callers may only modify the file flags on objects they
5203			 * have VADMIN rights for.
5204			 */
5205			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5206				return (error);
5207			if (zflags &
5208			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5209				return (EPERM);
5210			}
5211			if (fflags &
5212			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5213				return (EPERM);
5214			}
5215		}
5216
5217#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5218	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5219	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5220		XVA_SET_REQ(&xvap, (xflag));				\
5221		(xfield) = ((fflags & (fflag)) != 0);			\
5222	}								\
5223} while (0)
5224		/* Convert chflags into ZFS-type flags. */
5225		/* XXX: what about SF_SETTABLE?. */
5226		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5227		    xvap.xva_xoptattrs.xoa_immutable);
5228		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5229		    xvap.xva_xoptattrs.xoa_appendonly);
5230		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5231		    xvap.xva_xoptattrs.xoa_nounlink);
5232		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5233		    xvap.xva_xoptattrs.xoa_archive);
5234		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5235		    xvap.xva_xoptattrs.xoa_nodump);
5236		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5237		    xvap.xva_xoptattrs.xoa_readonly);
5238		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5239		    xvap.xva_xoptattrs.xoa_system);
5240		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5241		    xvap.xva_xoptattrs.xoa_hidden);
5242		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5243		    xvap.xva_xoptattrs.xoa_hidden);
5244		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5245		    xvap.xva_xoptattrs.xoa_offline);
5246		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5247		    xvap.xva_xoptattrs.xoa_sparse);
5248#undef	FLAG_CHANGE
5249	}
5250	if (vap->va_birthtime.tv_sec != VNOVAL) {
5251		xvap.xva_vattr.va_mask |= AT_XVATTR;
5252		XVA_SET_REQ(&xvap, XAT_CREATETIME);
5253	}
5254	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5255}
5256
5257static int
5258zfs_freebsd_rename(ap)
5259	struct vop_rename_args  /* {
5260		struct vnode *a_fdvp;
5261		struct vnode *a_fvp;
5262		struct componentname *a_fcnp;
5263		struct vnode *a_tdvp;
5264		struct vnode *a_tvp;
5265		struct componentname *a_tcnp;
5266	} */ *ap;
5267{
5268	vnode_t *fdvp = ap->a_fdvp;
5269	vnode_t *fvp = ap->a_fvp;
5270	vnode_t *tdvp = ap->a_tdvp;
5271	vnode_t *tvp = ap->a_tvp;
5272	int error;
5273
5274	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5275	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5276
5277	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5278	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5279
5280	vrele(fdvp);
5281	vrele(fvp);
5282	vrele(tdvp);
5283	if (tvp != NULL)
5284		vrele(tvp);
5285
5286	return (error);
5287}
5288
5289static int
5290zfs_freebsd_symlink(ap)
5291	struct vop_symlink_args /* {
5292		struct vnode *a_dvp;
5293		struct vnode **a_vpp;
5294		struct componentname *a_cnp;
5295		struct vattr *a_vap;
5296		char *a_target;
5297	} */ *ap;
5298{
5299	struct componentname *cnp = ap->a_cnp;
5300	vattr_t *vap = ap->a_vap;
5301
5302	ASSERT(cnp->cn_flags & SAVENAME);
5303
5304	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5305	vattr_init_mask(vap);
5306
5307	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5308	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5309}
5310
5311static int
5312zfs_freebsd_readlink(ap)
5313	struct vop_readlink_args /* {
5314		struct vnode *a_vp;
5315		struct uio *a_uio;
5316		struct ucred *a_cred;
5317	} */ *ap;
5318{
5319
5320	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5321}
5322
5323static int
5324zfs_freebsd_link(ap)
5325	struct vop_link_args /* {
5326		struct vnode *a_tdvp;
5327		struct vnode *a_vp;
5328		struct componentname *a_cnp;
5329	} */ *ap;
5330{
5331	struct componentname *cnp = ap->a_cnp;
5332	vnode_t *vp = ap->a_vp;
5333	vnode_t *tdvp = ap->a_tdvp;
5334
5335	if (tdvp->v_mount != vp->v_mount)
5336		return (EXDEV);
5337
5338	ASSERT(cnp->cn_flags & SAVENAME);
5339
5340	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5341}
5342
5343static int
5344zfs_freebsd_inactive(ap)
5345	struct vop_inactive_args /* {
5346		struct vnode *a_vp;
5347		struct thread *a_td;
5348	} */ *ap;
5349{
5350	vnode_t *vp = ap->a_vp;
5351
5352	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5353	return (0);
5354}
5355
/*
 * VOP_RECLAIM: tear down the VM object backing the vnode and release
 * the znode, coordinating with a possible concurrent forced unmount.
 */
static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp != NULL);

	/* Destroy the vm object and flush associated pages. */
	vnode_destroy_vobject(vp);

	/*
	 * z_teardown_inactive_lock protects from a race with
	 * zfs_znode_dmu_fini in zfsvfs_teardown during
	 * force unmount.
	 */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	/* A NULL SA handle means the znode was already torn down. */
	if (zp->z_sa_hdl == NULL)
		zfs_znode_free(zp);
	else
		zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);

	/* Detach the znode from the vnode before the vnode is reused. */
	vp->v_data = NULL;
	return (0);
}
5387
5388static int
5389zfs_freebsd_fid(ap)
5390	struct vop_fid_args /* {
5391		struct vnode *a_vp;
5392		struct fid *a_fid;
5393	} */ *ap;
5394{
5395
5396	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5397}
5398
5399static int
5400zfs_freebsd_pathconf(ap)
5401	struct vop_pathconf_args /* {
5402		struct vnode *a_vp;
5403		int a_name;
5404		register_t *a_retval;
5405	} */ *ap;
5406{
5407	ulong_t val;
5408	int error;
5409
5410	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5411	if (error == 0) {
5412		*ap->a_retval = val;
5413		return (error);
5414	}
5415	if (error != EOPNOTSUPP)
5416		return (error);
5417
5418	switch (ap->a_name) {
5419	case _PC_NAME_MAX:
5420		*ap->a_retval = NAME_MAX;
5421		return (0);
5422	case _PC_PIPE_BUF:
5423		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
5424			*ap->a_retval = PIPE_BUF;
5425			return (0);
5426		}
5427		return (EINVAL);
5428	default:
5429		return (vop_stdpathconf(ap));
5430	}
5431}
5432
5433/*
5434 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5435 * extended attribute name:
5436 *
5437 *	NAMESPACE	PREFIX
5438 *	system		freebsd:system:
5439 *	user		(none, can be used to access ZFS fsattr(5) attributes
5440 *			created on Solaris)
5441 */
5442static int
5443zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5444    size_t size)
5445{
5446	const char *namespace, *prefix, *suffix;
5447
5448	/* We don't allow '/' character in attribute name. */
5449	if (strchr(name, '/') != NULL)
5450		return (EINVAL);
5451	/* We don't allow attribute names that start with "freebsd:" string. */
5452	if (strncmp(name, "freebsd:", 8) == 0)
5453		return (EINVAL);
5454
5455	bzero(attrname, size);
5456
5457	switch (attrnamespace) {
5458	case EXTATTR_NAMESPACE_USER:
5459#if 0
5460		prefix = "freebsd:";
5461		namespace = EXTATTR_NAMESPACE_USER_STRING;
5462		suffix = ":";
5463#else
5464		/*
5465		 * This is the default namespace by which we can access all
5466		 * attributes created on Solaris.
5467		 */
5468		prefix = namespace = suffix = "";
5469#endif
5470		break;
5471	case EXTATTR_NAMESPACE_SYSTEM:
5472		prefix = "freebsd:";
5473		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5474		suffix = ":";
5475		break;
5476	case EXTATTR_NAMESPACE_EMPTY:
5477	default:
5478		return (EINVAL);
5479	}
5480	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5481	    name) >= size) {
5482		return (ENAMETOOLONG);
5483	}
5484	return (0);
5485}
5486
5487/*
5488 * Vnode operating to retrieve a named extended attribute.
5489 */
static int
zfs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	/* The caller needs read access to the attribute namespace. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);

	/* Map the (namespace, name) pair to the on-disk attribute name. */
	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	/* Look up the hidden extended-attribute directory of the vnode. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Each attribute is a regular file inside that directory. */
	flags = FREAD;
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
	    xvp, td);
	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		/* Map "no such file" to "no such attribute". */
		if (error == ENOENT)
			error = ENOATTR;
		return (error);
	}

	/* Either report the attribute's size or copy out its contents. */
	if (ap->a_size != NULL) {
		error = VOP_GETATTR(vp, &va, ap->a_cred);
		if (error == 0)
			*ap->a_size = (size_t)va.va_size;
	} else if (ap->a_uio != NULL)
		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);

	VOP_UNLOCK(vp, 0);
	vn_close(vp, flags, ap->a_cred, td);
	ZFS_EXIT(zfsvfs);

	return (error);
}
5557
5558/*
5559 * Vnode operation to remove a named attribute.
5560 */
5561int
5562zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5563/*
5564vop_deleteextattr {
5565	IN struct vnode *a_vp;
5566	IN int a_attrnamespace;
5567	IN const char *a_name;
5568	IN struct ucred *a_cred;
5569	IN struct thread *a_td;
5570};
5571*/
5572{
5573	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5574	struct thread *td = ap->a_td;
5575	struct nameidata nd;
5576	char attrname[255];
5577	struct vattr va;
5578	vnode_t *xvp = NULL, *vp;
5579	int error, flags;
5580
5581	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5582	    ap->a_cred, ap->a_td, VWRITE);
5583	if (error != 0)
5584		return (error);
5585
5586	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5587	    sizeof(attrname));
5588	if (error != 0)
5589		return (error);
5590
5591	ZFS_ENTER(zfsvfs);
5592
5593	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5594	    LOOKUP_XATTR);
5595	if (error != 0) {
5596		ZFS_EXIT(zfsvfs);
5597		return (error);
5598	}
5599
5600	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5601	    UIO_SYSSPACE, attrname, xvp, td);
5602	error = namei(&nd);
5603	vp = nd.ni_vp;
5604	if (error != 0) {
5605		ZFS_EXIT(zfsvfs);
5606		NDFREE(&nd, NDF_ONLY_PNBUF);
5607		if (error == ENOENT)
5608			error = ENOATTR;
5609		return (error);
5610	}
5611
5612	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5613	NDFREE(&nd, NDF_ONLY_PNBUF);
5614
5615	vput(nd.ni_dvp);
5616	if (vp == nd.ni_dvp)
5617		vrele(vp);
5618	else
5619		vput(vp);
5620	ZFS_EXIT(zfsvfs);
5621
5622	return (error);
5623}
5624
5625/*
5626 * Vnode operation to set a named attribute.
5627 */
5628static int
5629zfs_setextattr(struct vop_setextattr_args *ap)
5630/*
5631vop_setextattr {
5632	IN struct vnode *a_vp;
5633	IN int a_attrnamespace;
5634	IN const char *a_name;
5635	INOUT struct uio *a_uio;
5636	IN struct ucred *a_cred;
5637	IN struct thread *a_td;
5638};
5639*/
5640{
5641	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5642	struct thread *td = ap->a_td;
5643	struct nameidata nd;
5644	char attrname[255];
5645	struct vattr va;
5646	vnode_t *xvp = NULL, *vp;
5647	int error, flags;
5648
5649	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5650	    ap->a_cred, ap->a_td, VWRITE);
5651	if (error != 0)
5652		return (error);
5653
5654	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5655	    sizeof(attrname));
5656	if (error != 0)
5657		return (error);
5658
5659	ZFS_ENTER(zfsvfs);
5660
5661	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5662	    LOOKUP_XATTR | CREATE_XATTR_DIR);
5663	if (error != 0) {
5664		ZFS_EXIT(zfsvfs);
5665		return (error);
5666	}
5667
5668	flags = FFLAGS(O_WRONLY | O_CREAT);
5669	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5670	    xvp, td);
5671	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5672	vp = nd.ni_vp;
5673	NDFREE(&nd, NDF_ONLY_PNBUF);
5674	if (error != 0) {
5675		ZFS_EXIT(zfsvfs);
5676		return (error);
5677	}
5678
5679	VATTR_NULL(&va);
5680	va.va_size = 0;
5681	error = VOP_SETATTR(vp, &va, ap->a_cred);
5682	if (error == 0)
5683		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5684
5685	VOP_UNLOCK(vp, 0);
5686	vn_close(vp, flags, ap->a_cred, td);
5687	ZFS_EXIT(zfsvfs);
5688
5689	return (error);
5690}
5691
5692/*
5693 * Vnode operation to retrieve extended attributes on a vnode.
5694 */
5695static int
5696zfs_listextattr(struct vop_listextattr_args *ap)
5697/*
5698vop_listextattr {
5699	IN struct vnode *a_vp;
5700	IN int a_attrnamespace;
5701	INOUT struct uio *a_uio;
5702	OUT size_t *a_size;
5703	IN struct ucred *a_cred;
5704	IN struct thread *a_td;
5705};
5706*/
5707{
5708	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5709	struct thread *td = ap->a_td;
5710	struct nameidata nd;
5711	char attrprefix[16];
5712	u_char dirbuf[sizeof(struct dirent)];
5713	struct dirent *dp;
5714	struct iovec aiov;
5715	struct uio auio, *uio = ap->a_uio;
5716	size_t *sizep = ap->a_size;
5717	size_t plen;
5718	vnode_t *xvp = NULL, *vp;
5719	int done, error, eof, pos;
5720
5721	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5722	    ap->a_cred, ap->a_td, VREAD);
5723	if (error != 0)
5724		return (error);
5725
5726	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5727	    sizeof(attrprefix));
5728	if (error != 0)
5729		return (error);
5730	plen = strlen(attrprefix);
5731
5732	ZFS_ENTER(zfsvfs);
5733
5734	if (sizep != NULL)
5735		*sizep = 0;
5736
5737	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5738	    LOOKUP_XATTR);
5739	if (error != 0) {
5740		ZFS_EXIT(zfsvfs);
5741		/*
5742		 * ENOATTR means that the EA directory does not yet exist,
5743		 * i.e. there are no extended attributes there.
5744		 */
5745		if (error == ENOATTR)
5746			error = 0;
5747		return (error);
5748	}
5749
5750	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5751	    UIO_SYSSPACE, ".", xvp, td);
5752	error = namei(&nd);
5753	vp = nd.ni_vp;
5754	NDFREE(&nd, NDF_ONLY_PNBUF);
5755	if (error != 0) {
5756		ZFS_EXIT(zfsvfs);
5757		return (error);
5758	}
5759
5760	auio.uio_iov = &aiov;
5761	auio.uio_iovcnt = 1;
5762	auio.uio_segflg = UIO_SYSSPACE;
5763	auio.uio_td = td;
5764	auio.uio_rw = UIO_READ;
5765	auio.uio_offset = 0;
5766
5767	do {
5768		u_char nlen;
5769
5770		aiov.iov_base = (void *)dirbuf;
5771		aiov.iov_len = sizeof(dirbuf);
5772		auio.uio_resid = sizeof(dirbuf);
5773		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5774		done = sizeof(dirbuf) - auio.uio_resid;
5775		if (error != 0)
5776			break;
5777		for (pos = 0; pos < done;) {
5778			dp = (struct dirent *)(dirbuf + pos);
5779			pos += dp->d_reclen;
5780			/*
5781			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5782			 * is what we get when attribute was created on Solaris.
5783			 */
5784			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5785				continue;
5786			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5787				continue;
5788			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5789				continue;
5790			nlen = dp->d_namlen - plen;
5791			if (sizep != NULL)
5792				*sizep += 1 + nlen;
5793			else if (uio != NULL) {
5794				/*
5795				 * Format of extattr name entry is one byte for
5796				 * length and the rest for name.
5797				 */
5798				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5799				if (error == 0) {
5800					error = uiomove(dp->d_name + plen, nlen,
5801					    uio->uio_rw, uio);
5802				}
5803				if (error != 0)
5804					break;
5805			}
5806		}
5807	} while (!eof && error == 0);
5808
5809	vput(vp);
5810	ZFS_EXIT(zfsvfs);
5811
5812	return (error);
5813}
5814
5815int
5816zfs_freebsd_getacl(ap)
5817	struct vop_getacl_args /* {
5818		struct vnode *vp;
5819		acl_type_t type;
5820		struct acl *aclp;
5821		struct ucred *cred;
5822		struct thread *td;
5823	} */ *ap;
5824{
5825	int		error;
5826	vsecattr_t      vsecattr;
5827
5828	if (ap->a_type != ACL_TYPE_NFS4)
5829		return (EINVAL);
5830
5831	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5832	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5833		return (error);
5834
5835	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5836	if (vsecattr.vsa_aclentp != NULL)
5837		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5838
5839	return (error);
5840}
5841
5842int
5843zfs_freebsd_setacl(ap)
5844	struct vop_setacl_args /* {
5845		struct vnode *vp;
5846		acl_type_t type;
5847		struct acl *aclp;
5848		struct ucred *cred;
5849		struct thread *td;
5850	} */ *ap;
5851{
5852	int		error;
5853	vsecattr_t      vsecattr;
5854	int		aclbsize;	/* size of acl list in bytes */
5855	aclent_t	*aaclp;
5856
5857	if (ap->a_type != ACL_TYPE_NFS4)
5858		return (EINVAL);
5859
5860	if (ap->a_aclp == NULL)
5861		return (EINVAL);
5862
5863	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5864		return (EINVAL);
5865
5866	/*
5867	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5868	 * splitting every entry into two and appending "canonical six"
5869	 * entries at the end.  Don't allow for setting an ACL that would
5870	 * cause chmod(2) to run out of ACL entries.
5871	 */
5872	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5873		return (ENOSPC);
5874
5875	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5876	if (error != 0)
5877		return (error);
5878
5879	vsecattr.vsa_mask = VSA_ACE;
5880	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5881	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5882	aaclp = vsecattr.vsa_aclentp;
5883	vsecattr.vsa_aclentsz = aclbsize;
5884
5885	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5886	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5887	kmem_free(aaclp, aclbsize);
5888
5889	return (error);
5890}
5891
5892int
5893zfs_freebsd_aclcheck(ap)
5894	struct vop_aclcheck_args /* {
5895		struct vnode *vp;
5896		acl_type_t type;
5897		struct acl *aclp;
5898		struct ucred *cred;
5899		struct thread *td;
5900	} */ *ap;
5901{
5902
5903	return (EOPNOTSUPP);
5904}
5905
5906static int
5907zfs_vptocnp(struct vop_vptocnp_args *ap)
5908{
5909	vnode_t *covered_vp;
5910	vnode_t *vp = ap->a_vp;;
5911	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5912	znode_t *zp = VTOZ(vp);
5913	int ltype;
5914	int error;
5915
5916	ZFS_ENTER(zfsvfs);
5917	ZFS_VERIFY_ZP(zp);
5918
5919	/*
5920	 * If we are a snapshot mounted under .zfs, run the operation
5921	 * on the covered vnode.
5922	 */
5923	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5924		char name[MAXNAMLEN + 1];
5925		znode_t *dzp;
5926		size_t len;
5927
5928		error = zfs_znode_parent_and_name(zp, &dzp, name);
5929		if (error == 0) {
5930			len = strlen(name);
5931			if (*ap->a_buflen < len)
5932				error = SET_ERROR(ENOMEM);
5933		}
5934		if (error == 0) {
5935			*ap->a_buflen -= len;
5936			bcopy(name, ap->a_buf + *ap->a_buflen, len);
5937			*ap->a_vpp = ZTOV(dzp);
5938		}
5939		ZFS_EXIT(zfsvfs);
5940		return (error);
5941	}
5942	ZFS_EXIT(zfsvfs);
5943
5944	covered_vp = vp->v_mount->mnt_vnodecovered;
5945	vhold(covered_vp);
5946	ltype = VOP_ISLOCKED(vp);
5947	VOP_UNLOCK(vp, 0);
5948	error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
5949	if (error == 0) {
5950		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5951		    ap->a_buf, ap->a_buflen);
5952		vput(covered_vp);
5953	}
5954	vn_lock(vp, ltype | LK_RETRY);
5955	if ((vp->v_iflag & VI_DOOMED) != 0)
5956		error = SET_ERROR(ENOENT);
5957	return (error);
5958}
5959
5960#ifdef DIAGNOSTIC
5961static int
5962zfs_lock(ap)
5963	struct vop_lock1_args /* {
5964		struct vnode *a_vp;
5965		int a_flags;
5966		char *file;
5967		int line;
5968	} */ *ap;
5969{
5970	vnode_t *vp;
5971	znode_t *zp;
5972	int err;
5973
5974	err = vop_stdlock(ap);
5975	if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
5976		vp = ap->a_vp;
5977		zp = vp->v_data;
5978		if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
5979		    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
5980			VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
5981	}
5982	return (err);
5983}
5984#endif
5985
/* Forward declarations of the vnode operation tables defined below. */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
5989
/*
 * Vnode operations for regular ZFS files and directories.  Operations
 * not listed here fall back to default_vnodeops.
 */
struct vop_vector zfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_access =		zfs_freebsd_access,
	/* Name-cache-aware lookup front end; cache misses hit vop_cachedlookup. */
	.vop_lookup =		zfs_cache_lookup,
	.vop_cachedlookup =	zfs_freebsd_lookup,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		zfs_freebsd_bmap,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
	.vop_getpages =		zfs_freebsd_getpages,
	.vop_putpages =		zfs_freebsd_putpages,
	.vop_vptocnp =		zfs_vptocnp,
#ifdef DIAGNOSTIC
	/* Debug builds assert lock-ordering invariants on every vnode lock. */
	.vop_lock1 =		zfs_lock,
#endif
};
6032
/*
 * Vnode operations for FIFOs (named pipes) stored on ZFS.  Data I/O is
 * handled by fifo_specops; read/write must never reach ZFS directly,
 * hence VOP_PANIC.  Metadata operations go to the ZFS handlers.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_pathconf = 	zfs_freebsd_pathconf,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};
6049
6050/*
6051 * special share hidden files vnode operations template
6052 */
6053struct vop_vector zfs_shareops = {
6054	.vop_default =		&default_vnodeops,
6055	.vop_access =		zfs_freebsd_access,
6056	.vop_inactive =		zfs_freebsd_inactive,
6057	.vop_reclaim =		zfs_freebsd_reclaim,
6058	.vop_fid =		zfs_freebsd_fid,
6059	.vop_pathconf =		zfs_freebsd_pathconf,
6060};
6061