zfs_vnops.c revision 306819
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28/* Portions Copyright 2007 Jeremy Teo */
29/* Portions Copyright 2010 Robert Milkowski */
30
31#include <sys/types.h>
32#include <sys/param.h>
33#include <sys/time.h>
34#include <sys/systm.h>
35#include <sys/sysmacros.h>
36#include <sys/resource.h>
37#include <sys/vfs.h>
38#include <sys/vm.h>
39#include <sys/vnode.h>
40#include <sys/file.h>
41#include <sys/stat.h>
42#include <sys/kmem.h>
43#include <sys/taskq.h>
44#include <sys/uio.h>
45#include <sys/atomic.h>
46#include <sys/namei.h>
47#include <sys/mman.h>
48#include <sys/cmn_err.h>
49#include <sys/errno.h>
50#include <sys/unistd.h>
51#include <sys/zfs_dir.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/fs/zfs.h>
54#include <sys/dmu.h>
55#include <sys/dmu_objset.h>
56#include <sys/spa.h>
57#include <sys/txg.h>
58#include <sys/dbuf.h>
59#include <sys/zap.h>
60#include <sys/sa.h>
61#include <sys/dirent.h>
62#include <sys/policy.h>
63#include <sys/sunddi.h>
64#include <sys/filio.h>
65#include <sys/sid.h>
66#include <sys/zfs_ctldir.h>
67#include <sys/zfs_fuid.h>
68#include <sys/zfs_sa.h>
69#include <sys/zfs_rlock.h>
70#include <sys/extdirent.h>
71#include <sys/kidmap.h>
72#include <sys/bio.h>
73#include <sys/buf.h>
74#include <sys/sched.h>
75#include <sys/acl.h>
76#include <vm/vm_param.h>
77
78/*
79 * Programming rules.
80 *
81 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
82 * properly lock its in-core state, create a DMU transaction, do the work,
83 * record this work in the intent log (ZIL), commit the DMU transaction,
84 * and wait for the intent log to commit if it is a synchronous operation.
85 * Moreover, the vnode ops must work in both normal and log replay context.
86 * The ordering of events is important to avoid deadlocks and references
87 * to freed memory.  The example below illustrates the following Big Rules:
88 *
89 *  (1)	A check must be made in each zfs thread for a mounted file system.
90 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
91 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
92 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
93 *	can return EIO from the calling function.
94 *
95 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
96 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
97 *	First, if it's the last reference, the vnode/znode
98 *	can be freed, so the zp may point to freed memory.  Second, the last
99 *	reference will call zfs_zinactive(), which may induce a lot of work --
100 *	pushing cached pages (which acquires range locks) and syncing out
101 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
102 *	which could deadlock the system if you were already holding one.
103 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
104 *
105 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
106 *	as they can span dmu_tx_assign() calls.
107 *
108 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
109 *      dmu_tx_assign().  This is critical because we don't want to block
110 *      while holding locks.
111 *
112 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
113 *	reduces lock contention and CPU usage when we must wait (note that if
114 *	throughput is constrained by the storage, nearly every transaction
115 *	must wait).
116 *
117 *      Note, in particular, that if a lock is sometimes acquired before
118 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
119 *      to use a non-blocking assign can deadlock the system.  The scenario:
120 *
121 *	Thread A has grabbed a lock before calling dmu_tx_assign().
122 *	Thread B is in an already-assigned tx, and blocks for this lock.
123 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
124 *	forever, because the previous txg can't quiesce until B's tx commits.
125 *
126 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
127 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
128 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
129 *	to indicate that this operation has already called dmu_tx_wait().
130 *	This will ensure that we don't retry forever, waiting a short bit
131 *	each time.
132 *
133 *  (5)	If the operation succeeded, generate the intent log entry for it
134 *	before dropping locks.  This ensures that the ordering of events
135 *	in the intent log matches the order in which they actually occurred.
136 *	During ZIL replay the zfs_log_* functions will update the sequence
137 *	number to indicate the zil transaction has replayed.
138 *
139 *  (6)	At the end of each vnode op, the DMU tx must always commit,
140 *	regardless of whether there were any errors.
141 *
142 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
143 *	to ensure that synchronous semantics are provided when necessary.
144 *
145 * In general, this is how things should be ordered in each vnode op:
146 *
147 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
148 * top:
149 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
150 *	rw_enter(...);			// grab any other locks you need
151 *	tx = dmu_tx_create(...);	// get DMU tx
152 *	dmu_tx_hold_*();		// hold each object you might modify
153 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
154 *	if (error) {
155 *		rw_exit(...);		// drop locks
156 *		zfs_dirent_unlock(dl);	// unlock directory entry
157 *		VN_RELE(...);		// release held vnodes
158 *		if (error == ERESTART) {
159 *			waited = B_TRUE;
160 *			dmu_tx_wait(tx);
161 *			dmu_tx_abort(tx);
162 *			goto top;
163 *		}
164 *		dmu_tx_abort(tx);	// abort DMU tx
165 *		ZFS_EXIT(zfsvfs);	// finished in zfs
166 *		return (error);		// really out of space
167 *	}
168 *	error = do_real_work();		// do whatever this VOP does
169 *	if (error == 0)
170 *		zfs_log_*(...);		// on success, make ZIL entry
171 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
172 *	rw_exit(...);			// drop locks
173 *	zfs_dirent_unlock(dl);		// unlock directory entry
174 *	VN_RELE(...);			// release held vnodes
175 *	zil_commit(zilog, foid);	// synchronous when necessary
176 *	ZFS_EXIT(zfsvfs);		// finished in zfs
177 *	return (error);			// done, report error
178 */
179
180/* ARGSUSED */
181static int
182zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
183{
184	znode_t	*zp = VTOZ(*vpp);
185	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
186
187	ZFS_ENTER(zfsvfs);
188	ZFS_VERIFY_ZP(zp);
189
190	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
191	    ((flag & FAPPEND) == 0)) {
192		ZFS_EXIT(zfsvfs);
193		return (SET_ERROR(EPERM));
194	}
195
196	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
197	    ZTOV(zp)->v_type == VREG &&
198	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
199		if (fs_vscan(*vpp, cr, 0) != 0) {
200			ZFS_EXIT(zfsvfs);
201			return (SET_ERROR(EACCES));
202		}
203	}
204
205	/* Keep a count of the synchronous opens in the znode */
206	if (flag & (FSYNC | FDSYNC))
207		atomic_inc_32(&zp->z_sync_cnt);
208
209	ZFS_EXIT(zfsvfs);
210	return (0);
211}
212
213/* ARGSUSED */
214static int
215zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
216    caller_context_t *ct)
217{
218	znode_t	*zp = VTOZ(vp);
219	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
220
221	/*
222	 * Clean up any locks held by this process on the vp.
223	 */
224	cleanlocks(vp, ddi_get_pid(), 0);
225	cleanshares(vp, ddi_get_pid());
226
227	ZFS_ENTER(zfsvfs);
228	ZFS_VERIFY_ZP(zp);
229
230	/* Decrement the synchronous opens in the znode */
231	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
232		atomic_dec_32(&zp->z_sync_cnt);
233
234	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
235	    ZTOV(zp)->v_type == VREG &&
236	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
237		VERIFY(fs_vscan(vp, cr, 1) == 0);
238
239	ZFS_EXIT(zfsvfs);
240	return (0);
241}
242
243/*
244 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
245 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
246 */
247static int
248zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
249{
250	znode_t	*zp = VTOZ(vp);
251	uint64_t noff = (uint64_t)*off; /* new offset */
252	uint64_t file_sz;
253	int error;
254	boolean_t hole;
255
256	file_sz = zp->z_size;
257	if (noff >= file_sz)  {
258		return (SET_ERROR(ENXIO));
259	}
260
261	if (cmd == _FIO_SEEK_HOLE)
262		hole = B_TRUE;
263	else
264		hole = B_FALSE;
265
266	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
267
268	if (error == ESRCH)
269		return (SET_ERROR(ENXIO));
270
271	/*
272	 * We could find a hole that begins after the logical end-of-file,
273	 * because dmu_offset_next() only works on whole blocks.  If the
274	 * EOF falls mid-block, then indicate that the "virtual hole"
275	 * at the end of the file begins at the logical EOF, rather than
276	 * at the end of the last block.
277	 */
278	if (noff > file_sz) {
279		ASSERT(hole);
280		noff = file_sz;
281	}
282
283	if (noff < *off)
284		return (error);
285	*off = noff;
286	return (error);
287}
288
289/* ARGSUSED */
290static int
291zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
292    int *rvalp, caller_context_t *ct)
293{
294	offset_t off;
295	offset_t ndata;
296	dmu_object_info_t doi;
297	int error;
298	zfsvfs_t *zfsvfs;
299	znode_t *zp;
300
301	switch (com) {
302	case _FIOFFS:
303	{
304		return (0);
305
306		/*
307		 * The following two ioctls are used by bfu.  Faking out,
308		 * necessary to avoid bfu errors.
309		 */
310	}
311	case _FIOGDIO:
312	case _FIOSDIO:
313	{
314		return (0);
315	}
316
317	case _FIO_SEEK_DATA:
318	case _FIO_SEEK_HOLE:
319	{
320#ifdef illumos
321		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
322			return (SET_ERROR(EFAULT));
323#else
324		off = *(offset_t *)data;
325#endif
326		zp = VTOZ(vp);
327		zfsvfs = zp->z_zfsvfs;
328		ZFS_ENTER(zfsvfs);
329		ZFS_VERIFY_ZP(zp);
330
331		/* offset parameter is in/out */
332		error = zfs_holey(vp, com, &off);
333		ZFS_EXIT(zfsvfs);
334		if (error)
335			return (error);
336#ifdef illumos
337		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
338			return (SET_ERROR(EFAULT));
339#else
340		*(offset_t *)data = off;
341#endif
342		return (0);
343	}
344#ifdef illumos
345	case _FIO_COUNT_FILLED:
346	{
347		/*
348		 * _FIO_COUNT_FILLED adds a new ioctl command which
349		 * exposes the number of filled blocks in a
350		 * ZFS object.
351		 */
352		zp = VTOZ(vp);
353		zfsvfs = zp->z_zfsvfs;
354		ZFS_ENTER(zfsvfs);
355		ZFS_VERIFY_ZP(zp);
356
357		/*
358		 * Wait for all dirty blocks for this object
359		 * to get synced out to disk, and the DMU info
360		 * updated.
361		 */
362		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
363		if (error) {
364			ZFS_EXIT(zfsvfs);
365			return (error);
366		}
367
368		/*
369		 * Retrieve fill count from DMU object.
370		 */
371		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
372		if (error) {
373			ZFS_EXIT(zfsvfs);
374			return (error);
375		}
376
377		ndata = doi.doi_fill_count;
378
379		ZFS_EXIT(zfsvfs);
380		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
381			return (SET_ERROR(EFAULT));
382		return (0);
383	}
384#endif
385	}
386	return (SET_ERROR(ENOTTY));
387}
388
389static vm_page_t
390page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
391{
392	vm_object_t obj;
393	vm_page_t pp;
394	int64_t end;
395
396	/*
397	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
398	 * aligned boundaries, if the range is not aligned.  As a result a
399	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
400	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
401	 * the whole page would be considred clean despite have some dirty data.
402	 * For this reason we should shrink the range to DEV_BSIZE aligned
403	 * boundaries before calling vm_page_clear_dirty.
404	 */
405	end = rounddown2(off + nbytes, DEV_BSIZE);
406	off = roundup2(off, DEV_BSIZE);
407	nbytes = end - off;
408
409	obj = vp->v_object;
410	zfs_vmobject_assert_wlocked(obj);
411
412	for (;;) {
413		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
414		    pp->valid) {
415			if (vm_page_xbusied(pp)) {
416				/*
417				 * Reference the page before unlocking and
418				 * sleeping so that the page daemon is less
419				 * likely to reclaim it.
420				 */
421				vm_page_reference(pp);
422				vm_page_lock(pp);
423				zfs_vmobject_wunlock(obj);
424				vm_page_busy_sleep(pp, "zfsmwb");
425				zfs_vmobject_wlock(obj);
426				continue;
427			}
428			vm_page_sbusy(pp);
429		} else if (pp == NULL) {
430			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
431			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
432			    VM_ALLOC_SBUSY);
433		} else {
434			ASSERT(pp != NULL && !pp->valid);
435			pp = NULL;
436		}
437
438		if (pp != NULL) {
439			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
440			vm_object_pip_add(obj, 1);
441			pmap_remove_write(pp);
442			if (nbytes != 0)
443				vm_page_clear_dirty(pp, off, nbytes);
444		}
445		break;
446	}
447	return (pp);
448}
449
450static void
451page_unbusy(vm_page_t pp)
452{
453
454	vm_page_sunbusy(pp);
455	vm_object_pip_subtract(pp->object, 1);
456}
457
458static vm_page_t
459page_hold(vnode_t *vp, int64_t start)
460{
461	vm_object_t obj;
462	vm_page_t pp;
463
464	obj = vp->v_object;
465	zfs_vmobject_assert_wlocked(obj);
466
467	for (;;) {
468		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
469		    pp->valid) {
470			if (vm_page_xbusied(pp)) {
471				/*
472				 * Reference the page before unlocking and
473				 * sleeping so that the page daemon is less
474				 * likely to reclaim it.
475				 */
476				vm_page_reference(pp);
477				vm_page_lock(pp);
478				zfs_vmobject_wunlock(obj);
479				vm_page_busy_sleep(pp, "zfsmwb");
480				zfs_vmobject_wlock(obj);
481				continue;
482			}
483
484			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
485			vm_page_lock(pp);
486			vm_page_hold(pp);
487			vm_page_unlock(pp);
488
489		} else
490			pp = NULL;
491		break;
492	}
493	return (pp);
494}
495
496static void
497page_unhold(vm_page_t pp)
498{
499
500	vm_page_lock(pp);
501	vm_page_unhold(pp);
502	vm_page_unlock(pp);
503}
504
505/*
506 * When a file is memory mapped, we must keep the IO data synchronized
507 * between the DMU cache and the memory mapped pages.  What this means:
508 *
509 * On Write:	If we find a memory mapped page, we write to *both*
510 *		the page and the dmu buffer.
511 */
512static void
513update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
514    int segflg, dmu_tx_t *tx)
515{
516	vm_object_t obj;
517	struct sf_buf *sf;
518	caddr_t va;
519	int off;
520
521	ASSERT(segflg != UIO_NOCOPY);
522	ASSERT(vp->v_mount != NULL);
523	obj = vp->v_object;
524	ASSERT(obj != NULL);
525
526	off = start & PAGEOFFSET;
527	zfs_vmobject_wlock(obj);
528	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
529		vm_page_t pp;
530		int nbytes = imin(PAGESIZE - off, len);
531
532		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
533			zfs_vmobject_wunlock(obj);
534
535			va = zfs_map_page(pp, &sf);
536			(void) dmu_read(os, oid, start+off, nbytes,
537			    va+off, DMU_READ_PREFETCH);;
538			zfs_unmap_page(sf);
539
540			zfs_vmobject_wlock(obj);
541			page_unbusy(pp);
542		}
543		len -= nbytes;
544		off = 0;
545	}
546	vm_object_pip_wakeupn(obj, 0);
547	zfs_vmobject_wunlock(obj);
548}
549
550/*
551 * Read with UIO_NOCOPY flag means that sendfile(2) requests
552 * ZFS to populate a range of page cache pages with data.
553 *
554 * NOTE: this function could be optimized to pre-allocate
555 * all pages in advance, drain exclusive busy on all of them,
556 * map them into contiguous KVA region and populate them
557 * in one single dmu_read() call.
558 */
559static int
560mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
561{
562	znode_t *zp = VTOZ(vp);
563	objset_t *os = zp->z_zfsvfs->z_os;
564	struct sf_buf *sf;
565	vm_object_t obj;
566	vm_page_t pp;
567	int64_t start;
568	caddr_t va;
569	int len = nbytes;
570	int off;
571	int error = 0;
572
573	ASSERT(uio->uio_segflg == UIO_NOCOPY);
574	ASSERT(vp->v_mount != NULL);
575	obj = vp->v_object;
576	ASSERT(obj != NULL);
577	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
578
579	zfs_vmobject_wlock(obj);
580	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
581		int bytes = MIN(PAGESIZE, len);
582
583		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
584		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
585		if (pp->valid == 0) {
586			zfs_vmobject_wunlock(obj);
587			va = zfs_map_page(pp, &sf);
588			error = dmu_read(os, zp->z_id, start, bytes, va,
589			    DMU_READ_PREFETCH);
590			if (bytes != PAGESIZE && error == 0)
591				bzero(va + bytes, PAGESIZE - bytes);
592			zfs_unmap_page(sf);
593			zfs_vmobject_wlock(obj);
594			vm_page_sunbusy(pp);
595			vm_page_lock(pp);
596			if (error) {
597				if (pp->wire_count == 0 && pp->valid == 0 &&
598				    !vm_page_busied(pp))
599					vm_page_free(pp);
600			} else {
601				pp->valid = VM_PAGE_BITS_ALL;
602				vm_page_activate(pp);
603			}
604			vm_page_unlock(pp);
605		} else {
606			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
607			vm_page_sunbusy(pp);
608		}
609		if (error)
610			break;
611		uio->uio_resid -= bytes;
612		uio->uio_offset += bytes;
613		len -= bytes;
614	}
615	zfs_vmobject_wunlock(obj);
616	return (error);
617}
618
619/*
620 * When a file is memory mapped, we must keep the IO data synchronized
621 * between the DMU cache and the memory mapped pages.  What this means:
622 *
623 * On Read:	We "read" preferentially from memory mapped pages,
624 *		else we default from the dmu buffer.
625 *
626 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
627 *	 the file is memory mapped.
628 */
629static int
630mappedread(vnode_t *vp, int nbytes, uio_t *uio)
631{
632	znode_t *zp = VTOZ(vp);
633	vm_object_t obj;
634	int64_t start;
635	caddr_t va;
636	int len = nbytes;
637	int off;
638	int error = 0;
639
640	ASSERT(vp->v_mount != NULL);
641	obj = vp->v_object;
642	ASSERT(obj != NULL);
643
644	start = uio->uio_loffset;
645	off = start & PAGEOFFSET;
646	zfs_vmobject_wlock(obj);
647	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
648		vm_page_t pp;
649		uint64_t bytes = MIN(PAGESIZE - off, len);
650
651		if (pp = page_hold(vp, start)) {
652			struct sf_buf *sf;
653			caddr_t va;
654
655			zfs_vmobject_wunlock(obj);
656			va = zfs_map_page(pp, &sf);
657#ifdef illumos
658			error = uiomove(va + off, bytes, UIO_READ, uio);
659#else
660			error = vn_io_fault_uiomove(va + off, bytes, uio);
661#endif
662			zfs_unmap_page(sf);
663			zfs_vmobject_wlock(obj);
664			page_unhold(pp);
665		} else {
666			zfs_vmobject_wunlock(obj);
667			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
668			    uio, bytes);
669			zfs_vmobject_wlock(obj);
670		}
671		len -= bytes;
672		off = 0;
673		if (error)
674			break;
675	}
676	zfs_vmobject_wunlock(obj);
677	return (error);
678}
679
680offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
681
682/*
683 * Read bytes from specified file into supplied buffer.
684 *
685 *	IN:	vp	- vnode of file to be read from.
686 *		uio	- structure supplying read location, range info,
687 *			  and return buffer.
688 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
689 *		cr	- credentials of caller.
690 *		ct	- caller context
691 *
692 *	OUT:	uio	- updated offset and range, buffer filled.
693 *
694 *	RETURN:	0 on success, error code on failure.
695 *
696 * Side Effects:
697 *	vp - atime updated if byte count > 0
698 */
699/* ARGSUSED */
700static int
701zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
702{
703	znode_t		*zp = VTOZ(vp);
704	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
705	ssize_t		n, nbytes;
706	int		error = 0;
707	rl_t		*rl;
708	xuio_t		*xuio = NULL;
709
710	ZFS_ENTER(zfsvfs);
711	ZFS_VERIFY_ZP(zp);
712
713	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
714		ZFS_EXIT(zfsvfs);
715		return (SET_ERROR(EACCES));
716	}
717
718	/*
719	 * Validate file offset
720	 */
721	if (uio->uio_loffset < (offset_t)0) {
722		ZFS_EXIT(zfsvfs);
723		return (SET_ERROR(EINVAL));
724	}
725
726	/*
727	 * Fasttrack empty reads
728	 */
729	if (uio->uio_resid == 0) {
730		ZFS_EXIT(zfsvfs);
731		return (0);
732	}
733
734	/*
735	 * Check for mandatory locks
736	 */
737	if (MANDMODE(zp->z_mode)) {
738		if (error = chklock(vp, FREAD,
739		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
740			ZFS_EXIT(zfsvfs);
741			return (error);
742		}
743	}
744
745	/*
746	 * If we're in FRSYNC mode, sync out this znode before reading it.
747	 */
748	if (zfsvfs->z_log &&
749	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
750		zil_commit(zfsvfs->z_log, zp->z_id);
751
752	/*
753	 * Lock the range against changes.
754	 */
755	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
756
757	/*
758	 * If we are reading past end-of-file we can skip
759	 * to the end; but we might still need to set atime.
760	 */
761	if (uio->uio_loffset >= zp->z_size) {
762		error = 0;
763		goto out;
764	}
765
766	ASSERT(uio->uio_loffset < zp->z_size);
767	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
768
769#ifdef illumos
770	if ((uio->uio_extflg == UIO_XUIO) &&
771	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
772		int nblk;
773		int blksz = zp->z_blksz;
774		uint64_t offset = uio->uio_loffset;
775
776		xuio = (xuio_t *)uio;
777		if ((ISP2(blksz))) {
778			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
779			    blksz)) / blksz;
780		} else {
781			ASSERT(offset + n <= blksz);
782			nblk = 1;
783		}
784		(void) dmu_xuio_init(xuio, nblk);
785
786		if (vn_has_cached_data(vp)) {
787			/*
788			 * For simplicity, we always allocate a full buffer
789			 * even if we only expect to read a portion of a block.
790			 */
791			while (--nblk >= 0) {
792				(void) dmu_xuio_add(xuio,
793				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
794				    blksz), 0, blksz);
795			}
796		}
797	}
798#endif	/* illumos */
799
800	while (n > 0) {
801		nbytes = MIN(n, zfs_read_chunk_size -
802		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
803
804#ifdef __FreeBSD__
805		if (uio->uio_segflg == UIO_NOCOPY)
806			error = mappedread_sf(vp, nbytes, uio);
807		else
808#endif /* __FreeBSD__ */
809		if (vn_has_cached_data(vp)) {
810			error = mappedread(vp, nbytes, uio);
811		} else {
812			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
813			    uio, nbytes);
814		}
815		if (error) {
816			/* convert checksum errors into IO errors */
817			if (error == ECKSUM)
818				error = SET_ERROR(EIO);
819			break;
820		}
821
822		n -= nbytes;
823	}
824out:
825	zfs_range_unlock(rl);
826
827	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
828	ZFS_EXIT(zfsvfs);
829	return (error);
830}
831
832/*
833 * Write the bytes to a file.
834 *
835 *	IN:	vp	- vnode of file to be written to.
836 *		uio	- structure supplying write location, range info,
837 *			  and data buffer.
838 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
839 *			  set if in append mode.
840 *		cr	- credentials of caller.
841 *		ct	- caller context (NFS/CIFS fem monitor only)
842 *
843 *	OUT:	uio	- updated offset and range.
844 *
845 *	RETURN:	0 on success, error code on failure.
846 *
847 * Timestamps:
848 *	vp - ctime|mtime updated if byte count > 0
849 */
850
851/* ARGSUSED */
852static int
853zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
854{
855	znode_t		*zp = VTOZ(vp);
856	rlim64_t	limit = MAXOFFSET_T;
857	ssize_t		start_resid = uio->uio_resid;
858	ssize_t		tx_bytes;
859	uint64_t	end_size;
860	dmu_tx_t	*tx;
861	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
862	zilog_t		*zilog;
863	offset_t	woff;
864	ssize_t		n, nbytes;
865	rl_t		*rl;
866	int		max_blksz = zfsvfs->z_max_blksz;
867	int		error = 0;
868	arc_buf_t	*abuf;
869	iovec_t		*aiov = NULL;
870	xuio_t		*xuio = NULL;
871	int		i_iov = 0;
872	int		iovcnt = uio->uio_iovcnt;
873	iovec_t		*iovp = uio->uio_iov;
874	int		write_eof;
875	int		count = 0;
876	sa_bulk_attr_t	bulk[4];
877	uint64_t	mtime[2], ctime[2];
878
879	/*
880	 * Fasttrack empty write
881	 */
882	n = start_resid;
883	if (n == 0)
884		return (0);
885
886	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
887		limit = MAXOFFSET_T;
888
889	ZFS_ENTER(zfsvfs);
890	ZFS_VERIFY_ZP(zp);
891
892	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
893	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
894	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
895	    &zp->z_size, 8);
896	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
897	    &zp->z_pflags, 8);
898
899	/*
900	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
901	 * callers might not be able to detect properly that we are read-only,
902	 * so check it explicitly here.
903	 */
904	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
905		ZFS_EXIT(zfsvfs);
906		return (SET_ERROR(EROFS));
907	}
908
909	/*
910	 * If immutable or not appending then return EPERM
911	 */
912	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
913	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
914	    (uio->uio_loffset < zp->z_size))) {
915		ZFS_EXIT(zfsvfs);
916		return (SET_ERROR(EPERM));
917	}
918
919	zilog = zfsvfs->z_log;
920
921	/*
922	 * Validate file offset
923	 */
924	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
925	if (woff < 0) {
926		ZFS_EXIT(zfsvfs);
927		return (SET_ERROR(EINVAL));
928	}
929
930	/*
931	 * Check for mandatory locks before calling zfs_range_lock()
932	 * in order to prevent a deadlock with locks set via fcntl().
933	 */
934	if (MANDMODE((mode_t)zp->z_mode) &&
935	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
936		ZFS_EXIT(zfsvfs);
937		return (error);
938	}
939
940#ifdef illumos
941	/*
942	 * Pre-fault the pages to ensure slow (eg NFS) pages
943	 * don't hold up txg.
944	 * Skip this if uio contains loaned arc_buf.
945	 */
946	if ((uio->uio_extflg == UIO_XUIO) &&
947	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
948		xuio = (xuio_t *)uio;
949	else
950		uio_prefaultpages(MIN(n, max_blksz), uio);
951#endif
952
953	/*
954	 * If in append mode, set the io offset pointer to eof.
955	 */
956	if (ioflag & FAPPEND) {
957		/*
958		 * Obtain an appending range lock to guarantee file append
959		 * semantics.  We reset the write offset once we have the lock.
960		 */
961		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
962		woff = rl->r_off;
963		if (rl->r_len == UINT64_MAX) {
964			/*
965			 * We overlocked the file because this write will cause
966			 * the file block size to increase.
967			 * Note that zp_size cannot change with this lock held.
968			 */
969			woff = zp->z_size;
970		}
971		uio->uio_loffset = woff;
972	} else {
973		/*
974		 * Note that if the file block size will change as a result of
975		 * this write, then this range lock will lock the entire file
976		 * so that we can re-write the block safely.
977		 */
978		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
979	}
980
981	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
982		zfs_range_unlock(rl);
983		ZFS_EXIT(zfsvfs);
984		return (EFBIG);
985	}
986
987	if (woff >= limit) {
988		zfs_range_unlock(rl);
989		ZFS_EXIT(zfsvfs);
990		return (SET_ERROR(EFBIG));
991	}
992
993	if ((woff + n) > limit || woff > (limit - n))
994		n = limit - woff;
995
996	/* Will this write extend the file length? */
997	write_eof = (woff + n > zp->z_size);
998
999	end_size = MAX(zp->z_size, woff + n);
1000
1001	/*
1002	 * Write the file in reasonable size chunks.  Each chunk is written
1003	 * in a separate transaction; this keeps the intent log records small
1004	 * and allows us to do more fine-grained space accounting.
1005	 */
1006	while (n > 0) {
1007		abuf = NULL;
1008		woff = uio->uio_loffset;
1009		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1010		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1011			if (abuf != NULL)
1012				dmu_return_arcbuf(abuf);
1013			error = SET_ERROR(EDQUOT);
1014			break;
1015		}
1016
1017		if (xuio && abuf == NULL) {
1018			ASSERT(i_iov < iovcnt);
1019			aiov = &iovp[i_iov];
1020			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1021			dmu_xuio_clear(xuio, i_iov);
1022			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1023			    iovec_t *, aiov, arc_buf_t *, abuf);
1024			ASSERT((aiov->iov_base == abuf->b_data) ||
1025			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1026			    aiov->iov_len == arc_buf_size(abuf)));
1027			i_iov++;
1028		} else if (abuf == NULL && n >= max_blksz &&
1029		    woff >= zp->z_size &&
1030		    P2PHASE(woff, max_blksz) == 0 &&
1031		    zp->z_blksz == max_blksz) {
1032			/*
1033			 * This write covers a full block.  "Borrow" a buffer
1034			 * from the dmu so that we can fill it before we enter
1035			 * a transaction.  This avoids the possibility of
1036			 * holding up the transaction if the data copy hangs
1037			 * up on a pagefault (e.g., from an NFS server mapping).
1038			 */
1039#ifdef illumos
1040			size_t cbytes;
1041#endif
1042
1043			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1044			    max_blksz);
1045			ASSERT(abuf != NULL);
1046			ASSERT(arc_buf_size(abuf) == max_blksz);
1047#ifdef illumos
1048			if (error = uiocopy(abuf->b_data, max_blksz,
1049			    UIO_WRITE, uio, &cbytes)) {
1050				dmu_return_arcbuf(abuf);
1051				break;
1052			}
1053			ASSERT(cbytes == max_blksz);
1054#else
1055			ssize_t resid = uio->uio_resid;
1056			error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio);
1057			if (error != 0) {
1058				uio->uio_offset -= resid - uio->uio_resid;
1059				uio->uio_resid = resid;
1060				dmu_return_arcbuf(abuf);
1061				break;
1062			}
1063#endif
1064		}
1065
1066		/*
1067		 * Start a transaction.
1068		 */
1069		tx = dmu_tx_create(zfsvfs->z_os);
1070		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1071		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1072		zfs_sa_upgrade_txholds(tx, zp);
1073		error = dmu_tx_assign(tx, TXG_WAIT);
1074		if (error) {
1075			dmu_tx_abort(tx);
1076			if (abuf != NULL)
1077				dmu_return_arcbuf(abuf);
1078			break;
1079		}
1080
1081		/*
1082		 * If zfs_range_lock() over-locked we grow the blocksize
1083		 * and then reduce the lock range.  This will only happen
1084		 * on the first iteration since zfs_range_reduce() will
1085		 * shrink down r_len to the appropriate size.
1086		 */
1087		if (rl->r_len == UINT64_MAX) {
1088			uint64_t new_blksz;
1089
1090			if (zp->z_blksz > max_blksz) {
1091				/*
1092				 * File's blocksize is already larger than the
1093				 * "recordsize" property.  Only let it grow to
1094				 * the next power of 2.
1095				 */
1096				ASSERT(!ISP2(zp->z_blksz));
1097				new_blksz = MIN(end_size,
1098				    1 << highbit64(zp->z_blksz));
1099			} else {
1100				new_blksz = MIN(end_size, max_blksz);
1101			}
1102			zfs_grow_blocksize(zp, new_blksz, tx);
1103			zfs_range_reduce(rl, woff, n);
1104		}
1105
1106		/*
1107		 * XXX - should we really limit each write to z_max_blksz?
1108		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1109		 */
1110		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1111
1112		if (woff + nbytes > zp->z_size)
1113			vnode_pager_setsize(vp, woff + nbytes);
1114
1115		if (abuf == NULL) {
1116			tx_bytes = uio->uio_resid;
1117			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1118			    uio, nbytes, tx);
1119			tx_bytes -= uio->uio_resid;
1120		} else {
1121			tx_bytes = nbytes;
1122			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1123			/*
1124			 * If this is not a full block write, but we are
1125			 * extending the file past EOF and this data starts
1126			 * block-aligned, use assign_arcbuf().  Otherwise,
1127			 * write via dmu_write().
1128			 */
1129			if (tx_bytes < max_blksz && (!write_eof ||
1130			    aiov->iov_base != abuf->b_data)) {
1131				ASSERT(xuio);
1132				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1133				    aiov->iov_len, aiov->iov_base, tx);
1134				dmu_return_arcbuf(abuf);
1135				xuio_stat_wbuf_copied();
1136			} else {
1137				ASSERT(xuio || tx_bytes == max_blksz);
1138				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1139				    woff, abuf, tx);
1140			}
1141#ifdef illumos
1142			ASSERT(tx_bytes <= uio->uio_resid);
1143			uioskip(uio, tx_bytes);
1144#endif
1145		}
1146		if (tx_bytes && vn_has_cached_data(vp)) {
1147			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1148			    zp->z_id, uio->uio_segflg, tx);
1149		}
1150
1151		/*
1152		 * If we made no progress, we're done.  If we made even
1153		 * partial progress, update the znode and ZIL accordingly.
1154		 */
1155		if (tx_bytes == 0) {
1156			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1157			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1158			dmu_tx_commit(tx);
1159			ASSERT(error != 0);
1160			break;
1161		}
1162
1163		/*
1164		 * Clear Set-UID/Set-GID bits on successful write if not
1165		 * privileged and at least one of the excute bits is set.
1166		 *
1167		 * It would be nice to to this after all writes have
1168		 * been done, but that would still expose the ISUID/ISGID
1169		 * to another app after the partial write is committed.
1170		 *
1171		 * Note: we don't call zfs_fuid_map_id() here because
1172		 * user 0 is not an ephemeral uid.
1173		 */
1174		mutex_enter(&zp->z_acl_lock);
1175		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1176		    (S_IXUSR >> 6))) != 0 &&
1177		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1178		    secpolicy_vnode_setid_retain(vp, cr,
1179		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1180			uint64_t newmode;
1181			zp->z_mode &= ~(S_ISUID | S_ISGID);
1182			newmode = zp->z_mode;
1183			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1184			    (void *)&newmode, sizeof (uint64_t), tx);
1185		}
1186		mutex_exit(&zp->z_acl_lock);
1187
1188		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1189		    B_TRUE);
1190
1191		/*
1192		 * Update the file size (zp_size) if it has changed;
1193		 * account for possible concurrent updates.
1194		 */
1195		while ((end_size = zp->z_size) < uio->uio_loffset) {
1196			(void) atomic_cas_64(&zp->z_size, end_size,
1197			    uio->uio_loffset);
1198#ifdef illumos
1199			ASSERT(error == 0);
1200#else
1201			ASSERT(error == 0 || error == EFAULT);
1202#endif
1203		}
1204		/*
1205		 * If we are replaying and eof is non zero then force
1206		 * the file size to the specified eof. Note, there's no
1207		 * concurrency during replay.
1208		 */
1209		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1210			zp->z_size = zfsvfs->z_replay_eof;
1211
1212		if (error == 0)
1213			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1214		else
1215			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1216
1217		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1218		dmu_tx_commit(tx);
1219
1220		if (error != 0)
1221			break;
1222		ASSERT(tx_bytes == nbytes);
1223		n -= nbytes;
1224
1225#ifdef illumos
1226		if (!xuio && n > 0)
1227			uio_prefaultpages(MIN(n, max_blksz), uio);
1228#endif
1229	}
1230
1231	zfs_range_unlock(rl);
1232
1233	/*
1234	 * If we're in replay mode, or we made no progress, return error.
1235	 * Otherwise, it's at least a partial write, so it's successful.
1236	 */
1237	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1238		ZFS_EXIT(zfsvfs);
1239		return (error);
1240	}
1241
1242#ifdef __FreeBSD__
1243	/*
1244	 * EFAULT means that at least one page of the source buffer was not
1245	 * available.  VFS will re-try remaining I/O upon this error.
1246	 */
1247	if (error == EFAULT) {
1248		ZFS_EXIT(zfsvfs);
1249		return (error);
1250	}
1251#endif
1252
1253	if (ioflag & (FSYNC | FDSYNC) ||
1254	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1255		zil_commit(zilog, zp->z_id);
1256
1257	ZFS_EXIT(zfsvfs);
1258	return (0);
1259}
1260
1261void
1262zfs_get_done(zgd_t *zgd, int error)
1263{
1264	znode_t *zp = zgd->zgd_private;
1265	objset_t *os = zp->z_zfsvfs->z_os;
1266
1267	if (zgd->zgd_db)
1268		dmu_buf_rele(zgd->zgd_db, zgd);
1269
1270	zfs_range_unlock(zgd->zgd_rl);
1271
1272	/*
1273	 * Release the vnode asynchronously as we currently have the
1274	 * txg stopped from syncing.
1275	 */
1276	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1277
1278	if (error == 0 && zgd->zgd_bp)
1279		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1280
1281	kmem_free(zgd, sizeof (zgd_t));
1282}
1283
#ifdef DEBUG
/*
 * Fault-injection switch for DEBUG kernels: while non-zero, the indirect
 * path of zfs_get_data() fails once with EIO and then clears the switch.
 */
static int zil_fault_io;
#endif
1287
1288/*
1289 * Get data to generate a TX_WRITE intent log record.
1290 */
1291int
1292zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1293{
1294	zfsvfs_t *zfsvfs = arg;
1295	objset_t *os = zfsvfs->z_os;
1296	znode_t *zp;
1297	uint64_t object = lr->lr_foid;
1298	uint64_t offset = lr->lr_offset;
1299	uint64_t size = lr->lr_length;
1300	blkptr_t *bp = &lr->lr_blkptr;
1301	dmu_buf_t *db;
1302	zgd_t *zgd;
1303	int error = 0;
1304
1305	ASSERT(zio != NULL);
1306	ASSERT(size != 0);
1307
1308	/*
1309	 * Nothing to do if the file has been removed
1310	 */
1311	if (zfs_zget(zfsvfs, object, &zp) != 0)
1312		return (SET_ERROR(ENOENT));
1313	if (zp->z_unlinked) {
1314		/*
1315		 * Release the vnode asynchronously as we currently have the
1316		 * txg stopped from syncing.
1317		 */
1318		VN_RELE_ASYNC(ZTOV(zp),
1319		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1320		return (SET_ERROR(ENOENT));
1321	}
1322
1323	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1324	zgd->zgd_zilog = zfsvfs->z_log;
1325	zgd->zgd_private = zp;
1326
1327	/*
1328	 * Write records come in two flavors: immediate and indirect.
1329	 * For small writes it's cheaper to store the data with the
1330	 * log record (immediate); for large writes it's cheaper to
1331	 * sync the data and get a pointer to it (indirect) so that
1332	 * we don't have to write the data twice.
1333	 */
1334	if (buf != NULL) { /* immediate write */
1335		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1336		/* test for truncation needs to be done while range locked */
1337		if (offset >= zp->z_size) {
1338			error = SET_ERROR(ENOENT);
1339		} else {
1340			error = dmu_read(os, object, offset, size, buf,
1341			    DMU_READ_NO_PREFETCH);
1342		}
1343		ASSERT(error == 0 || error == ENOENT);
1344	} else { /* indirect write */
1345		/*
1346		 * Have to lock the whole block to ensure when it's
1347		 * written out and it's checksum is being calculated
1348		 * that no one can change the data. We need to re-check
1349		 * blocksize after we get the lock in case it's changed!
1350		 */
1351		for (;;) {
1352			uint64_t blkoff;
1353			size = zp->z_blksz;
1354			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1355			offset -= blkoff;
1356			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1357			    RL_READER);
1358			if (zp->z_blksz == size)
1359				break;
1360			offset += blkoff;
1361			zfs_range_unlock(zgd->zgd_rl);
1362		}
1363		/* test for truncation needs to be done while range locked */
1364		if (lr->lr_offset >= zp->z_size)
1365			error = SET_ERROR(ENOENT);
1366#ifdef DEBUG
1367		if (zil_fault_io) {
1368			error = SET_ERROR(EIO);
1369			zil_fault_io = 0;
1370		}
1371#endif
1372		if (error == 0)
1373			error = dmu_buf_hold(os, object, offset, zgd, &db,
1374			    DMU_READ_NO_PREFETCH);
1375
1376		if (error == 0) {
1377			blkptr_t *obp = dmu_buf_get_blkptr(db);
1378			if (obp) {
1379				ASSERT(BP_IS_HOLE(bp));
1380				*bp = *obp;
1381			}
1382
1383			zgd->zgd_db = db;
1384			zgd->zgd_bp = bp;
1385
1386			ASSERT(db->db_offset == offset);
1387			ASSERT(db->db_size == size);
1388
1389			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1390			    zfs_get_done, zgd);
1391			ASSERT(error || lr->lr_length <= zp->z_blksz);
1392
1393			/*
1394			 * On success, we need to wait for the write I/O
1395			 * initiated by dmu_sync() to complete before we can
1396			 * release this dbuf.  We will finish everything up
1397			 * in the zfs_get_done() callback.
1398			 */
1399			if (error == 0)
1400				return (0);
1401
1402			if (error == EALREADY) {
1403				lr->lr_common.lrc_txtype = TX_WRITE2;
1404				error = 0;
1405			}
1406		}
1407	}
1408
1409	zfs_get_done(zgd, error);
1410
1411	return (error);
1412}
1413
1414/*ARGSUSED*/
1415static int
1416zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1417    caller_context_t *ct)
1418{
1419	znode_t *zp = VTOZ(vp);
1420	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1421	int error;
1422
1423	ZFS_ENTER(zfsvfs);
1424	ZFS_VERIFY_ZP(zp);
1425
1426	if (flag & V_ACE_MASK)
1427		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1428	else
1429		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1430
1431	ZFS_EXIT(zfsvfs);
1432	return (error);
1433}
1434
/*
 * Callback given to vn_vget_ino_gen() for ".." lookups: hands back the
 * vnode passed in arg and locks it with lkflags.  If locking fails the
 * reference on that vnode is dropped.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	struct vnode *target = arg;
	int rc;

	*vpp = target;
	rc = vn_lock(target, lkflags);
	if (rc != 0)
		vrele(target);
	return (rc);
}
1446
1447static int
1448zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
1449{
1450	znode_t *zdp = VTOZ(dvp);
1451	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1452	int error;
1453	int ltype;
1454
1455	ASSERT_VOP_LOCKED(dvp, __func__);
1456#ifdef DIAGNOSTIC
1457	ASSERT(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
1458#endif
1459
1460	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
1461		ASSERT3P(dvp, ==, vp);
1462		vref(dvp);
1463		ltype = lkflags & LK_TYPE_MASK;
1464		if (ltype != VOP_ISLOCKED(dvp)) {
1465			if (ltype == LK_EXCLUSIVE)
1466				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
1467			else /* if (ltype == LK_SHARED) */
1468				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
1469
1470			/*
1471			 * Relock for the "." case could leave us with
1472			 * reclaimed vnode.
1473			 */
1474			if (dvp->v_iflag & VI_DOOMED) {
1475				vrele(dvp);
1476				return (SET_ERROR(ENOENT));
1477			}
1478		}
1479		return (0);
1480	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
1481		/*
1482		 * Note that in this case, dvp is the child vnode, and we
1483		 * are looking up the parent vnode - exactly reverse from
1484		 * normal operation.  Unlocking dvp requires some rather
1485		 * tricky unlock/relock dance to prevent mp from being freed;
1486		 * use vn_vget_ino_gen() which takes care of all that.
1487		 *
1488		 * XXX Note that there is a time window when both vnodes are
1489		 * unlocked.  It is possible, although highly unlikely, that
1490		 * during that window the parent-child relationship between
1491		 * the vnodes may change, for example, get reversed.
1492		 * In that case we would have a wrong lock order for the vnodes.
1493		 * All other filesystems seem to ignore this problem, so we
1494		 * do the same here.
1495		 * A potential solution could be implemented as follows:
1496		 * - using LK_NOWAIT when locking the second vnode and retrying
1497		 *   if necessary
1498		 * - checking that the parent-child relationship still holds
1499		 *   after locking both vnodes and retrying if it doesn't
1500		 */
1501		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
1502		return (error);
1503	} else {
1504		error = vn_lock(vp, lkflags);
1505		if (error != 0)
1506			vrele(vp);
1507		return (error);
1508	}
1509}
1510
1511/*
1512 * Lookup an entry in a directory, or an extended attribute directory.
1513 * If it exists, return a held vnode reference for it.
1514 *
1515 *	IN:	dvp	- vnode of directory to search.
1516 *		nm	- name of entry to lookup.
1517 *		pnp	- full pathname to lookup [UNUSED].
1518 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1519 *		rdir	- root directory vnode [UNUSED].
1520 *		cr	- credentials of caller.
1521 *		ct	- caller context
1522 *
1523 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1524 *
1525 *	RETURN:	0 on success, error code on failure.
1526 *
1527 * Timestamps:
1528 *	NA
1529 */
1530/* ARGSUSED */
1531static int
1532zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1533    int nameiop, cred_t *cr, kthread_t *td, int flags)
1534{
1535	znode_t *zdp = VTOZ(dvp);
1536	znode_t *zp;
1537	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1538	int	error = 0;
1539
1540	/* fast path (should be redundant with vfs namecache) */
1541	if (!(flags & LOOKUP_XATTR)) {
1542		if (dvp->v_type != VDIR) {
1543			return (SET_ERROR(ENOTDIR));
1544		} else if (zdp->z_sa_hdl == NULL) {
1545			return (SET_ERROR(EIO));
1546		}
1547	}
1548
1549	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1550
1551	ZFS_ENTER(zfsvfs);
1552	ZFS_VERIFY_ZP(zdp);
1553
1554	*vpp = NULL;
1555
1556	if (flags & LOOKUP_XATTR) {
1557#ifdef TODO
1558		/*
1559		 * If the xattr property is off, refuse the lookup request.
1560		 */
1561		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1562			ZFS_EXIT(zfsvfs);
1563			return (SET_ERROR(EINVAL));
1564		}
1565#endif
1566
1567		/*
1568		 * We don't allow recursive attributes..
1569		 * Maybe someday we will.
1570		 */
1571		if (zdp->z_pflags & ZFS_XATTR) {
1572			ZFS_EXIT(zfsvfs);
1573			return (SET_ERROR(EINVAL));
1574		}
1575
1576		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1577			ZFS_EXIT(zfsvfs);
1578			return (error);
1579		}
1580
1581		/*
1582		 * Do we have permission to get into attribute directory?
1583		 */
1584		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1585		    B_FALSE, cr)) {
1586			vrele(*vpp);
1587			*vpp = NULL;
1588		}
1589
1590		ZFS_EXIT(zfsvfs);
1591		return (error);
1592	}
1593
1594	/*
1595	 * Check accessibility of directory.
1596	 */
1597	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1598		ZFS_EXIT(zfsvfs);
1599		return (error);
1600	}
1601
1602	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1603	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1604		ZFS_EXIT(zfsvfs);
1605		return (SET_ERROR(EILSEQ));
1606	}
1607
1608
1609	/*
1610	 * First handle the special cases.
1611	 */
1612	if ((cnp->cn_flags & ISDOTDOT) != 0) {
1613		/*
1614		 * If we are a snapshot mounted under .zfs, return
1615		 * the vp for the snapshot directory.
1616		 */
1617		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1618			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
1619			    "snapshot", vpp, NULL, 0, NULL, kcred,
1620			    NULL, NULL, NULL);
1621			ZFS_EXIT(zfsvfs);
1622			if (error == 0) {
1623				error = zfs_lookup_lock(dvp, *vpp, nm,
1624				    cnp->cn_lkflags);
1625			}
1626			goto out;
1627		}
1628	}
1629	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1630		error = 0;
1631		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1632			error = SET_ERROR(ENOTSUP);
1633		else
1634			*vpp = zfsctl_root(zdp);
1635		ZFS_EXIT(zfsvfs);
1636		if (error == 0)
1637			error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1638		goto out;
1639	}
1640
1641	/*
1642	 * The loop is retry the lookup if the parent-child relationship
1643	 * changes during the dot-dot locking complexities.
1644	 */
1645	for (;;) {
1646		uint64_t parent;
1647
1648		error = zfs_dirlook(zdp, nm, &zp);
1649		if (error == 0)
1650			*vpp = ZTOV(zp);
1651
1652		ZFS_EXIT(zfsvfs);
1653		if (error != 0)
1654			break;
1655
1656		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1657		if (error != 0) {
1658			/*
1659			 * If we've got a locking error, then the vnode
1660			 * got reclaimed because of a force unmount.
1661			 * We never enter doomed vnodes into the name cache.
1662			 */
1663			*vpp = NULL;
1664			return (error);
1665		}
1666
1667		if ((cnp->cn_flags & ISDOTDOT) == 0)
1668			break;
1669
1670		ZFS_ENTER(zfsvfs);
1671		if (zdp->z_sa_hdl == NULL) {
1672			error = SET_ERROR(EIO);
1673		} else {
1674			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1675			    &parent, sizeof (parent));
1676		}
1677		if (error != 0) {
1678			ZFS_EXIT(zfsvfs);
1679			vput(ZTOV(zp));
1680			break;
1681		}
1682		if (zp->z_id == parent) {
1683			ZFS_EXIT(zfsvfs);
1684			break;
1685		}
1686		vput(ZTOV(zp));
1687	}
1688
1689out:
1690	if (error != 0)
1691		*vpp = NULL;
1692
1693	/* Translate errors and add SAVENAME when needed. */
1694	if (cnp->cn_flags & ISLASTCN) {
1695		switch (nameiop) {
1696		case CREATE:
1697		case RENAME:
1698			if (error == ENOENT) {
1699				error = EJUSTRETURN;
1700				cnp->cn_flags |= SAVENAME;
1701				break;
1702			}
1703			/* FALLTHROUGH */
1704		case DELETE:
1705			if (error == 0)
1706				cnp->cn_flags |= SAVENAME;
1707			break;
1708		}
1709	}
1710
1711	/* Insert name into cache (as non-existent) if appropriate. */
1712	if (zfsvfs->z_use_namecache &&
1713	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1714		cache_enter(dvp, NULL, cnp);
1715
1716	/* Insert name into cache if appropriate. */
1717	if (zfsvfs->z_use_namecache &&
1718	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1719		if (!(cnp->cn_flags & ISLASTCN) ||
1720		    (nameiop != DELETE && nameiop != RENAME)) {
1721			cache_enter(dvp, *vpp, cnp);
1722		}
1723	}
1724
1725	return (error);
1726}
1727
1728/*
1729 * Attempt to create a new entry in a directory.  If the entry
1730 * already exists, truncate the file if permissible, else return
1731 * an error.  Return the vp of the created or trunc'd file.
1732 *
1733 *	IN:	dvp	- vnode of directory to put new file entry in.
1734 *		name	- name of new file entry.
1735 *		vap	- attributes of new file.
1736 *		excl	- flag indicating exclusive or non-exclusive mode.
1737 *		mode	- mode to open file with.
1738 *		cr	- credentials of caller.
1739 *		flag	- large file flag [UNUSED].
1740 *		ct	- caller context
1741 *		vsecp	- ACL to be set
1742 *
1743 *	OUT:	vpp	- vnode of created or trunc'd entry.
1744 *
1745 *	RETURN:	0 on success, error code on failure.
1746 *
1747 * Timestamps:
1748 *	dvp - ctime|mtime updated if new entry created
1749 *	 vp - ctime|mtime always, atime if new
1750 */
1751
1752/* ARGSUSED */
1753static int
1754zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1755    vnode_t **vpp, cred_t *cr, kthread_t *td)
1756{
1757	znode_t		*zp, *dzp = VTOZ(dvp);
1758	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1759	zilog_t		*zilog;
1760	objset_t	*os;
1761	dmu_tx_t	*tx;
1762	int		error;
1763	ksid_t		*ksid;
1764	uid_t		uid;
1765	gid_t		gid = crgetgid(cr);
1766	zfs_acl_ids_t   acl_ids;
1767	boolean_t	fuid_dirtied;
1768	void		*vsecp = NULL;
1769	int		flag = 0;
1770	uint64_t	txtype;
1771
1772	/*
1773	 * If we have an ephemeral id, ACL, or XVATTR then
1774	 * make sure file system is at proper version
1775	 */
1776
1777	ksid = crgetsid(cr, KSID_OWNER);
1778	if (ksid)
1779		uid = ksid_getid(ksid);
1780	else
1781		uid = crgetuid(cr);
1782
1783	if (zfsvfs->z_use_fuids == B_FALSE &&
1784	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1785	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1786		return (SET_ERROR(EINVAL));
1787
1788	ZFS_ENTER(zfsvfs);
1789	ZFS_VERIFY_ZP(dzp);
1790	os = zfsvfs->z_os;
1791	zilog = zfsvfs->z_log;
1792
1793	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1794	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1795		ZFS_EXIT(zfsvfs);
1796		return (SET_ERROR(EILSEQ));
1797	}
1798
1799	if (vap->va_mask & AT_XVATTR) {
1800		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1801		    crgetuid(cr), cr, vap->va_type)) != 0) {
1802			ZFS_EXIT(zfsvfs);
1803			return (error);
1804		}
1805	}
1806
1807	*vpp = NULL;
1808
1809	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1810		vap->va_mode &= ~S_ISVTX;
1811
1812	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1813	if (error) {
1814		ZFS_EXIT(zfsvfs);
1815		return (error);
1816	}
1817	ASSERT3P(zp, ==, NULL);
1818
1819	/*
1820	 * Create a new file object and update the directory
1821	 * to reference it.
1822	 */
1823	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1824		goto out;
1825	}
1826
1827	/*
1828	 * We only support the creation of regular files in
1829	 * extended attribute directories.
1830	 */
1831
1832	if ((dzp->z_pflags & ZFS_XATTR) &&
1833	    (vap->va_type != VREG)) {
1834		error = SET_ERROR(EINVAL);
1835		goto out;
1836	}
1837
1838	if ((error = zfs_acl_ids_create(dzp, 0, vap,
1839	    cr, vsecp, &acl_ids)) != 0)
1840		goto out;
1841
1842	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1843		zfs_acl_ids_free(&acl_ids);
1844		error = SET_ERROR(EDQUOT);
1845		goto out;
1846	}
1847
1848	getnewvnode_reserve(1);
1849
1850	tx = dmu_tx_create(os);
1851
1852	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1853	    ZFS_SA_BASE_ATTR_SIZE);
1854
1855	fuid_dirtied = zfsvfs->z_fuid_dirty;
1856	if (fuid_dirtied)
1857		zfs_fuid_txhold(zfsvfs, tx);
1858	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1859	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1860	if (!zfsvfs->z_use_sa &&
1861	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1862		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1863		    0, acl_ids.z_aclp->z_acl_bytes);
1864	}
1865	error = dmu_tx_assign(tx, TXG_WAIT);
1866	if (error) {
1867		zfs_acl_ids_free(&acl_ids);
1868		dmu_tx_abort(tx);
1869		getnewvnode_drop_reserve();
1870		ZFS_EXIT(zfsvfs);
1871		return (error);
1872	}
1873	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1874
1875	if (fuid_dirtied)
1876		zfs_fuid_sync(zfsvfs, tx);
1877
1878	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1879	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1880	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1881	    vsecp, acl_ids.z_fuidp, vap);
1882	zfs_acl_ids_free(&acl_ids);
1883	dmu_tx_commit(tx);
1884
1885	getnewvnode_drop_reserve();
1886
1887out:
1888	if (error == 0) {
1889		*vpp = ZTOV(zp);
1890	}
1891
1892	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1893		zil_commit(zilog, 0);
1894
1895	ZFS_EXIT(zfsvfs);
1896	return (error);
1897}
1898
1899/*
1900 * Remove an entry from a directory.
1901 *
1902 *	IN:	dvp	- vnode of directory to remove entry from.
1903 *		name	- name of entry to remove.
1904 *		cr	- credentials of caller.
1905 *		ct	- caller context
1906 *		flags	- case flags
1907 *
1908 *	RETURN:	0 on success, error code on failure.
1909 *
1910 * Timestamps:
1911 *	dvp - ctime|mtime
1912 *	 vp - ctime (if nlink > 0)
1913 */
1914
1915/*ARGSUSED*/
1916static int
1917zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1918{
1919	znode_t		*dzp = VTOZ(dvp);
1920	znode_t		*zp = VTOZ(vp);
1921	znode_t		*xzp;
1922	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1923	zilog_t		*zilog;
1924	uint64_t	acl_obj, xattr_obj;
1925	uint64_t	obj = 0;
1926	dmu_tx_t	*tx;
1927	boolean_t	unlinked, toobig = FALSE;
1928	uint64_t	txtype;
1929	int		error;
1930
1931	ZFS_ENTER(zfsvfs);
1932	ZFS_VERIFY_ZP(dzp);
1933	ZFS_VERIFY_ZP(zp);
1934	zilog = zfsvfs->z_log;
1935	zp = VTOZ(vp);
1936
1937	xattr_obj = 0;
1938	xzp = NULL;
1939
1940	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1941		goto out;
1942	}
1943
1944	/*
1945	 * Need to use rmdir for removing directories.
1946	 */
1947	if (vp->v_type == VDIR) {
1948		error = SET_ERROR(EPERM);
1949		goto out;
1950	}
1951
1952	vnevent_remove(vp, dvp, name, ct);
1953
1954	obj = zp->z_id;
1955
1956	/* are there any extended attributes? */
1957	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1958	    &xattr_obj, sizeof (xattr_obj));
1959	if (error == 0 && xattr_obj) {
1960		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1961		ASSERT0(error);
1962	}
1963
1964	/*
1965	 * We may delete the znode now, or we may put it in the unlinked set;
1966	 * it depends on whether we're the last link, and on whether there are
1967	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1968	 * allow for either case.
1969	 */
1970	tx = dmu_tx_create(zfsvfs->z_os);
1971	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1972	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1973	zfs_sa_upgrade_txholds(tx, zp);
1974	zfs_sa_upgrade_txholds(tx, dzp);
1975
1976	if (xzp) {
1977		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1978		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1979	}
1980
1981	/* charge as an update -- would be nice not to charge at all */
1982	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1983
1984	/*
1985	 * Mark this transaction as typically resulting in a net free of space
1986	 */
1987	dmu_tx_mark_netfree(tx);
1988
1989	error = dmu_tx_assign(tx, TXG_WAIT);
1990	if (error) {
1991		dmu_tx_abort(tx);
1992		ZFS_EXIT(zfsvfs);
1993		return (error);
1994	}
1995
1996	/*
1997	 * Remove the directory entry.
1998	 */
1999	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2000
2001	if (error) {
2002		dmu_tx_commit(tx);
2003		goto out;
2004	}
2005
2006	if (unlinked) {
2007		zfs_unlinked_add(zp, tx);
2008		vp->v_vflag |= VV_NOSYNC;
2009	}
2010
2011	txtype = TX_REMOVE;
2012	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2013
2014	dmu_tx_commit(tx);
2015out:
2016
2017	if (xzp)
2018		vrele(ZTOV(xzp));
2019
2020	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2021		zil_commit(zilog, 0);
2022
2023	ZFS_EXIT(zfsvfs);
2024	return (error);
2025}
2026
2027/*
2028 * Create a new directory and insert it into dvp using the name
2029 * provided.  Return a pointer to the inserted directory.
2030 *
2031 *	IN:	dvp	- vnode of directory to add subdir to.
2032 *		dirname	- name of new directory.
2033 *		vap	- attributes of new directory.
2034 *		cr	- credentials of caller.
2035 *		ct	- caller context
2036 *		flags	- case flags
2037 *		vsecp	- ACL to be set
2038 *
2039 *	OUT:	vpp	- vnode of created directory.
2040 *
2041 *	RETURN:	0 on success, error code on failure.
2042 *
2043 * Timestamps:
2044 *	dvp - ctime|mtime updated
2045 *	 vp - ctime|mtime|atime updated
2046 */
2047/*ARGSUSED*/
2048static int
2049zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2050{
2051	znode_t		*zp, *dzp = VTOZ(dvp);
2052	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2053	zilog_t		*zilog;
2054	uint64_t	txtype;
2055	dmu_tx_t	*tx;
2056	int		error;
2057	ksid_t		*ksid;
2058	uid_t		uid;
2059	gid_t		gid = crgetgid(cr);
2060	zfs_acl_ids_t   acl_ids;
2061	boolean_t	fuid_dirtied;
2062
2063	ASSERT(vap->va_type == VDIR);
2064
2065	/*
2066	 * If we have an ephemeral id, ACL, or XVATTR then
2067	 * make sure file system is at proper version
2068	 */
2069
2070	ksid = crgetsid(cr, KSID_OWNER);
2071	if (ksid)
2072		uid = ksid_getid(ksid);
2073	else
2074		uid = crgetuid(cr);
2075	if (zfsvfs->z_use_fuids == B_FALSE &&
2076	    ((vap->va_mask & AT_XVATTR) ||
2077	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2078		return (SET_ERROR(EINVAL));
2079
2080	ZFS_ENTER(zfsvfs);
2081	ZFS_VERIFY_ZP(dzp);
2082	zilog = zfsvfs->z_log;
2083
2084	if (dzp->z_pflags & ZFS_XATTR) {
2085		ZFS_EXIT(zfsvfs);
2086		return (SET_ERROR(EINVAL));
2087	}
2088
2089	if (zfsvfs->z_utf8 && u8_validate(dirname,
2090	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2091		ZFS_EXIT(zfsvfs);
2092		return (SET_ERROR(EILSEQ));
2093	}
2094
2095	if (vap->va_mask & AT_XVATTR) {
2096		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2097		    crgetuid(cr), cr, vap->va_type)) != 0) {
2098			ZFS_EXIT(zfsvfs);
2099			return (error);
2100		}
2101	}
2102
2103	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2104	    NULL, &acl_ids)) != 0) {
2105		ZFS_EXIT(zfsvfs);
2106		return (error);
2107	}
2108
2109	/*
2110	 * First make sure the new directory doesn't exist.
2111	 *
2112	 * Existence is checked first to make sure we don't return
2113	 * EACCES instead of EEXIST which can cause some applications
2114	 * to fail.
2115	 */
2116	*vpp = NULL;
2117
2118	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2119		zfs_acl_ids_free(&acl_ids);
2120		ZFS_EXIT(zfsvfs);
2121		return (error);
2122	}
2123	ASSERT3P(zp, ==, NULL);
2124
2125	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2126		zfs_acl_ids_free(&acl_ids);
2127		ZFS_EXIT(zfsvfs);
2128		return (error);
2129	}
2130
2131	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2132		zfs_acl_ids_free(&acl_ids);
2133		ZFS_EXIT(zfsvfs);
2134		return (SET_ERROR(EDQUOT));
2135	}
2136
2137	/*
2138	 * Add a new entry to the directory.
2139	 */
2140	getnewvnode_reserve(1);
2141	tx = dmu_tx_create(zfsvfs->z_os);
2142	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2143	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2144	fuid_dirtied = zfsvfs->z_fuid_dirty;
2145	if (fuid_dirtied)
2146		zfs_fuid_txhold(zfsvfs, tx);
2147	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2148		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2149		    acl_ids.z_aclp->z_acl_bytes);
2150	}
2151
2152	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2153	    ZFS_SA_BASE_ATTR_SIZE);
2154
2155	error = dmu_tx_assign(tx, TXG_WAIT);
2156	if (error) {
2157		zfs_acl_ids_free(&acl_ids);
2158		dmu_tx_abort(tx);
2159		getnewvnode_drop_reserve();
2160		ZFS_EXIT(zfsvfs);
2161		return (error);
2162	}
2163
2164	/*
2165	 * Create new node.
2166	 */
2167	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2168
2169	if (fuid_dirtied)
2170		zfs_fuid_sync(zfsvfs, tx);
2171
2172	/*
2173	 * Now put new name in parent dir.
2174	 */
2175	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2176
2177	*vpp = ZTOV(zp);
2178
2179	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2180	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2181	    acl_ids.z_fuidp, vap);
2182
2183	zfs_acl_ids_free(&acl_ids);
2184
2185	dmu_tx_commit(tx);
2186
2187	getnewvnode_drop_reserve();
2188
2189	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2190		zil_commit(zilog, 0);
2191
2192	ZFS_EXIT(zfsvfs);
2193	return (0);
2194}
2195
2196/*
2197 * Remove a directory subdir entry.  If the current working
2198 * directory is the same as the subdir to be removed, the
2199 * remove will fail.
2200 *
2201 *	IN:	dvp	- vnode of directory to remove from.
2202 *		name	- name of directory to be removed.
2203 *		cwd	- vnode of current working directory.
2204 *		cr	- credentials of caller.
2205 *		ct	- caller context
2206 *		flags	- case flags
2207 *
2208 *	RETURN:	0 on success, error code on failure.
2209 *
2210 * Timestamps:
2211 *	dvp - ctime|mtime updated
2212 */
2213/*ARGSUSED*/
2214static int
2215zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2216{
2217	znode_t		*dzp = VTOZ(dvp);
2218	znode_t		*zp = VTOZ(vp);
2219	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2220	zilog_t		*zilog;
2221	dmu_tx_t	*tx;
2222	int		error;
2223
2224	ZFS_ENTER(zfsvfs);
2225	ZFS_VERIFY_ZP(dzp);
2226	ZFS_VERIFY_ZP(zp);
2227	zilog = zfsvfs->z_log;
2228
2229
2230	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2231		goto out;
2232	}
2233
2234	if (vp->v_type != VDIR) {
2235		error = SET_ERROR(ENOTDIR);
2236		goto out;
2237	}
2238
2239	vnevent_rmdir(vp, dvp, name, ct);
2240
2241	tx = dmu_tx_create(zfsvfs->z_os);
2242	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2243	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2244	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2245	zfs_sa_upgrade_txholds(tx, zp);
2246	zfs_sa_upgrade_txholds(tx, dzp);
2247	dmu_tx_mark_netfree(tx);
2248	error = dmu_tx_assign(tx, TXG_WAIT);
2249	if (error) {
2250		dmu_tx_abort(tx);
2251		ZFS_EXIT(zfsvfs);
2252		return (error);
2253	}
2254
2255	cache_purge(dvp);
2256
2257	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2258
2259	if (error == 0) {
2260		uint64_t txtype = TX_RMDIR;
2261		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2262	}
2263
2264	dmu_tx_commit(tx);
2265
2266	cache_purge(vp);
2267out:
2268	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2269		zil_commit(zilog, 0);
2270
2271	ZFS_EXIT(zfsvfs);
2272	return (error);
2273}
2274
2275/*
2276 * Read as many directory entries as will fit into the provided
2277 * buffer from the given directory cursor position (specified in
2278 * the uio structure).
2279 *
2280 *	IN:	vp	- vnode of directory to read.
2281 *		uio	- structure supplying read location, range info,
2282 *			  and return buffer.
2283 *		cr	- credentials of caller.
2284 *		ct	- caller context
2285 *		flags	- case flags
2286 *
2287 *	OUT:	uio	- updated offset and range, buffer filled.
2288 *		eofp	- set to true if end-of-file detected.
2289 *
2290 *	RETURN:	0 on success, error code on failure.
2291 *
2292 * Timestamps:
2293 *	vp - atime updated
2294 *
2295 * Note that the low 4 bits of the cookie returned by zap is always zero.
2296 * This allows us to use the low range for "special" directory entries:
2297 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2298 * we use the offset 2 for the '.zfs' directory.
2299 */
2300/* ARGSUSED */
2301static int
2302zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2303{
2304	znode_t		*zp = VTOZ(vp);
2305	iovec_t		*iovp;
2306	edirent_t	*eodp;
2307	dirent64_t	*odp;
2308	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2309	objset_t	*os;
2310	caddr_t		outbuf;
2311	size_t		bufsize;
2312	zap_cursor_t	zc;
2313	zap_attribute_t	zap;
2314	uint_t		bytes_wanted;
2315	uint64_t	offset; /* must be unsigned; checks for < 1 */
2316	uint64_t	parent;
2317	int		local_eof;
2318	int		outcount;
2319	int		error;
2320	uint8_t		prefetch;
2321	boolean_t	check_sysattrs;
2322	uint8_t		type;
2323	int		ncooks;
2324	u_long		*cooks = NULL;
2325	int		flags = 0;
2326
2327	ZFS_ENTER(zfsvfs);
2328	ZFS_VERIFY_ZP(zp);
2329
2330	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2331	    &parent, sizeof (parent))) != 0) {
2332		ZFS_EXIT(zfsvfs);
2333		return (error);
2334	}
2335
2336	/*
2337	 * If we are not given an eof variable,
2338	 * use a local one.
2339	 */
2340	if (eofp == NULL)
2341		eofp = &local_eof;
2342
2343	/*
2344	 * Check for valid iov_len.
2345	 */
2346	if (uio->uio_iov->iov_len <= 0) {
2347		ZFS_EXIT(zfsvfs);
2348		return (SET_ERROR(EINVAL));
2349	}
2350
2351	/*
2352	 * Quit if directory has been removed (posix)
2353	 */
2354	if ((*eofp = zp->z_unlinked) != 0) {
2355		ZFS_EXIT(zfsvfs);
2356		return (0);
2357	}
2358
2359	error = 0;
2360	os = zfsvfs->z_os;
2361	offset = uio->uio_loffset;
2362	prefetch = zp->z_zn_prefetch;
2363
2364	/*
2365	 * Initialize the iterator cursor.
2366	 */
2367	if (offset <= 3) {
2368		/*
2369		 * Start iteration from the beginning of the directory.
2370		 */
2371		zap_cursor_init(&zc, os, zp->z_id);
2372	} else {
2373		/*
2374		 * The offset is a serialized cursor.
2375		 */
2376		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2377	}
2378
2379	/*
2380	 * Get space to change directory entries into fs independent format.
2381	 */
2382	iovp = uio->uio_iov;
2383	bytes_wanted = iovp->iov_len;
2384	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2385		bufsize = bytes_wanted;
2386		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2387		odp = (struct dirent64 *)outbuf;
2388	} else {
2389		bufsize = bytes_wanted;
2390		outbuf = NULL;
2391		odp = (struct dirent64 *)iovp->iov_base;
2392	}
2393	eodp = (struct edirent *)odp;
2394
2395	if (ncookies != NULL) {
2396		/*
2397		 * Minimum entry size is dirent size and 1 byte for a file name.
2398		 */
2399		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2400		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2401		*cookies = cooks;
2402		*ncookies = ncooks;
2403	}
2404	/*
2405	 * If this VFS supports the system attribute view interface; and
2406	 * we're looking at an extended attribute directory; and we care
2407	 * about normalization conflicts on this vfs; then we must check
2408	 * for normalization conflicts with the sysattr name space.
2409	 */
2410#ifdef TODO
2411	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2412	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2413	    (flags & V_RDDIR_ENTFLAGS);
2414#else
2415	check_sysattrs = 0;
2416#endif
2417
2418	/*
2419	 * Transform to file-system independent format
2420	 */
2421	outcount = 0;
2422	while (outcount < bytes_wanted) {
2423		ino64_t objnum;
2424		ushort_t reclen;
2425		off64_t *next = NULL;
2426
2427		/*
2428		 * Special case `.', `..', and `.zfs'.
2429		 */
2430		if (offset == 0) {
2431			(void) strcpy(zap.za_name, ".");
2432			zap.za_normalization_conflict = 0;
2433			objnum = zp->z_id;
2434			type = DT_DIR;
2435		} else if (offset == 1) {
2436			(void) strcpy(zap.za_name, "..");
2437			zap.za_normalization_conflict = 0;
2438			objnum = parent;
2439			type = DT_DIR;
2440		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2441			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2442			zap.za_normalization_conflict = 0;
2443			objnum = ZFSCTL_INO_ROOT;
2444			type = DT_DIR;
2445		} else {
2446			/*
2447			 * Grab next entry.
2448			 */
2449			if (error = zap_cursor_retrieve(&zc, &zap)) {
2450				if ((*eofp = (error == ENOENT)) != 0)
2451					break;
2452				else
2453					goto update;
2454			}
2455
2456			if (zap.za_integer_length != 8 ||
2457			    zap.za_num_integers != 1) {
2458				cmn_err(CE_WARN, "zap_readdir: bad directory "
2459				    "entry, obj = %lld, offset = %lld\n",
2460				    (u_longlong_t)zp->z_id,
2461				    (u_longlong_t)offset);
2462				error = SET_ERROR(ENXIO);
2463				goto update;
2464			}
2465
2466			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2467			/*
2468			 * MacOS X can extract the object type here such as:
2469			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2470			 */
2471			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2472
2473			if (check_sysattrs && !zap.za_normalization_conflict) {
2474#ifdef TODO
2475				zap.za_normalization_conflict =
2476				    xattr_sysattr_casechk(zap.za_name);
2477#else
2478				panic("%s:%u: TODO", __func__, __LINE__);
2479#endif
2480			}
2481		}
2482
2483		if (flags & V_RDDIR_ACCFILTER) {
2484			/*
2485			 * If we have no access at all, don't include
2486			 * this entry in the returned information
2487			 */
2488			znode_t	*ezp;
2489			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2490				goto skip_entry;
2491			if (!zfs_has_access(ezp, cr)) {
2492				vrele(ZTOV(ezp));
2493				goto skip_entry;
2494			}
2495			vrele(ZTOV(ezp));
2496		}
2497
2498		if (flags & V_RDDIR_ENTFLAGS)
2499			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2500		else
2501			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2502
2503		/*
2504		 * Will this entry fit in the buffer?
2505		 */
2506		if (outcount + reclen > bufsize) {
2507			/*
2508			 * Did we manage to fit anything in the buffer?
2509			 */
2510			if (!outcount) {
2511				error = SET_ERROR(EINVAL);
2512				goto update;
2513			}
2514			break;
2515		}
2516		if (flags & V_RDDIR_ENTFLAGS) {
2517			/*
2518			 * Add extended flag entry:
2519			 */
2520			eodp->ed_ino = objnum;
2521			eodp->ed_reclen = reclen;
2522			/* NOTE: ed_off is the offset for the *next* entry */
2523			next = &(eodp->ed_off);
2524			eodp->ed_eflags = zap.za_normalization_conflict ?
2525			    ED_CASE_CONFLICT : 0;
2526			(void) strncpy(eodp->ed_name, zap.za_name,
2527			    EDIRENT_NAMELEN(reclen));
2528			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2529		} else {
2530			/*
2531			 * Add normal entry:
2532			 */
2533			odp->d_ino = objnum;
2534			odp->d_reclen = reclen;
2535			odp->d_namlen = strlen(zap.za_name);
2536			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2537			odp->d_type = type;
2538			odp = (dirent64_t *)((intptr_t)odp + reclen);
2539		}
2540		outcount += reclen;
2541
2542		ASSERT(outcount <= bufsize);
2543
2544		/* Prefetch znode */
2545		if (prefetch)
2546			dmu_prefetch(os, objnum, 0, 0, 0,
2547			    ZIO_PRIORITY_SYNC_READ);
2548
2549	skip_entry:
2550		/*
2551		 * Move to the next entry, fill in the previous offset.
2552		 */
2553		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2554			zap_cursor_advance(&zc);
2555			offset = zap_cursor_serialize(&zc);
2556		} else {
2557			offset += 1;
2558		}
2559
2560		if (cooks != NULL) {
2561			*cooks++ = offset;
2562			ncooks--;
2563			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2564		}
2565	}
2566	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2567
2568	/* Subtract unused cookies */
2569	if (ncookies != NULL)
2570		*ncookies -= ncooks;
2571
2572	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2573		iovp->iov_base += outcount;
2574		iovp->iov_len -= outcount;
2575		uio->uio_resid -= outcount;
2576	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2577		/*
2578		 * Reset the pointer.
2579		 */
2580		offset = uio->uio_loffset;
2581	}
2582
2583update:
2584	zap_cursor_fini(&zc);
2585	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2586		kmem_free(outbuf, bufsize);
2587
2588	if (error == ENOENT)
2589		error = 0;
2590
2591	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2592
2593	uio->uio_loffset = offset;
2594	ZFS_EXIT(zfsvfs);
2595	if (error != 0 && cookies != NULL) {
2596		free(*cookies, M_TEMP);
2597		*cookies = NULL;
2598		*ncookies = 0;
2599	}
2600	return (error);
2601}
2602
2603ulong_t zfs_fsync_sync_cnt = 4;
2604
2605static int
2606zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2607{
2608	znode_t	*zp = VTOZ(vp);
2609	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2610
2611	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2612
2613	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2614		ZFS_ENTER(zfsvfs);
2615		ZFS_VERIFY_ZP(zp);
2616		zil_commit(zfsvfs->z_log, zp->z_id);
2617		ZFS_EXIT(zfsvfs);
2618	}
2619	return (0);
2620}
2621
2622
2623/*
2624 * Get the requested file attributes and place them in the provided
2625 * vattr structure.
2626 *
2627 *	IN:	vp	- vnode of file.
2628 *		vap	- va_mask identifies requested attributes.
2629 *			  If AT_XVATTR set, then optional attrs are requested
2630 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2631 *		cr	- credentials of caller.
2632 *		ct	- caller context
2633 *
2634 *	OUT:	vap	- attribute values.
2635 *
2636 *	RETURN:	0 (always succeeds).
2637 */
2638/* ARGSUSED */
2639static int
2640zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2641    caller_context_t *ct)
2642{
2643	znode_t *zp = VTOZ(vp);
2644	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2645	int	error = 0;
2646	uint32_t blksize;
2647	u_longlong_t nblocks;
2648	uint64_t links;
2649	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2650	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2651	xoptattr_t *xoap = NULL;
2652	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2653	sa_bulk_attr_t bulk[4];
2654	int count = 0;
2655
2656	ZFS_ENTER(zfsvfs);
2657	ZFS_VERIFY_ZP(zp);
2658
2659	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2660
2661	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2662	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2663	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2664	if (vp->v_type == VBLK || vp->v_type == VCHR)
2665		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2666		    &rdev, 8);
2667
2668	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2669		ZFS_EXIT(zfsvfs);
2670		return (error);
2671	}
2672
2673	/*
2674	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2675	 * Also, if we are the owner don't bother, since owner should
2676	 * always be allowed to read basic attributes of file.
2677	 */
2678	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2679	    (vap->va_uid != crgetuid(cr))) {
2680		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2681		    skipaclchk, cr)) {
2682			ZFS_EXIT(zfsvfs);
2683			return (error);
2684		}
2685	}
2686
2687	/*
2688	 * Return all attributes.  It's cheaper to provide the answer
2689	 * than to determine whether we were asked the question.
2690	 */
2691
2692	vap->va_type = IFTOVT(zp->z_mode);
2693	vap->va_mode = zp->z_mode & ~S_IFMT;
2694#ifdef illumos
2695	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2696#else
2697	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2698#endif
2699	vap->va_nodeid = zp->z_id;
2700	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2701		links = zp->z_links + 1;
2702	else
2703		links = zp->z_links;
2704	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2705	vap->va_size = zp->z_size;
2706#ifdef illumos
2707	vap->va_rdev = vp->v_rdev;
2708#else
2709	if (vp->v_type == VBLK || vp->v_type == VCHR)
2710		vap->va_rdev = zfs_cmpldev(rdev);
2711#endif
2712	vap->va_seq = zp->z_seq;
2713	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2714	vap->va_filerev = zp->z_seq;
2715
2716	/*
2717	 * Add in any requested optional attributes and the create time.
2718	 * Also set the corresponding bits in the returned attribute bitmap.
2719	 */
2720	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2721		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2722			xoap->xoa_archive =
2723			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2724			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2725		}
2726
2727		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2728			xoap->xoa_readonly =
2729			    ((zp->z_pflags & ZFS_READONLY) != 0);
2730			XVA_SET_RTN(xvap, XAT_READONLY);
2731		}
2732
2733		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2734			xoap->xoa_system =
2735			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2736			XVA_SET_RTN(xvap, XAT_SYSTEM);
2737		}
2738
2739		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2740			xoap->xoa_hidden =
2741			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2742			XVA_SET_RTN(xvap, XAT_HIDDEN);
2743		}
2744
2745		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2746			xoap->xoa_nounlink =
2747			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2748			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2749		}
2750
2751		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2752			xoap->xoa_immutable =
2753			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2754			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2755		}
2756
2757		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2758			xoap->xoa_appendonly =
2759			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2760			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2761		}
2762
2763		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2764			xoap->xoa_nodump =
2765			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2766			XVA_SET_RTN(xvap, XAT_NODUMP);
2767		}
2768
2769		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2770			xoap->xoa_opaque =
2771			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2772			XVA_SET_RTN(xvap, XAT_OPAQUE);
2773		}
2774
2775		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2776			xoap->xoa_av_quarantined =
2777			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2778			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2779		}
2780
2781		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2782			xoap->xoa_av_modified =
2783			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2784			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2785		}
2786
2787		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2788		    vp->v_type == VREG) {
2789			zfs_sa_get_scanstamp(zp, xvap);
2790		}
2791
2792		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2793			uint64_t times[2];
2794
2795			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2796			    times, sizeof (times));
2797			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2798			XVA_SET_RTN(xvap, XAT_CREATETIME);
2799		}
2800
2801		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2802			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2803			XVA_SET_RTN(xvap, XAT_REPARSE);
2804		}
2805		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2806			xoap->xoa_generation = zp->z_gen;
2807			XVA_SET_RTN(xvap, XAT_GEN);
2808		}
2809
2810		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2811			xoap->xoa_offline =
2812			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2813			XVA_SET_RTN(xvap, XAT_OFFLINE);
2814		}
2815
2816		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2817			xoap->xoa_sparse =
2818			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2819			XVA_SET_RTN(xvap, XAT_SPARSE);
2820		}
2821	}
2822
2823	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2824	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2825	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2826	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2827
2828
2829	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2830	vap->va_blksize = blksize;
2831	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2832
2833	if (zp->z_blksz == 0) {
2834		/*
2835		 * Block size hasn't been set; suggest maximal I/O transfers.
2836		 */
2837		vap->va_blksize = zfsvfs->z_max_blksz;
2838	}
2839
2840	ZFS_EXIT(zfsvfs);
2841	return (0);
2842}
2843
2844/*
2845 * Set the file attributes to the values contained in the
2846 * vattr structure.
2847 *
2848 *	IN:	vp	- vnode of file to be modified.
2849 *		vap	- new attribute values.
2850 *			  If AT_XVATTR set, then optional attrs are being set
2851 *		flags	- ATTR_UTIME set if non-default time values provided.
2852 *			- ATTR_NOACLCHECK (CIFS context only).
2853 *		cr	- credentials of caller.
2854 *		ct	- caller context
2855 *
2856 *	RETURN:	0 on success, error code on failure.
2857 *
2858 * Timestamps:
2859 *	vp - ctime updated, mtime updated if size changed.
2860 */
2861/* ARGSUSED */
2862static int
2863zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2864    caller_context_t *ct)
2865{
2866	znode_t		*zp = VTOZ(vp);
2867	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2868	zilog_t		*zilog;
2869	dmu_tx_t	*tx;
2870	vattr_t		oldva;
2871	xvattr_t	tmpxvattr;
2872	uint_t		mask = vap->va_mask;
2873	uint_t		saved_mask = 0;
2874	uint64_t	saved_mode;
2875	int		trim_mask = 0;
2876	uint64_t	new_mode;
2877	uint64_t	new_uid, new_gid;
2878	uint64_t	xattr_obj;
2879	uint64_t	mtime[2], ctime[2];
2880	znode_t		*attrzp;
2881	int		need_policy = FALSE;
2882	int		err, err2;
2883	zfs_fuid_info_t *fuidp = NULL;
2884	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2885	xoptattr_t	*xoap;
2886	zfs_acl_t	*aclp;
2887	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2888	boolean_t	fuid_dirtied = B_FALSE;
2889	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2890	int		count = 0, xattr_count = 0;
2891
2892	if (mask == 0)
2893		return (0);
2894
2895	if (mask & AT_NOSET)
2896		return (SET_ERROR(EINVAL));
2897
2898	ZFS_ENTER(zfsvfs);
2899	ZFS_VERIFY_ZP(zp);
2900
2901	zilog = zfsvfs->z_log;
2902
2903	/*
2904	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2905	 * that file system is at proper version level
2906	 */
2907
2908	if (zfsvfs->z_use_fuids == B_FALSE &&
2909	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2910	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2911	    (mask & AT_XVATTR))) {
2912		ZFS_EXIT(zfsvfs);
2913		return (SET_ERROR(EINVAL));
2914	}
2915
2916	if (mask & AT_SIZE && vp->v_type == VDIR) {
2917		ZFS_EXIT(zfsvfs);
2918		return (SET_ERROR(EISDIR));
2919	}
2920
2921	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2922		ZFS_EXIT(zfsvfs);
2923		return (SET_ERROR(EINVAL));
2924	}
2925
2926	/*
2927	 * If this is an xvattr_t, then get a pointer to the structure of
2928	 * optional attributes.  If this is NULL, then we have a vattr_t.
2929	 */
2930	xoap = xva_getxoptattr(xvap);
2931
2932	xva_init(&tmpxvattr);
2933
2934	/*
2935	 * Immutable files can only alter immutable bit and atime
2936	 */
2937	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2938	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2939	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2940		ZFS_EXIT(zfsvfs);
2941		return (SET_ERROR(EPERM));
2942	}
2943
2944	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2945		ZFS_EXIT(zfsvfs);
2946		return (SET_ERROR(EPERM));
2947	}
2948
2949	/*
2950	 * Verify timestamps doesn't overflow 32 bits.
2951	 * ZFS can handle large timestamps, but 32bit syscalls can't
2952	 * handle times greater than 2039.  This check should be removed
2953	 * once large timestamps are fully supported.
2954	 */
2955	if (mask & (AT_ATIME | AT_MTIME)) {
2956		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2957		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2958			ZFS_EXIT(zfsvfs);
2959			return (SET_ERROR(EOVERFLOW));
2960		}
2961	}
2962
2963	attrzp = NULL;
2964	aclp = NULL;
2965
2966	/* Can this be moved to before the top label? */
2967	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2968		ZFS_EXIT(zfsvfs);
2969		return (SET_ERROR(EROFS));
2970	}
2971
2972	/*
2973	 * First validate permissions
2974	 */
2975
2976	if (mask & AT_SIZE) {
2977		/*
2978		 * XXX - Note, we are not providing any open
2979		 * mode flags here (like FNDELAY), so we may
2980		 * block if there are locks present... this
2981		 * should be addressed in openat().
2982		 */
2983		/* XXX - would it be OK to generate a log record here? */
2984		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2985		if (err) {
2986			ZFS_EXIT(zfsvfs);
2987			return (err);
2988		}
2989	}
2990
2991	if (mask & (AT_ATIME|AT_MTIME) ||
2992	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2993	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2994	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2995	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2996	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2997	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2998	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2999		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3000		    skipaclchk, cr);
3001	}
3002
3003	if (mask & (AT_UID|AT_GID)) {
3004		int	idmask = (mask & (AT_UID|AT_GID));
3005		int	take_owner;
3006		int	take_group;
3007
3008		/*
3009		 * NOTE: even if a new mode is being set,
3010		 * we may clear S_ISUID/S_ISGID bits.
3011		 */
3012
3013		if (!(mask & AT_MODE))
3014			vap->va_mode = zp->z_mode;
3015
3016		/*
3017		 * Take ownership or chgrp to group we are a member of
3018		 */
3019
3020		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3021		take_group = (mask & AT_GID) &&
3022		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3023
3024		/*
3025		 * If both AT_UID and AT_GID are set then take_owner and
3026		 * take_group must both be set in order to allow taking
3027		 * ownership.
3028		 *
3029		 * Otherwise, send the check through secpolicy_vnode_setattr()
3030		 *
3031		 */
3032
3033		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3034		    ((idmask == AT_UID) && take_owner) ||
3035		    ((idmask == AT_GID) && take_group)) {
3036			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3037			    skipaclchk, cr) == 0) {
3038				/*
3039				 * Remove setuid/setgid for non-privileged users
3040				 */
3041				secpolicy_setid_clear(vap, vp, cr);
3042				trim_mask = (mask & (AT_UID|AT_GID));
3043			} else {
3044				need_policy =  TRUE;
3045			}
3046		} else {
3047			need_policy =  TRUE;
3048		}
3049	}
3050
3051	oldva.va_mode = zp->z_mode;
3052	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3053	if (mask & AT_XVATTR) {
3054		/*
3055		 * Update xvattr mask to include only those attributes
3056		 * that are actually changing.
3057		 *
3058		 * the bits will be restored prior to actually setting
3059		 * the attributes so the caller thinks they were set.
3060		 */
3061		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3062			if (xoap->xoa_appendonly !=
3063			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3064				need_policy = TRUE;
3065			} else {
3066				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3067				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3068			}
3069		}
3070
3071		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3072			if (xoap->xoa_nounlink !=
3073			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3074				need_policy = TRUE;
3075			} else {
3076				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3077				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3078			}
3079		}
3080
3081		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3082			if (xoap->xoa_immutable !=
3083			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3084				need_policy = TRUE;
3085			} else {
3086				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3087				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3088			}
3089		}
3090
3091		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3092			if (xoap->xoa_nodump !=
3093			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3094				need_policy = TRUE;
3095			} else {
3096				XVA_CLR_REQ(xvap, XAT_NODUMP);
3097				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3098			}
3099		}
3100
3101		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3102			if (xoap->xoa_av_modified !=
3103			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3104				need_policy = TRUE;
3105			} else {
3106				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3107				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3108			}
3109		}
3110
3111		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3112			if ((vp->v_type != VREG &&
3113			    xoap->xoa_av_quarantined) ||
3114			    xoap->xoa_av_quarantined !=
3115			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3116				need_policy = TRUE;
3117			} else {
3118				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3119				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3120			}
3121		}
3122
3123		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3124			ZFS_EXIT(zfsvfs);
3125			return (SET_ERROR(EPERM));
3126		}
3127
3128		if (need_policy == FALSE &&
3129		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3130		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3131			need_policy = TRUE;
3132		}
3133	}
3134
3135	if (mask & AT_MODE) {
3136		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3137			err = secpolicy_setid_setsticky_clear(vp, vap,
3138			    &oldva, cr);
3139			if (err) {
3140				ZFS_EXIT(zfsvfs);
3141				return (err);
3142			}
3143			trim_mask |= AT_MODE;
3144		} else {
3145			need_policy = TRUE;
3146		}
3147	}
3148
3149	if (need_policy) {
3150		/*
3151		 * If trim_mask is set then take ownership
3152		 * has been granted or write_acl is present and user
3153		 * has the ability to modify mode.  In that case remove
3154		 * UID|GID and or MODE from mask so that
3155		 * secpolicy_vnode_setattr() doesn't revoke it.
3156		 */
3157
3158		if (trim_mask) {
3159			saved_mask = vap->va_mask;
3160			vap->va_mask &= ~trim_mask;
3161			if (trim_mask & AT_MODE) {
3162				/*
3163				 * Save the mode, as secpolicy_vnode_setattr()
3164				 * will overwrite it with ova.va_mode.
3165				 */
3166				saved_mode = vap->va_mode;
3167			}
3168		}
3169		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3170		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3171		if (err) {
3172			ZFS_EXIT(zfsvfs);
3173			return (err);
3174		}
3175
3176		if (trim_mask) {
3177			vap->va_mask |= saved_mask;
3178			if (trim_mask & AT_MODE) {
3179				/*
3180				 * Recover the mode after
3181				 * secpolicy_vnode_setattr().
3182				 */
3183				vap->va_mode = saved_mode;
3184			}
3185		}
3186	}
3187
3188	/*
3189	 * secpolicy_vnode_setattr, or take ownership may have
3190	 * changed va_mask
3191	 */
3192	mask = vap->va_mask;
3193
3194	if ((mask & (AT_UID | AT_GID))) {
3195		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3196		    &xattr_obj, sizeof (xattr_obj));
3197
3198		if (err == 0 && xattr_obj) {
3199			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3200			if (err == 0) {
3201				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3202				if (err != 0)
3203					vrele(ZTOV(attrzp));
3204			}
3205			if (err)
3206				goto out2;
3207		}
3208		if (mask & AT_UID) {
3209			new_uid = zfs_fuid_create(zfsvfs,
3210			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3211			if (new_uid != zp->z_uid &&
3212			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3213				if (attrzp)
3214					vput(ZTOV(attrzp));
3215				err = SET_ERROR(EDQUOT);
3216				goto out2;
3217			}
3218		}
3219
3220		if (mask & AT_GID) {
3221			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3222			    cr, ZFS_GROUP, &fuidp);
3223			if (new_gid != zp->z_gid &&
3224			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3225				if (attrzp)
3226					vput(ZTOV(attrzp));
3227				err = SET_ERROR(EDQUOT);
3228				goto out2;
3229			}
3230		}
3231	}
3232	tx = dmu_tx_create(zfsvfs->z_os);
3233
3234	if (mask & AT_MODE) {
3235		uint64_t pmode = zp->z_mode;
3236		uint64_t acl_obj;
3237		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3238
3239		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3240		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3241			err = SET_ERROR(EPERM);
3242			goto out;
3243		}
3244
3245		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3246			goto out;
3247
3248		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3249			/*
3250			 * Are we upgrading ACL from old V0 format
3251			 * to V1 format?
3252			 */
3253			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3254			    zfs_znode_acl_version(zp) ==
3255			    ZFS_ACL_VERSION_INITIAL) {
3256				dmu_tx_hold_free(tx, acl_obj, 0,
3257				    DMU_OBJECT_END);
3258				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3259				    0, aclp->z_acl_bytes);
3260			} else {
3261				dmu_tx_hold_write(tx, acl_obj, 0,
3262				    aclp->z_acl_bytes);
3263			}
3264		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3265			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3266			    0, aclp->z_acl_bytes);
3267		}
3268		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3269	} else {
3270		if ((mask & AT_XVATTR) &&
3271		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3272			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3273		else
3274			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3275	}
3276
3277	if (attrzp) {
3278		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3279	}
3280
3281	fuid_dirtied = zfsvfs->z_fuid_dirty;
3282	if (fuid_dirtied)
3283		zfs_fuid_txhold(zfsvfs, tx);
3284
3285	zfs_sa_upgrade_txholds(tx, zp);
3286
3287	err = dmu_tx_assign(tx, TXG_WAIT);
3288	if (err)
3289		goto out;
3290
3291	count = 0;
3292	/*
3293	 * Set each attribute requested.
3294	 * We group settings according to the locks they need to acquire.
3295	 *
3296	 * Note: you cannot set ctime directly, although it will be
3297	 * updated as a side-effect of calling this function.
3298	 */
3299
3300	if (mask & (AT_UID|AT_GID|AT_MODE))
3301		mutex_enter(&zp->z_acl_lock);
3302
3303	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3304	    &zp->z_pflags, sizeof (zp->z_pflags));
3305
3306	if (attrzp) {
3307		if (mask & (AT_UID|AT_GID|AT_MODE))
3308			mutex_enter(&attrzp->z_acl_lock);
3309		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3310		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3311		    sizeof (attrzp->z_pflags));
3312	}
3313
3314	if (mask & (AT_UID|AT_GID)) {
3315
3316		if (mask & AT_UID) {
3317			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3318			    &new_uid, sizeof (new_uid));
3319			zp->z_uid = new_uid;
3320			if (attrzp) {
3321				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3322				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3323				    sizeof (new_uid));
3324				attrzp->z_uid = new_uid;
3325			}
3326		}
3327
3328		if (mask & AT_GID) {
3329			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3330			    NULL, &new_gid, sizeof (new_gid));
3331			zp->z_gid = new_gid;
3332			if (attrzp) {
3333				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3334				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3335				    sizeof (new_gid));
3336				attrzp->z_gid = new_gid;
3337			}
3338		}
3339		if (!(mask & AT_MODE)) {
3340			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3341			    NULL, &new_mode, sizeof (new_mode));
3342			new_mode = zp->z_mode;
3343		}
3344		err = zfs_acl_chown_setattr(zp);
3345		ASSERT(err == 0);
3346		if (attrzp) {
3347			err = zfs_acl_chown_setattr(attrzp);
3348			ASSERT(err == 0);
3349		}
3350	}
3351
3352	if (mask & AT_MODE) {
3353		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3354		    &new_mode, sizeof (new_mode));
3355		zp->z_mode = new_mode;
3356		ASSERT3U((uintptr_t)aclp, !=, 0);
3357		err = zfs_aclset_common(zp, aclp, cr, tx);
3358		ASSERT0(err);
3359		if (zp->z_acl_cached)
3360			zfs_acl_free(zp->z_acl_cached);
3361		zp->z_acl_cached = aclp;
3362		aclp = NULL;
3363	}
3364
3365
3366	if (mask & AT_ATIME) {
3367		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3368		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3369		    &zp->z_atime, sizeof (zp->z_atime));
3370	}
3371
3372	if (mask & AT_MTIME) {
3373		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3374		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3375		    mtime, sizeof (mtime));
3376	}
3377
3378	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3379	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3380		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3381		    NULL, mtime, sizeof (mtime));
3382		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3383		    &ctime, sizeof (ctime));
3384		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3385		    B_TRUE);
3386	} else if (mask != 0) {
3387		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3388		    &ctime, sizeof (ctime));
3389		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3390		    B_TRUE);
3391		if (attrzp) {
3392			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3393			    SA_ZPL_CTIME(zfsvfs), NULL,
3394			    &ctime, sizeof (ctime));
3395			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3396			    mtime, ctime, B_TRUE);
3397		}
3398	}
3399	/*
3400	 * Do this after setting timestamps to prevent timestamp
3401	 * update from toggling bit
3402	 */
3403
3404	if (xoap && (mask & AT_XVATTR)) {
3405
3406		/*
3407		 * restore trimmed off masks
3408		 * so that return masks can be set for caller.
3409		 */
3410
3411		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3412			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3413		}
3414		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3415			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3416		}
3417		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3418			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3419		}
3420		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3421			XVA_SET_REQ(xvap, XAT_NODUMP);
3422		}
3423		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3424			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3425		}
3426		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3427			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3428		}
3429
3430		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3431			ASSERT(vp->v_type == VREG);
3432
3433		zfs_xvattr_set(zp, xvap, tx);
3434	}
3435
3436	if (fuid_dirtied)
3437		zfs_fuid_sync(zfsvfs, tx);
3438
3439	if (mask != 0)
3440		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3441
3442	if (mask & (AT_UID|AT_GID|AT_MODE))
3443		mutex_exit(&zp->z_acl_lock);
3444
3445	if (attrzp) {
3446		if (mask & (AT_UID|AT_GID|AT_MODE))
3447			mutex_exit(&attrzp->z_acl_lock);
3448	}
3449out:
3450	if (err == 0 && attrzp) {
3451		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3452		    xattr_count, tx);
3453		ASSERT(err2 == 0);
3454	}
3455
3456	if (attrzp)
3457		vput(ZTOV(attrzp));
3458
3459	if (aclp)
3460		zfs_acl_free(aclp);
3461
3462	if (fuidp) {
3463		zfs_fuid_info_free(fuidp);
3464		fuidp = NULL;
3465	}
3466
3467	if (err) {
3468		dmu_tx_abort(tx);
3469	} else {
3470		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3471		dmu_tx_commit(tx);
3472	}
3473
3474out2:
3475	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3476		zil_commit(zilog, 0);
3477
3478	ZFS_EXIT(zfsvfs);
3479	return (err);
3480}
3481
3482/*
3483 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3484 * fail to acquire any lock in the path we will drop all held locks,
3485 * acquire the new lock in a blocking fashion, and then release it and
3486 * restart the rename.  This acquire/release step ensures that we do not
3487 * spin on a lock waiting for release.  On error release all vnode locks
3488 * and decrement references the way tmpfs_rename() would do.
 *
 * ("fdvp" corresponds to the sdvp argument here: it is the only vnode
 * below that is locked without LK_NOWAIT.)
 *
 *	IN:	sdvp	- source directory vnode.
 *		tdvp	- target directory vnode; it is unlocked first thing,
 *			  so it is presumably locked by the caller on entry.
 *		scnp	- component name of the source entry.
 *		tcnp	- component name of the target entry.
 *
 *	IN/OUT:	svpp	- source vnode; on success replaced by the vnode the
 *			  source name currently resolves to in sdvp.
 *		tvpp	- target vnode or NULL; on success replaced by the
 *			  vnode the target name currently resolves to in
 *			  tdvp, or NULL if there is none.
 *
 *	RETURN:	0 with sdvp, tdvp, *svpp and (if non-NULL) *tvpp locked
 *		exclusively; otherwise an error code with none of the
 *		vnodes locked.
3489 */
3490static int
3491zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3492    struct vnode *tdvp, struct vnode **tvpp,
3493    const struct componentname *scnp, const struct componentname *tcnp)
3494{
3495	zfsvfs_t	*zfsvfs;
3496	struct vnode	*nvp, *svp, *tvp;
3497	znode_t		*sdzp, *tdzp, *szp, *tzp;
3498	const char	*snm = scnp->cn_nameptr;
3499	const char	*tnm = tcnp->cn_nameptr;
3500	int error;
3501
	/*
	 * Start from a state where this function holds no vnode locks, so
	 * that every "goto relock" below restarts from the same point.
	 */
3502	VOP_UNLOCK(tdvp, 0);
3503	if (*tvpp != NULL && *tvpp != tdvp)
3504		VOP_UNLOCK(*tvpp, 0);
3505
3506relock:
	/* The source directory is the one lock we are willing to sleep on. */
3507	error = vn_lock(sdvp, LK_EXCLUSIVE);
3508	if (error)
3509		goto out;
3510	sdzp = VTOZ(sdvp);
3511
3512	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3513	if (error != 0) {
3514		VOP_UNLOCK(sdvp, 0);
3515		if (error != EBUSY)
3516			goto out;
		/*
		 * Contended: wait for tdvp with nothing else held, then
		 * drop it again and retry the whole sequence.
		 */
3517		error = vn_lock(tdvp, LK_EXCLUSIVE);
3518		if (error)
3519			goto out;
3520		VOP_UNLOCK(tdvp, 0);
3521		goto relock;
3522	}
3523	tdzp = VTOZ(tdvp);
3524
3525	/*
3526	 * Before using sdzp and tdzp we must ensure that they are live.
3527	 * As a porting legacy from illumos we have two things to worry
3528	 * about.  One is typical for FreeBSD and it is that the vnode is
3529	 * not reclaimed (doomed).  The other is that the znode is live.
3530	 * The current code can invalidate the znode without acquiring the
3531	 * corresponding vnode lock if the object represented by the znode
3532	 * and vnode is no longer valid after a rollback or receive operation.
3533	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3534	 * that protects the znodes from the invalidation.
3535	 */
3536	zfsvfs = sdzp->z_zfsvfs;
3537	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3538	ZFS_ENTER(zfsvfs);
3539
3540	/*
3541	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3542	 * bypassing the cleanup code in the case of an error.
3543	 */
3544	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3545		ZFS_EXIT(zfsvfs);
3546		VOP_UNLOCK(sdvp, 0);
3547		VOP_UNLOCK(tdvp, 0);
3548		error = SET_ERROR(EIO);
3549		goto out;
3550	}
3551
3552	/*
3553	 * Re-resolve svp to be certain it still exists and fetch the
3554	 * correct vnode.
3555	 */
3556	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
3557	if (error != 0) {
3558		/* Source entry invalid or not there. */
3559		ZFS_EXIT(zfsvfs);
3560		VOP_UNLOCK(sdvp, 0);
3561		VOP_UNLOCK(tdvp, 0);
		/* A source name of "." or ".." is reported as EINVAL. */
3562		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
3563		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
3564			error = SET_ERROR(EINVAL);
3565		goto out;
3566	}
3567	svp = ZTOV(szp);
3568
3569	/*
3570	 * Re-resolve tvp, if it disappeared we just carry on.
3571	 */
3572	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
3573	if (error != 0) {
3574		ZFS_EXIT(zfsvfs);
3575		VOP_UNLOCK(sdvp, 0);
3576		VOP_UNLOCK(tdvp, 0);
		/* Drop the source vnode obtained by the lookup above. */
3577		vrele(svp);
3578		if ((tcnp->cn_flags & ISDOTDOT) != 0)
3579			error = SET_ERROR(EINVAL);
3580		goto out;
3581	}
3582	if (tzp != NULL)
3583		tvp = ZTOV(tzp);
3584	else
3585		tvp = NULL;
3586
3587	/*
3588	 * At present the vnode locks must be acquired before z_teardown_lock,
3589	 * although it would be more logical to use the opposite order.
3590	 */
3591	ZFS_EXIT(zfsvfs);
3592
3593	/*
3594	 * Now try acquire locks on svp and tvp.
3595	 */
3596	nvp = svp;
3597	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3598	if (error != 0) {
3599		VOP_UNLOCK(sdvp, 0);
3600		VOP_UNLOCK(tdvp, 0);
		/* tvp will be looked up again after the restart. */
3601		if (tvp != NULL)
3602			vrele(tvp);
3603		if (error != EBUSY) {
3604			vrele(nvp);
3605			goto out;
3606		}
3607		error = vn_lock(nvp, LK_EXCLUSIVE);
3608		if (error != 0) {
3609			vrele(nvp);
3610			goto out;
3611		}
3612		VOP_UNLOCK(nvp, 0);
3613		/*
3614		 * Concurrent rename race.
3615		 * XXX ?
3616		 */
3617		if (nvp == tdvp) {
3618			vrele(nvp);
3619			error = SET_ERROR(EINVAL);
3620			goto out;
3621		}
		/*
		 * Hand the freshly looked-up source vnode back to the caller
		 * in place of the old one before retrying.
		 */
3622		vrele(*svpp);
3623		*svpp = nvp;
3624		goto relock;
3625	}
	/* Source vnode is locked: publish it in place of the caller's. */
3626	vrele(*svpp);
3627	*svpp = nvp;
3628
	/*
	 * Release the caller's old target vnode; *tvpp stays NULL unless
	 * the re-resolved target is successfully locked below.
	 */
3629	if (*tvpp != NULL)
3630		vrele(*tvpp);
3631	*tvpp = NULL;
3632	if (tvp != NULL) {
3633		nvp = tvp;
3634		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3635		if (error != 0) {
3636			VOP_UNLOCK(sdvp, 0);
3637			VOP_UNLOCK(tdvp, 0);
3638			VOP_UNLOCK(*svpp, 0);
3639			if (error != EBUSY) {
3640				vrele(nvp);
3641				goto out;
3642			}
3643			error = vn_lock(nvp, LK_EXCLUSIVE);
3644			if (error != 0) {
3645				vrele(nvp);
3646				goto out;
3647			}
			/* Unlock and release; the restart looks it up again. */
3648			vput(nvp);
3649			goto relock;
3650		}
3651		*tvpp = nvp;
3652	}
3653
3654	return (0);
3655
3656out:
3657	return (error);
3658}
3659
3660/*
3661 * Note that we must use VRELE_ASYNC in this function as it walks
3662 * up the directory tree and vrele may need to acquire an exclusive
3663 * lock if a last reference to a vnode is dropped.
3664 */
3665static int
3666zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3667{
3668	zfsvfs_t	*zfsvfs;
3669	znode_t		*zp, *zp1;
3670	uint64_t	parent;
3671	int		error;
3672
3673	zfsvfs = tdzp->z_zfsvfs;
3674	if (tdzp == szp)
3675		return (SET_ERROR(EINVAL));
3676	if (tdzp == sdzp)
3677		return (0);
3678	if (tdzp->z_id == zfsvfs->z_root)
3679		return (0);
3680	zp = tdzp;
3681	for (;;) {
3682		ASSERT(!zp->z_unlinked);
3683		if ((error = sa_lookup(zp->z_sa_hdl,
3684		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
3685			break;
3686
3687		if (parent == szp->z_id) {
3688			error = SET_ERROR(EINVAL);
3689			break;
3690		}
3691		if (parent == zfsvfs->z_root)
3692			break;
3693		if (parent == sdzp->z_id)
3694			break;
3695
3696		error = zfs_zget(zfsvfs, parent, &zp1);
3697		if (error != 0)
3698			break;
3699
3700		if (zp != tdzp)
3701			VN_RELE_ASYNC(ZTOV(zp),
3702			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3703		zp = zp1;
3704	}
3705
3706	if (error == ENOTDIR)
3707		panic("checkpath: .. not a directory\n");
3708	if (zp != tdzp)
3709		VN_RELE_ASYNC(ZTOV(zp),
3710		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3711	return (error);
3712}
3713
3714/*
3715 * Move an entry from the provided source directory to the target
3716 * directory.  Change the entry name as indicated.
3717 *
3718 *	IN:	sdvp	- Source directory containing the "old entry".
3719 *		svpp	- Pointer to the vnode of the "old entry".
3720 *		scnp	- Component name of the old entry (cn_nameptr is snm).
3721 *		tdvp	- Target directory to contain the "new entry".
3722 *		tvpp	- Pointer to the existing target vnode, or to NULL.
3723 *		tcnp	- Component name of the new entry (cn_nameptr is tnm).
3724 *		cr	- credentials of caller.
3725 *
3726 *	RETURN:	0 on success, error code on failure.
3727 *
3728 * Timestamps:
3729 *	sdvp,tdvp - ctime|mtime updated
 *
 * Locking: tdvp and *tvpp are unlocked on every path that returns from
 * the "out" label.  Paths that pass through "unlockout" additionally
 * unlock *svpp and sdvp, which zfs_rename_relock() locked.  No vnode
 * references are released in this function itself; *svpp and *tvpp may
 * be replaced by zfs_rename_relock().
3730 */
3731/*ARGSUSED*/
3732static int
3733zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3734    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3735    cred_t *cr)
3736{
3737	zfsvfs_t	*zfsvfs;
3738	znode_t		*sdzp, *tdzp, *szp, *tzp;
3739	zilog_t		*zilog = NULL;
3740	dmu_tx_t	*tx;
3741	char		*snm = scnp->cn_nameptr;
3742	char		*tnm = tcnp->cn_nameptr;
3743	int		error = 0;
3744
3745	/* Reject renames across filesystems. */
3746	if ((*svpp)->v_mount != tdvp->v_mount ||
3747	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3748		error = SET_ERROR(EXDEV);
3749		goto out;
3750	}
3751
3752	if (zfsctl_is_node(tdvp)) {
3753		error = SET_ERROR(EXDEV);
3754		goto out;
3755	}
3756
3757	/*
3758	 * Lock all four vnodes to ensure safety and semantics of renaming.
3759	 */
3760	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3761	if (error != 0) {
3762		/* no vnodes are locked in the case of error here */
3763		return (error);
3764	}
3765
3766	tdzp = VTOZ(tdvp);
3767	sdzp = VTOZ(sdvp);
3768	zfsvfs = tdzp->z_zfsvfs;
3769	zilog = zfsvfs->z_log;
3770
3771	/*
3772	 * After we re-enter ZFS_ENTER() we will have to revalidate all
3773	 * znodes involved.
3774	 */
3775	ZFS_ENTER(zfsvfs);
3776
	/*
	 * u8_validate() stores its own status through &error; on failure
	 * that value is replaced with EILSEQ.
	 */
3777	if (zfsvfs->z_utf8 && u8_validate(tnm,
3778	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3779		error = SET_ERROR(EILSEQ);
3780		goto unlockout;
3781	}
3782
3783	/* If source and target are the same file, there is nothing to do. */
	/*
	 * NOTE(review): in this case "unlockout" unlocks *svpp and "out"
	 * unlocks *tvpp, i.e. the same vnode twice -- confirm that the lock
	 * is held recursively when zfs_rename_relock() returns the same
	 * vnode for both.
	 */
3784	if ((*svpp) == (*tvpp)) {
3785		error = 0;
3786		goto unlockout;
3787	}
3788
	/* Neither the source nor the target may be a mount point. */
3789	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3790	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3791	    (*tvpp)->v_mountedhere != NULL)) {
3792		error = SET_ERROR(EXDEV);
3793		goto unlockout;
3794	}
3795
3796	/*
3797	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3798	 * bypassing the cleanup code in the case of an error.
3799	 */
3800	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3801		error = SET_ERROR(EIO);
3802		goto unlockout;
3803	}
3804
3805	szp = VTOZ(*svpp);
3806	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3807	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
3808		error = SET_ERROR(EIO);
3809		goto unlockout;
3810	}
3811
3812	/*
3813	 * This is to prevent the creation of links into attribute space
3814	 * by renaming a linked file into/outof an attribute directory.
3815	 * See the comment in zfs_link() for why this is considered bad.
3816	 */
3817	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3818		error = SET_ERROR(EINVAL);
3819		goto unlockout;
3820	}
3821
3822	/*
3823	 * Must have write access at the source to remove the old entry
3824	 * and write access at the target to create the new entry.
3825	 * Note that if target and source are the same, this can be
3826	 * done in a single check.
3827	 */
3828	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3829		goto unlockout;
3830
3831	if ((*svpp)->v_type == VDIR) {
3832		/*
3833		 * Avoid ".", "..", and aliases of "." for obvious reasons.
3834		 */
		/*
		 * NOTE(review): this is the only error assignment in the
		 * function that is not wrapped in SET_ERROR().
		 */
3835		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3836		    sdzp == szp ||
3837		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3838			error = EINVAL;
3839			goto unlockout;
3840		}
3841
3842		/*
3843		 * Check to make sure rename is valid.
3844		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3845		 */
3846		if (error = zfs_rename_check(szp, sdzp, tdzp))
3847			goto unlockout;
3848	}
3849
3850	/*
3851	 * Does target exist?
3852	 */
3853	if (tzp) {
3854		/*
3855		 * Source and target must be the same type.
3856		 */
3857		if ((*svpp)->v_type == VDIR) {
3858			if ((*tvpp)->v_type != VDIR) {
3859				error = SET_ERROR(ENOTDIR);
3860				goto unlockout;
3861			} else {
3862				cache_purge(tdvp);
3863				if (sdvp != tdvp)
3864					cache_purge(sdvp);
3865			}
3866		} else {
3867			if ((*tvpp)->v_type == VDIR) {
3868				error = SET_ERROR(EISDIR);
3869				goto unlockout;
3870			}
3871		}
3872	}
3873
	/*
	 * NOTE(review): "ct" is neither a parameter nor a local of this
	 * function; presumably the vnevent_*() calls expand to nothing on
	 * this platform -- confirm.
	 */
3874	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3875	if (tzp)
3876		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3877
3878	/*
3879	 * notify the target directory if it is not the same
3880	 * as source directory.
3881	 */
3882	if (tdvp != sdvp) {
3883		vnevent_rename_dest_dir(tdvp, ct);
3884	}
3885
	/*
	 * Declare everything the transaction may touch: the SA of the
	 * source znode and of both directories, both directory ZAPs, the
	 * existing target (if any) and the unlinked-object ZAP.
	 */
3886	tx = dmu_tx_create(zfsvfs->z_os);
3887	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3888	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3889	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3890	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3891	if (sdzp != tdzp) {
3892		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3893		zfs_sa_upgrade_txholds(tx, tdzp);
3894	}
3895	if (tzp) {
3896		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3897		zfs_sa_upgrade_txholds(tx, tzp);
3898	}
3899
3900	zfs_sa_upgrade_txholds(tx, szp);
3901	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3902	error = dmu_tx_assign(tx, TXG_WAIT);
3903	if (error) {
3904		dmu_tx_abort(tx);
3905		goto unlockout;
3906	}
3907
3908
	/* error is 0 here, so a missing target falls straight through. */
3909	if (tzp)	/* Attempt to remove the existing target */
3910		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3911
3912	if (error == 0) {
3913		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3914		if (error == 0) {
3915			szp->z_pflags |= ZFS_AV_MODIFIED;
3916
3917			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3918			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3919			ASSERT0(error);
3920
3921			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3922			    NULL);
3923			if (error == 0) {
3924				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3925				    snm, tdzp, tnm, szp);
3926
3927				/*
3928				 * Update path information for the target vnode
3929				 */
3930				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
3931			} else {
3932				/*
3933				 * At this point, we have successfully created
3934				 * the target name, but have failed to remove
3935				 * the source name.  Since the create was done
3936				 * with the ZRENAMING flag, there are
3937				 * complications; for one, the link count is
3938				 * wrong.  The easiest way to deal with this
3939				 * is to remove the newly created target, and
3940				 * return the original error.  This must
3941				 * succeed; fortunately, it is very unlikely to
3942				 * fail, since we just created it.
3943				 */
3944				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
3945				    ZRENAMING, NULL), ==, 0);
3946			}
3947		}
		/* On success drop name cache entries for the moved vnodes. */
3948		if (error == 0) {
3949			cache_purge(*svpp);
3950			if (*tvpp != NULL)
3951				cache_purge(*tvpp);
3952			cache_purge_negative(tdvp);
3953		}
3954	}
3955
	/* The transaction is committed whether or not the steps succeeded. */
3956	dmu_tx_commit(tx);
3957
3958unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
3959	ZFS_EXIT(zfsvfs);
3960	VOP_UNLOCK(*svpp, 0);
3961	VOP_UNLOCK(sdvp, 0);
3962
3963out:				/* original two vnodes are locked */
	/*
	 * The early "goto out" paths arrive with zfsvfs still unset; they
	 * all carry a non-zero error, so the "error == 0 &&" test keeps
	 * zfsvfs from being dereferenced in that case.
	 */
3964	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3965		zil_commit(zilog, 0);
3966
	/* Do not unlock the same vnode twice when *tvpp is tdvp. */
3967	if (*tvpp != NULL)
3968		VOP_UNLOCK(*tvpp, 0);
3969	if (tdvp != *tvpp)
3970		VOP_UNLOCK(tdvp, 0);
3971	return (error);
3972}
3973
3974/*
3975 * Insert the indicated symbolic reference entry into the directory.
3976 *
3977 *	IN:	dvp	- Directory to contain new symbolic link.
3978 *		link	- Name for new symlink entry.
3979 *		vap	- Attributes of new entry.
3980 *		cr	- credentials of caller.
3981 *		ct	- caller context
3982 *		flags	- case flags
3983 *
3984 *	RETURN:	0 on success, error code on failure.
3985 *
3986 * Timestamps:
3987 *	dvp - ctime|mtime updated
3988 */
3989/*ARGSUSED*/
3990static int
3991zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3992    cred_t *cr, kthread_t *td)
3993{
3994	znode_t		*zp, *dzp = VTOZ(dvp);
3995	dmu_tx_t	*tx;
3996	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3997	zilog_t		*zilog;
3998	uint64_t	len = strlen(link);
3999	int		error;
4000	zfs_acl_ids_t	acl_ids;
4001	boolean_t	fuid_dirtied;
4002	uint64_t	txtype = TX_SYMLINK;
4003	int		flags = 0;
4004
4005	ASSERT(vap->va_type == VLNK);
4006
4007	ZFS_ENTER(zfsvfs);
4008	ZFS_VERIFY_ZP(dzp);
4009	zilog = zfsvfs->z_log;
4010
4011	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4012	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4013		ZFS_EXIT(zfsvfs);
4014		return (SET_ERROR(EILSEQ));
4015	}
4016
4017	if (len > MAXPATHLEN) {
4018		ZFS_EXIT(zfsvfs);
4019		return (SET_ERROR(ENAMETOOLONG));
4020	}
4021
4022	if ((error = zfs_acl_ids_create(dzp, 0,
4023	    vap, cr, NULL, &acl_ids)) != 0) {
4024		ZFS_EXIT(zfsvfs);
4025		return (error);
4026	}
4027
4028	/*
4029	 * Attempt to lock directory; fail if entry already exists.
4030	 */
4031	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4032	if (error) {
4033		zfs_acl_ids_free(&acl_ids);
4034		ZFS_EXIT(zfsvfs);
4035		return (error);
4036	}
4037
4038	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4039		zfs_acl_ids_free(&acl_ids);
4040		ZFS_EXIT(zfsvfs);
4041		return (error);
4042	}
4043
4044	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4045		zfs_acl_ids_free(&acl_ids);
4046		ZFS_EXIT(zfsvfs);
4047		return (SET_ERROR(EDQUOT));
4048	}
4049
4050	getnewvnode_reserve(1);
4051	tx = dmu_tx_create(zfsvfs->z_os);
4052	fuid_dirtied = zfsvfs->z_fuid_dirty;
4053	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4054	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4055	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4056	    ZFS_SA_BASE_ATTR_SIZE + len);
4057	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4058	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4059		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4060		    acl_ids.z_aclp->z_acl_bytes);
4061	}
4062	if (fuid_dirtied)
4063		zfs_fuid_txhold(zfsvfs, tx);
4064	error = dmu_tx_assign(tx, TXG_WAIT);
4065	if (error) {
4066		zfs_acl_ids_free(&acl_ids);
4067		dmu_tx_abort(tx);
4068		getnewvnode_drop_reserve();
4069		ZFS_EXIT(zfsvfs);
4070		return (error);
4071	}
4072
4073	/*
4074	 * Create a new object for the symlink.
4075	 * for version 4 ZPL datsets the symlink will be an SA attribute
4076	 */
4077	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4078
4079	if (fuid_dirtied)
4080		zfs_fuid_sync(zfsvfs, tx);
4081
4082	if (zp->z_is_sa)
4083		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4084		    link, len, tx);
4085	else
4086		zfs_sa_symlink(zp, link, len, tx);
4087
4088	zp->z_size = len;
4089	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4090	    &zp->z_size, sizeof (zp->z_size), tx);
4091	/*
4092	 * Insert the new object into the directory.
4093	 */
4094	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4095
4096	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4097	*vpp = ZTOV(zp);
4098
4099	zfs_acl_ids_free(&acl_ids);
4100
4101	dmu_tx_commit(tx);
4102
4103	getnewvnode_drop_reserve();
4104
4105	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4106		zil_commit(zilog, 0);
4107
4108	ZFS_EXIT(zfsvfs);
4109	return (error);
4110}
4111
4112/*
4113 * Return, in the buffer contained in the provided uio structure,
4114 * the symbolic path referred to by vp.
4115 *
4116 *	IN:	vp	- vnode of symbolic link.
4117 *		uio	- structure to contain the link path.
4118 *		cr	- credentials of caller.
4119 *		ct	- caller context
4120 *
4121 *	OUT:	uio	- structure containing the link path.
4122 *
4123 *	RETURN:	0 on success, error code on failure.
4124 *
4125 * Timestamps:
4126 *	vp - atime updated
4127 */
4128/* ARGSUSED */
4129static int
4130zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4131{
4132	znode_t		*zp = VTOZ(vp);
4133	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4134	int		error;
4135
4136	ZFS_ENTER(zfsvfs);
4137	ZFS_VERIFY_ZP(zp);
4138
4139	if (zp->z_is_sa)
4140		error = sa_lookup_uio(zp->z_sa_hdl,
4141		    SA_ZPL_SYMLINK(zfsvfs), uio);
4142	else
4143		error = zfs_sa_readlink(zp, uio);
4144
4145	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4146
4147	ZFS_EXIT(zfsvfs);
4148	return (error);
4149}
4150
4151/*
4152 * Insert a new entry into directory tdvp referencing svp.
4153 *
4154 *	IN:	tdvp	- Directory to contain new entry.
4155 *		svp	- vnode of new entry.
4156 *		name	- name of new entry.
4157 *		cr	- credentials of caller.
4158 *		ct	- caller context
4159 *
4160 *	RETURN:	0 on success, error code on failure.
4161 *
4162 * Timestamps:
4163 *	tdvp - ctime|mtime updated
4164 *	 svp - ctime updated
4165 */
4166/* ARGSUSED */
4167static int
4168zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4169    caller_context_t *ct, int flags)
4170{
4171	znode_t		*dzp = VTOZ(tdvp);
4172	znode_t		*tzp, *szp;
4173	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4174	zilog_t		*zilog;
4175	dmu_tx_t	*tx;
4176	int		error;
4177	uint64_t	parent;
4178	uid_t		owner;
4179
4180	ASSERT(tdvp->v_type == VDIR);
4181
4182	ZFS_ENTER(zfsvfs);
4183	ZFS_VERIFY_ZP(dzp);
4184	zilog = zfsvfs->z_log;
4185
4186	/*
4187	 * POSIX dictates that we return EPERM here.
4188	 * Better choices include ENOTSUP or EISDIR.
4189	 */
4190	if (svp->v_type == VDIR) {
4191		ZFS_EXIT(zfsvfs);
4192		return (SET_ERROR(EPERM));
4193	}
4194
4195	szp = VTOZ(svp);
4196	ZFS_VERIFY_ZP(szp);
4197
4198	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4199		ZFS_EXIT(zfsvfs);
4200		return (SET_ERROR(EPERM));
4201	}
4202
4203	/* Prevent links to .zfs/shares files */
4204
4205	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4206	    &parent, sizeof (uint64_t))) != 0) {
4207		ZFS_EXIT(zfsvfs);
4208		return (error);
4209	}
4210	if (parent == zfsvfs->z_shares_dir) {
4211		ZFS_EXIT(zfsvfs);
4212		return (SET_ERROR(EPERM));
4213	}
4214
4215	if (zfsvfs->z_utf8 && u8_validate(name,
4216	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4217		ZFS_EXIT(zfsvfs);
4218		return (SET_ERROR(EILSEQ));
4219	}
4220
4221	/*
4222	 * We do not support links between attributes and non-attributes
4223	 * because of the potential security risk of creating links
4224	 * into "normal" file space in order to circumvent restrictions
4225	 * imposed in attribute space.
4226	 */
4227	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4228		ZFS_EXIT(zfsvfs);
4229		return (SET_ERROR(EINVAL));
4230	}
4231
4232
4233	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4234	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4235		ZFS_EXIT(zfsvfs);
4236		return (SET_ERROR(EPERM));
4237	}
4238
4239	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4240		ZFS_EXIT(zfsvfs);
4241		return (error);
4242	}
4243
4244	/*
4245	 * Attempt to lock directory; fail if entry already exists.
4246	 */
4247	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4248	if (error) {
4249		ZFS_EXIT(zfsvfs);
4250		return (error);
4251	}
4252
4253	tx = dmu_tx_create(zfsvfs->z_os);
4254	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4255	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4256	zfs_sa_upgrade_txholds(tx, szp);
4257	zfs_sa_upgrade_txholds(tx, dzp);
4258	error = dmu_tx_assign(tx, TXG_WAIT);
4259	if (error) {
4260		dmu_tx_abort(tx);
4261		ZFS_EXIT(zfsvfs);
4262		return (error);
4263	}
4264
4265	error = zfs_link_create(dzp, name, szp, tx, 0);
4266
4267	if (error == 0) {
4268		uint64_t txtype = TX_LINK;
4269		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4270	}
4271
4272	dmu_tx_commit(tx);
4273
4274	if (error == 0) {
4275		vnevent_link(svp, ct);
4276	}
4277
4278	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4279		zil_commit(zilog, 0);
4280
4281	ZFS_EXIT(zfsvfs);
4282	return (error);
4283}
4284
4285
4286/*ARGSUSED*/
4287void
4288zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4289{
4290	znode_t	*zp = VTOZ(vp);
4291	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4292	int error;
4293
4294	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4295	if (zp->z_sa_hdl == NULL) {
4296		/*
4297		 * The fs has been unmounted, or we did a
4298		 * suspend/resume and this file no longer exists.
4299		 */
4300		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4301		vrecycle(vp);
4302		return;
4303	}
4304
4305	if (zp->z_unlinked) {
4306		/*
4307		 * Fast path to recycle a vnode of a removed file.
4308		 */
4309		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4310		vrecycle(vp);
4311		return;
4312	}
4313
4314	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4315		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4316
4317		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4318		zfs_sa_upgrade_txholds(tx, zp);
4319		error = dmu_tx_assign(tx, TXG_WAIT);
4320		if (error) {
4321			dmu_tx_abort(tx);
4322		} else {
4323			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4324			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4325			zp->z_atime_dirty = 0;
4326			dmu_tx_commit(tx);
4327		}
4328	}
4329	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4330}
4331
4332
4333CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4334CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4335
4336/*ARGSUSED*/
4337static int
4338zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4339{
4340	znode_t		*zp = VTOZ(vp);
4341	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4342	uint32_t	gen;
4343	uint64_t	gen64;
4344	uint64_t	object = zp->z_id;
4345	zfid_short_t	*zfid;
4346	int		size, i, error;
4347
4348	ZFS_ENTER(zfsvfs);
4349	ZFS_VERIFY_ZP(zp);
4350
4351	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4352	    &gen64, sizeof (uint64_t))) != 0) {
4353		ZFS_EXIT(zfsvfs);
4354		return (error);
4355	}
4356
4357	gen = (uint32_t)gen64;
4358
4359	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4360
4361#ifdef illumos
4362	if (fidp->fid_len < size) {
4363		fidp->fid_len = size;
4364		ZFS_EXIT(zfsvfs);
4365		return (SET_ERROR(ENOSPC));
4366	}
4367#else
4368	fidp->fid_len = size;
4369#endif
4370
4371	zfid = (zfid_short_t *)fidp;
4372
4373	zfid->zf_len = size;
4374
4375	for (i = 0; i < sizeof (zfid->zf_object); i++)
4376		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4377
4378	/* Must have a non-zero generation number to distinguish from .zfs */
4379	if (gen == 0)
4380		gen = 1;
4381	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4382		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4383
4384	if (size == LONG_FID_LEN) {
4385		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4386		zfid_long_t	*zlfid;
4387
4388		zlfid = (zfid_long_t *)fidp;
4389
4390		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4391			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4392
4393		/* XXX - this should be the generation number for the objset */
4394		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4395			zlfid->zf_setgen[i] = 0;
4396	}
4397
4398	ZFS_EXIT(zfsvfs);
4399	return (0);
4400}
4401
4402static int
4403zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4404    caller_context_t *ct)
4405{
4406	znode_t		*zp, *xzp;
4407	zfsvfs_t	*zfsvfs;
4408	int		error;
4409
4410	switch (cmd) {
4411	case _PC_LINK_MAX:
4412		*valp = INT_MAX;
4413		return (0);
4414
4415	case _PC_FILESIZEBITS:
4416		*valp = 64;
4417		return (0);
4418#ifdef illumos
4419	case _PC_XATTR_EXISTS:
4420		zp = VTOZ(vp);
4421		zfsvfs = zp->z_zfsvfs;
4422		ZFS_ENTER(zfsvfs);
4423		ZFS_VERIFY_ZP(zp);
4424		*valp = 0;
4425		error = zfs_dirent_lookup(zp, "", &xzp,
4426		    ZXATTR | ZEXISTS | ZSHARED);
4427		if (error == 0) {
4428			if (!zfs_dirempty(xzp))
4429				*valp = 1;
4430			vrele(ZTOV(xzp));
4431		} else if (error == ENOENT) {
4432			/*
4433			 * If there aren't extended attributes, it's the
4434			 * same as having zero of them.
4435			 */
4436			error = 0;
4437		}
4438		ZFS_EXIT(zfsvfs);
4439		return (error);
4440
4441	case _PC_SATTR_ENABLED:
4442	case _PC_SATTR_EXISTS:
4443		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4444		    (vp->v_type == VREG || vp->v_type == VDIR);
4445		return (0);
4446
4447	case _PC_ACCESS_FILTERING:
4448		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4449		    vp->v_type == VDIR;
4450		return (0);
4451
4452	case _PC_ACL_ENABLED:
4453		*valp = _ACL_ACE_ENABLED;
4454		return (0);
4455#endif	/* illumos */
4456	case _PC_MIN_HOLE_SIZE:
4457		*valp = (int)SPA_MINBLOCKSIZE;
4458		return (0);
4459#ifdef illumos
4460	case _PC_TIMESTAMP_RESOLUTION:
4461		/* nanosecond timestamp resolution */
4462		*valp = 1L;
4463		return (0);
4464#endif
4465	case _PC_ACL_EXTENDED:
4466		*valp = 0;
4467		return (0);
4468
4469	case _PC_ACL_NFS4:
4470		*valp = 1;
4471		return (0);
4472
4473	case _PC_ACL_PATH_MAX:
4474		*valp = ACL_MAX_ENTRIES;
4475		return (0);
4476
4477	default:
4478		return (EOPNOTSUPP);
4479	}
4480}
4481
4482/*ARGSUSED*/
4483static int
4484zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4485    caller_context_t *ct)
4486{
4487	znode_t *zp = VTOZ(vp);
4488	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4489	int error;
4490	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4491
4492	ZFS_ENTER(zfsvfs);
4493	ZFS_VERIFY_ZP(zp);
4494	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4495	ZFS_EXIT(zfsvfs);
4496
4497	return (error);
4498}
4499
4500/*ARGSUSED*/
4501int
4502zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4503    caller_context_t *ct)
4504{
4505	znode_t *zp = VTOZ(vp);
4506	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4507	int error;
4508	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4509	zilog_t	*zilog = zfsvfs->z_log;
4510
4511	ZFS_ENTER(zfsvfs);
4512	ZFS_VERIFY_ZP(zp);
4513
4514	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4515
4516	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4517		zil_commit(zilog, 0);
4518
4519	ZFS_EXIT(zfsvfs);
4520	return (error);
4521}
4522
4523static int
4524ioflags(int ioflags)
4525{
4526	int flags = 0;
4527
4528	if (ioflags & IO_APPEND)
4529		flags |= FAPPEND;
4530	if (ioflags & IO_NDELAY)
4531		flags |= FNONBLOCK;
4532	if (ioflags & IO_SYNC)
4533		flags |= (FSYNC | FDSYNC | FRSYNC);
4534
4535	return (flags);
4536}
4537
/*
 * Page-in handler: fill pages of vp's VM object with file data via dmu_read().
 *
 *	vp	vnode being paged in
 *	m	array of pages handed in by the caller
 *	count	byte count covered by m[]; rounded up to whole pages here
 *	reqpage	index into m[] of the page the caller actually needs
 *
 * Pages of m[] that fall outside the chosen read window are freed.  Returns
 * zfs_vm_pagerret_ok, zfs_vm_pagerret_bad (requested page starts at or past
 * the pager's vnp_size) or zfs_vm_pagerret_error (dmu_read() failed).
 */
static int
zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_page_t mfirst, mlast, mreq;
	vm_object_t object;
	caddr_t va;
	struct sf_buf *sf;
	off_t startoff, endoff;
	int i, error;
	vm_pindex_t reqstart, reqend;
	int pcount, lsize, reqsize, size;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Number of pages in m[]. */
	pcount = OFF_TO_IDX(round_page(count));
	mreq = m[reqpage];
	object = mreq->object;
	error = 0;

	KASSERT(vp->v_object == object, ("mismatching object"));

	/*
	 * Choose the window [reqstart, reqstart + reqsize) of m[] to read.
	 * With several pages and a block size above the page size, the window
	 * is the z_blksz-aligned range around the requested page, clamped to
	 * the pages actually present in m[].  Otherwise only the requested
	 * page is read.
	 */
	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
		reqstart = OFF_TO_IDX(round_page(startoff));
		/* Convert the page index into an index relative to m[0]. */
		if (reqstart < m[0]->pindex)
			reqstart = 0;
		else
			reqstart = reqstart - m[0]->pindex;
		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
		    zp->z_blksz);
		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
		if (reqend > m[pcount - 1]->pindex)
			reqend = m[pcount - 1]->pindex;
		reqsize = reqend - m[reqstart]->pindex + 1;
		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
	} else {
		reqstart = reqpage;
		reqsize = 1;
	}
	mfirst = m[reqstart];
	mlast = m[reqstart + reqsize - 1];

	zfs_vmobject_wlock(object);

	/* Free the pages in front of and behind the window. */
	for (i = 0; i < reqstart; i++) {
		vm_page_lock(m[i]);
		vm_page_free(m[i]);
		vm_page_unlock(m[i]);
	}
	for (i = reqstart + reqsize; i < pcount; i++) {
		vm_page_lock(m[i]);
		vm_page_free(m[i]);
		vm_page_unlock(m[i]);
	}

	/*
	 * Single-page window whose page already has valid bits set: zero
	 * whatever is still invalid and return without reading.
	 */
	if (mreq->valid && reqsize == 1) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		zfs_vmobject_wunlock(object);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_ok);
	}

	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, reqsize);

	/*
	 * The requested page starts at or beyond the pager's recorded size:
	 * free every other page in the window and report a bad request.
	 */
	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
		for (i = reqstart; i < reqstart + reqsize; i++) {
			if (i != reqpage) {
				vm_page_lock(m[i]);
				vm_page_free(m[i]);
				vm_page_unlock(m[i]);
			}
		}
		zfs_vmobject_wunlock(object);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_bad);
	}

	/* Bytes to read into the last page: trimmed at vnp_size. */
	lsize = PAGE_SIZE;
	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);

	/* The object lock is not held across the dmu_read() calls. */
	zfs_vmobject_wunlock(object);

	/*
	 * Map each page in the window, read its data, zero the part of a
	 * short last page that was not read, and stop at the first error.
	 */
	for (i = reqstart; i < reqstart + reqsize; i++) {
		size = PAGE_SIZE;
		if (i == (reqstart + reqsize - 1))
			size = lsize;
		va = zfs_map_page(m[i], &sf);
		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
		    size, va, DMU_READ_PREFETCH);
		if (size != PAGE_SIZE)
			bzero(va + size, PAGE_SIZE - size);
		zfs_unmap_page(sf);
		if (error != 0)
			break;
	}

	zfs_vmobject_wlock(object);

	/*
	 * On success mark every page in the window fully valid.  Pages other
	 * than the requested one are passed to vm_page_readahead_finish()
	 * regardless of the outcome.
	 */
	for (i = reqstart; i < reqstart + reqsize; i++) {
		if (!error)
			m[i]->valid = VM_PAGE_BITS_ALL;
		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
		if (i != reqpage)
			vm_page_readahead_finish(m[i]);
	}

	zfs_vmobject_wunlock(object);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
}
4658
4659static int
4660zfs_freebsd_getpages(ap)
4661	struct vop_getpages_args /* {
4662		struct vnode *a_vp;
4663		vm_page_t *a_m;
4664		int a_count;
4665		int a_reqpage;
4666		vm_ooffset_t a_offset;
4667	} */ *ap;
4668{
4669
4670	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
4671}
4672
4673static int
4674zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4675    int *rtvals)
4676{
4677	znode_t		*zp = VTOZ(vp);
4678	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4679	rl_t		*rl;
4680	dmu_tx_t	*tx;
4681	struct sf_buf	*sf;
4682	vm_object_t	object;
4683	vm_page_t	m;
4684	caddr_t		va;
4685	size_t		tocopy;
4686	size_t		lo_len;
4687	vm_ooffset_t	lo_off;
4688	vm_ooffset_t	off;
4689	uint_t		blksz;
4690	int		ncount;
4691	int		pcount;
4692	int		err;
4693	int		i;
4694
4695	ZFS_ENTER(zfsvfs);
4696	ZFS_VERIFY_ZP(zp);
4697
4698	object = vp->v_object;
4699	pcount = btoc(len);
4700	ncount = pcount;
4701
4702	KASSERT(ma[0]->object == object, ("mismatching object"));
4703	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4704
4705	for (i = 0; i < pcount; i++)
4706		rtvals[i] = zfs_vm_pagerret_error;
4707
4708	off = IDX_TO_OFF(ma[0]->pindex);
4709	blksz = zp->z_blksz;
4710	lo_off = rounddown(off, blksz);
4711	lo_len = roundup(len + (off - lo_off), blksz);
4712	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
4713
4714	zfs_vmobject_wlock(object);
4715	if (len + off > object->un_pager.vnp.vnp_size) {
4716		if (object->un_pager.vnp.vnp_size > off) {
4717			int pgoff;
4718
4719			len = object->un_pager.vnp.vnp_size - off;
4720			ncount = btoc(len);
4721			if ((pgoff = (int)len & PAGE_MASK) != 0) {
4722				/*
4723				 * If the object is locked and the following
4724				 * conditions hold, then the page's dirty
4725				 * field cannot be concurrently changed by a
4726				 * pmap operation.
4727				 */
4728				m = ma[ncount - 1];
4729				vm_page_assert_sbusied(m);
4730				KASSERT(!pmap_page_is_write_mapped(m),
4731				    ("zfs_putpages: page %p is not read-only", m));
4732				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4733				    pgoff);
4734			}
4735		} else {
4736			len = 0;
4737			ncount = 0;
4738		}
4739		if (ncount < pcount) {
4740			for (i = ncount; i < pcount; i++) {
4741				rtvals[i] = zfs_vm_pagerret_bad;
4742			}
4743		}
4744	}
4745	zfs_vmobject_wunlock(object);
4746
4747	if (ncount == 0)
4748		goto out;
4749
4750	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4751	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4752		goto out;
4753	}
4754
4755top:
4756	tx = dmu_tx_create(zfsvfs->z_os);
4757	dmu_tx_hold_write(tx, zp->z_id, off, len);
4758
4759	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4760	zfs_sa_upgrade_txholds(tx, zp);
4761	err = dmu_tx_assign(tx, TXG_NOWAIT);
4762	if (err != 0) {
4763		if (err == ERESTART) {
4764			dmu_tx_wait(tx);
4765			dmu_tx_abort(tx);
4766			goto top;
4767		}
4768		dmu_tx_abort(tx);
4769		goto out;
4770	}
4771
4772	if (zp->z_blksz < PAGE_SIZE) {
4773		i = 0;
4774		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4775			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4776			va = zfs_map_page(ma[i], &sf);
4777			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4778			zfs_unmap_page(sf);
4779		}
4780	} else {
4781		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4782	}
4783
4784	if (err == 0) {
4785		uint64_t mtime[2], ctime[2];
4786		sa_bulk_attr_t bulk[3];
4787		int count = 0;
4788
4789		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4790		    &mtime, 16);
4791		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4792		    &ctime, 16);
4793		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4794		    &zp->z_pflags, 8);
4795		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4796		    B_TRUE);
4797		(void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4798		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4799
4800		zfs_vmobject_wlock(object);
4801		for (i = 0; i < ncount; i++) {
4802			rtvals[i] = zfs_vm_pagerret_ok;
4803			vm_page_undirty(ma[i]);
4804		}
4805		zfs_vmobject_wunlock(object);
4806		PCPU_INC(cnt.v_vnodeout);
4807		PCPU_ADD(cnt.v_vnodepgsout, ncount);
4808	}
4809	dmu_tx_commit(tx);
4810
4811out:
4812	zfs_range_unlock(rl);
4813	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
4814	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4815		zil_commit(zfsvfs->z_log, zp->z_id);
4816	ZFS_EXIT(zfsvfs);
4817	return (rtvals[0]);
4818}
4819
4820int
4821zfs_freebsd_putpages(ap)
4822	struct vop_putpages_args /* {
4823		struct vnode *a_vp;
4824		vm_page_t *a_m;
4825		int a_count;
4826		int a_sync;
4827		int *a_rtvals;
4828		vm_ooffset_t a_offset;
4829	} */ *ap;
4830{
4831
4832	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4833	    ap->a_rtvals));
4834}
4835
4836static int
4837zfs_freebsd_bmap(ap)
4838	struct vop_bmap_args /* {
4839		struct vnode *a_vp;
4840		daddr_t  a_bn;
4841		struct bufobj **a_bop;
4842		daddr_t *a_bnp;
4843		int *a_runp;
4844		int *a_runb;
4845	} */ *ap;
4846{
4847
4848	if (ap->a_bop != NULL)
4849		*ap->a_bop = &ap->a_vp->v_bufobj;
4850	if (ap->a_bnp != NULL)
4851		*ap->a_bnp = ap->a_bn;
4852	if (ap->a_runp != NULL)
4853		*ap->a_runp = 0;
4854	if (ap->a_runb != NULL)
4855		*ap->a_runb = 0;
4856
4857	return (0);
4858}
4859
4860static int
4861zfs_freebsd_open(ap)
4862	struct vop_open_args /* {
4863		struct vnode *a_vp;
4864		int a_mode;
4865		struct ucred *a_cred;
4866		struct thread *a_td;
4867	} */ *ap;
4868{
4869	vnode_t	*vp = ap->a_vp;
4870	znode_t *zp = VTOZ(vp);
4871	int error;
4872
4873	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4874	if (error == 0)
4875		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4876	return (error);
4877}
4878
4879static int
4880zfs_freebsd_close(ap)
4881	struct vop_close_args /* {
4882		struct vnode *a_vp;
4883		int  a_fflag;
4884		struct ucred *a_cred;
4885		struct thread *a_td;
4886	} */ *ap;
4887{
4888
4889	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4890}
4891
4892static int
4893zfs_freebsd_ioctl(ap)
4894	struct vop_ioctl_args /* {
4895		struct vnode *a_vp;
4896		u_long a_command;
4897		caddr_t a_data;
4898		int a_fflag;
4899		struct ucred *cred;
4900		struct thread *td;
4901	} */ *ap;
4902{
4903
4904	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4905	    ap->a_fflag, ap->a_cred, NULL, NULL));
4906}
4907
4908static int
4909zfs_freebsd_read(ap)
4910	struct vop_read_args /* {
4911		struct vnode *a_vp;
4912		struct uio *a_uio;
4913		int a_ioflag;
4914		struct ucred *a_cred;
4915	} */ *ap;
4916{
4917
4918	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4919	    ap->a_cred, NULL));
4920}
4921
4922static int
4923zfs_freebsd_write(ap)
4924	struct vop_write_args /* {
4925		struct vnode *a_vp;
4926		struct uio *a_uio;
4927		int a_ioflag;
4928		struct ucred *a_cred;
4929	} */ *ap;
4930{
4931
4932	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4933	    ap->a_cred, NULL));
4934}
4935
4936static int
4937zfs_freebsd_access(ap)
4938	struct vop_access_args /* {
4939		struct vnode *a_vp;
4940		accmode_t a_accmode;
4941		struct ucred *a_cred;
4942		struct thread *a_td;
4943	} */ *ap;
4944{
4945	vnode_t *vp = ap->a_vp;
4946	znode_t *zp = VTOZ(vp);
4947	accmode_t accmode;
4948	int error = 0;
4949
4950	/*
4951	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4952	 */
4953	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4954	if (accmode != 0)
4955		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4956
4957	/*
4958	 * VADMIN has to be handled by vaccess().
4959	 */
4960	if (error == 0) {
4961		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4962		if (accmode != 0) {
4963			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4964			    zp->z_gid, accmode, ap->a_cred, NULL);
4965		}
4966	}
4967
4968	/*
4969	 * For VEXEC, ensure that at least one execute bit is set for
4970	 * non-directories.
4971	 */
4972	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4973	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4974		error = EACCES;
4975	}
4976
4977	return (error);
4978}
4979
4980static int
4981zfs_freebsd_lookup(ap)
4982	struct vop_lookup_args /* {
4983		struct vnode *a_dvp;
4984		struct vnode **a_vpp;
4985		struct componentname *a_cnp;
4986	} */ *ap;
4987{
4988	struct componentname *cnp = ap->a_cnp;
4989	char nm[NAME_MAX + 1];
4990
4991	ASSERT(cnp->cn_namelen < sizeof(nm));
4992	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4993
4994	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4995	    cnp->cn_cred, cnp->cn_thread, 0));
4996}
4997
4998static int
4999zfs_cache_lookup(ap)
5000	struct vop_lookup_args /* {
5001		struct vnode *a_dvp;
5002		struct vnode **a_vpp;
5003		struct componentname *a_cnp;
5004	} */ *ap;
5005{
5006	zfsvfs_t *zfsvfs;
5007
5008	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5009	if (zfsvfs->z_use_namecache)
5010		return (vfs_cache_lookup(ap));
5011	else
5012		return (zfs_freebsd_lookup(ap));
5013}
5014
5015static int
5016zfs_freebsd_create(ap)
5017	struct vop_create_args /* {
5018		struct vnode *a_dvp;
5019		struct vnode **a_vpp;
5020		struct componentname *a_cnp;
5021		struct vattr *a_vap;
5022	} */ *ap;
5023{
5024	zfsvfs_t *zfsvfs;
5025	struct componentname *cnp = ap->a_cnp;
5026	vattr_t *vap = ap->a_vap;
5027	int error, mode;
5028
5029	ASSERT(cnp->cn_flags & SAVENAME);
5030
5031	vattr_init_mask(vap);
5032	mode = vap->va_mode & ALLPERMS;
5033	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5034
5035	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5036	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
5037	if (zfsvfs->z_use_namecache &&
5038	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
5039		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
5040	return (error);
5041}
5042
5043static int
5044zfs_freebsd_remove(ap)
5045	struct vop_remove_args /* {
5046		struct vnode *a_dvp;
5047		struct vnode *a_vp;
5048		struct componentname *a_cnp;
5049	} */ *ap;
5050{
5051
5052	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5053
5054	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5055	    ap->a_cnp->cn_cred));
5056}
5057
5058static int
5059zfs_freebsd_mkdir(ap)
5060	struct vop_mkdir_args /* {
5061		struct vnode *a_dvp;
5062		struct vnode **a_vpp;
5063		struct componentname *a_cnp;
5064		struct vattr *a_vap;
5065	} */ *ap;
5066{
5067	vattr_t *vap = ap->a_vap;
5068
5069	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5070
5071	vattr_init_mask(vap);
5072
5073	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5074	    ap->a_cnp->cn_cred));
5075}
5076
5077static int
5078zfs_freebsd_rmdir(ap)
5079	struct vop_rmdir_args /* {
5080		struct vnode *a_dvp;
5081		struct vnode *a_vp;
5082		struct componentname *a_cnp;
5083	} */ *ap;
5084{
5085	struct componentname *cnp = ap->a_cnp;
5086
5087	ASSERT(cnp->cn_flags & SAVENAME);
5088
5089	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5090}
5091
5092static int
5093zfs_freebsd_readdir(ap)
5094	struct vop_readdir_args /* {
5095		struct vnode *a_vp;
5096		struct uio *a_uio;
5097		struct ucred *a_cred;
5098		int *a_eofflag;
5099		int *a_ncookies;
5100		u_long **a_cookies;
5101	} */ *ap;
5102{
5103
5104	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5105	    ap->a_ncookies, ap->a_cookies));
5106}
5107
5108static int
5109zfs_freebsd_fsync(ap)
5110	struct vop_fsync_args /* {
5111		struct vnode *a_vp;
5112		int a_waitfor;
5113		struct thread *a_td;
5114	} */ *ap;
5115{
5116
5117	vop_stdfsync(ap);
5118	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5119}
5120
5121static int
5122zfs_freebsd_getattr(ap)
5123	struct vop_getattr_args /* {
5124		struct vnode *a_vp;
5125		struct vattr *a_vap;
5126		struct ucred *a_cred;
5127	} */ *ap;
5128{
5129	vattr_t *vap = ap->a_vap;
5130	xvattr_t xvap;
5131	u_long fflags = 0;
5132	int error;
5133
5134	xva_init(&xvap);
5135	xvap.xva_vattr = *vap;
5136	xvap.xva_vattr.va_mask |= AT_XVATTR;
5137
5138	/* Convert chflags into ZFS-type flags. */
5139	/* XXX: what about SF_SETTABLE?. */
5140	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5141	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5142	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5143	XVA_SET_REQ(&xvap, XAT_NODUMP);
5144	XVA_SET_REQ(&xvap, XAT_READONLY);
5145	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
5146	XVA_SET_REQ(&xvap, XAT_SYSTEM);
5147	XVA_SET_REQ(&xvap, XAT_HIDDEN);
5148	XVA_SET_REQ(&xvap, XAT_REPARSE);
5149	XVA_SET_REQ(&xvap, XAT_OFFLINE);
5150	XVA_SET_REQ(&xvap, XAT_SPARSE);
5151
5152	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5153	if (error != 0)
5154		return (error);
5155
5156	/* Convert ZFS xattr into chflags. */
5157#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5158	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5159		fflags |= (fflag);					\
5160} while (0)
5161	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5162	    xvap.xva_xoptattrs.xoa_immutable);
5163	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5164	    xvap.xva_xoptattrs.xoa_appendonly);
5165	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5166	    xvap.xva_xoptattrs.xoa_nounlink);
5167	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
5168	    xvap.xva_xoptattrs.xoa_archive);
5169	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5170	    xvap.xva_xoptattrs.xoa_nodump);
5171	FLAG_CHECK(UF_READONLY, XAT_READONLY,
5172	    xvap.xva_xoptattrs.xoa_readonly);
5173	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
5174	    xvap.xva_xoptattrs.xoa_system);
5175	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
5176	    xvap.xva_xoptattrs.xoa_hidden);
5177	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
5178	    xvap.xva_xoptattrs.xoa_reparse);
5179	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
5180	    xvap.xva_xoptattrs.xoa_offline);
5181	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
5182	    xvap.xva_xoptattrs.xoa_sparse);
5183
5184#undef	FLAG_CHECK
5185	*vap = xvap.xva_vattr;
5186	vap->va_flags = fflags;
5187	return (0);
5188}
5189
5190static int
5191zfs_freebsd_setattr(ap)
5192	struct vop_setattr_args /* {
5193		struct vnode *a_vp;
5194		struct vattr *a_vap;
5195		struct ucred *a_cred;
5196	} */ *ap;
5197{
5198	vnode_t *vp = ap->a_vp;
5199	vattr_t *vap = ap->a_vap;
5200	cred_t *cred = ap->a_cred;
5201	xvattr_t xvap;
5202	u_long fflags;
5203	uint64_t zflags;
5204
5205	vattr_init_mask(vap);
5206	vap->va_mask &= ~AT_NOSET;
5207
5208	xva_init(&xvap);
5209	xvap.xva_vattr = *vap;
5210
5211	zflags = VTOZ(vp)->z_pflags;
5212
5213	if (vap->va_flags != VNOVAL) {
5214		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5215		int error;
5216
5217		if (zfsvfs->z_use_fuids == B_FALSE)
5218			return (EOPNOTSUPP);
5219
5220		fflags = vap->va_flags;
5221		/*
5222		 * XXX KDM
5223		 * We need to figure out whether it makes sense to allow
5224		 * UF_REPARSE through, since we don't really have other
5225		 * facilities to handle reparse points and zfs_setattr()
5226		 * doesn't currently allow setting that attribute anyway.
5227		 */
5228		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5229		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5230		     UF_OFFLINE|UF_SPARSE)) != 0)
5231			return (EOPNOTSUPP);
5232		/*
5233		 * Unprivileged processes are not permitted to unset system
5234		 * flags, or modify flags if any system flags are set.
5235		 * Privileged non-jail processes may not modify system flags
5236		 * if securelevel > 0 and any existing system flags are set.
5237		 * Privileged jail processes behave like privileged non-jail
5238		 * processes if the security.jail.chflags_allowed sysctl is
5239		 * is non-zero; otherwise, they behave like unprivileged
5240		 * processes.
5241		 */
5242		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5243		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5244			if (zflags &
5245			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5246				error = securelevel_gt(cred, 0);
5247				if (error != 0)
5248					return (error);
5249			}
5250		} else {
5251			/*
5252			 * Callers may only modify the file flags on objects they
5253			 * have VADMIN rights for.
5254			 */
5255			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5256				return (error);
5257			if (zflags &
5258			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5259				return (EPERM);
5260			}
5261			if (fflags &
5262			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5263				return (EPERM);
5264			}
5265		}
5266
5267#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5268	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5269	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5270		XVA_SET_REQ(&xvap, (xflag));				\
5271		(xfield) = ((fflags & (fflag)) != 0);			\
5272	}								\
5273} while (0)
5274		/* Convert chflags into ZFS-type flags. */
5275		/* XXX: what about SF_SETTABLE?. */
5276		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5277		    xvap.xva_xoptattrs.xoa_immutable);
5278		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5279		    xvap.xva_xoptattrs.xoa_appendonly);
5280		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5281		    xvap.xva_xoptattrs.xoa_nounlink);
5282		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5283		    xvap.xva_xoptattrs.xoa_archive);
5284		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5285		    xvap.xva_xoptattrs.xoa_nodump);
5286		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5287		    xvap.xva_xoptattrs.xoa_readonly);
5288		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5289		    xvap.xva_xoptattrs.xoa_system);
5290		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5291		    xvap.xva_xoptattrs.xoa_hidden);
5292		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5293		    xvap.xva_xoptattrs.xoa_hidden);
5294		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5295		    xvap.xva_xoptattrs.xoa_offline);
5296		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5297		    xvap.xva_xoptattrs.xoa_sparse);
5298#undef	FLAG_CHANGE
5299	}
5300	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5301}
5302
5303static int
5304zfs_freebsd_rename(ap)
5305	struct vop_rename_args  /* {
5306		struct vnode *a_fdvp;
5307		struct vnode *a_fvp;
5308		struct componentname *a_fcnp;
5309		struct vnode *a_tdvp;
5310		struct vnode *a_tvp;
5311		struct componentname *a_tcnp;
5312	} */ *ap;
5313{
5314	vnode_t *fdvp = ap->a_fdvp;
5315	vnode_t *fvp = ap->a_fvp;
5316	vnode_t *tdvp = ap->a_tdvp;
5317	vnode_t *tvp = ap->a_tvp;
5318	int error;
5319
5320	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5321	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5322
5323	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5324	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5325
5326	vrele(fdvp);
5327	vrele(fvp);
5328	vrele(tdvp);
5329	if (tvp != NULL)
5330		vrele(tvp);
5331
5332	return (error);
5333}
5334
5335static int
5336zfs_freebsd_symlink(ap)
5337	struct vop_symlink_args /* {
5338		struct vnode *a_dvp;
5339		struct vnode **a_vpp;
5340		struct componentname *a_cnp;
5341		struct vattr *a_vap;
5342		char *a_target;
5343	} */ *ap;
5344{
5345	struct componentname *cnp = ap->a_cnp;
5346	vattr_t *vap = ap->a_vap;
5347
5348	ASSERT(cnp->cn_flags & SAVENAME);
5349
5350	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5351	vattr_init_mask(vap);
5352
5353	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5354	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5355}
5356
5357static int
5358zfs_freebsd_readlink(ap)
5359	struct vop_readlink_args /* {
5360		struct vnode *a_vp;
5361		struct uio *a_uio;
5362		struct ucred *a_cred;
5363	} */ *ap;
5364{
5365
5366	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5367}
5368
5369static int
5370zfs_freebsd_link(ap)
5371	struct vop_link_args /* {
5372		struct vnode *a_tdvp;
5373		struct vnode *a_vp;
5374		struct componentname *a_cnp;
5375	} */ *ap;
5376{
5377	struct componentname *cnp = ap->a_cnp;
5378	vnode_t *vp = ap->a_vp;
5379	vnode_t *tdvp = ap->a_tdvp;
5380
5381	if (tdvp->v_mount != vp->v_mount)
5382		return (EXDEV);
5383
5384	ASSERT(cnp->cn_flags & SAVENAME);
5385
5386	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5387}
5388
5389static int
5390zfs_freebsd_inactive(ap)
5391	struct vop_inactive_args /* {
5392		struct vnode *a_vp;
5393		struct thread *a_td;
5394	} */ *ap;
5395{
5396	vnode_t *vp = ap->a_vp;
5397
5398	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5399	return (0);
5400}
5401
5402static int
5403zfs_freebsd_reclaim(ap)
5404	struct vop_reclaim_args /* {
5405		struct vnode *a_vp;
5406		struct thread *a_td;
5407	} */ *ap;
5408{
5409	vnode_t	*vp = ap->a_vp;
5410	znode_t	*zp = VTOZ(vp);
5411	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5412
5413	ASSERT(zp != NULL);
5414
5415	/* Destroy the vm object and flush associated pages. */
5416	vnode_destroy_vobject(vp);
5417
5418	/*
5419	 * z_teardown_inactive_lock protects from a race with
5420	 * zfs_znode_dmu_fini in zfsvfs_teardown during
5421	 * force unmount.
5422	 */
5423	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5424	if (zp->z_sa_hdl == NULL)
5425		zfs_znode_free(zp);
5426	else
5427		zfs_zinactive(zp);
5428	rw_exit(&zfsvfs->z_teardown_inactive_lock);
5429
5430	vp->v_data = NULL;
5431	return (0);
5432}
5433
5434static int
5435zfs_freebsd_fid(ap)
5436	struct vop_fid_args /* {
5437		struct vnode *a_vp;
5438		struct fid *a_fid;
5439	} */ *ap;
5440{
5441
5442	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5443}
5444
5445static int
5446zfs_freebsd_pathconf(ap)
5447	struct vop_pathconf_args /* {
5448		struct vnode *a_vp;
5449		int a_name;
5450		register_t *a_retval;
5451	} */ *ap;
5452{
5453	ulong_t val;
5454	int error;
5455
5456	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5457	if (error == 0)
5458		*ap->a_retval = val;
5459	else if (error == EOPNOTSUPP)
5460		error = vop_stdpathconf(ap);
5461	return (error);
5462}
5463
5464static int
5465zfs_freebsd_fifo_pathconf(ap)
5466	struct vop_pathconf_args /* {
5467		struct vnode *a_vp;
5468		int a_name;
5469		register_t *a_retval;
5470	} */ *ap;
5471{
5472
5473	switch (ap->a_name) {
5474	case _PC_ACL_EXTENDED:
5475	case _PC_ACL_NFS4:
5476	case _PC_ACL_PATH_MAX:
5477	case _PC_MAC_PRESENT:
5478		return (zfs_freebsd_pathconf(ap));
5479	default:
5480		return (fifo_specops.vop_pathconf(ap));
5481	}
5482}
5483
5484/*
5485 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5486 * extended attribute name:
5487 *
5488 *	NAMESPACE	PREFIX
5489 *	system		freebsd:system:
5490 *	user		(none, can be used to access ZFS fsattr(5) attributes
5491 *			created on Solaris)
5492 */
5493static int
5494zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5495    size_t size)
5496{
5497	const char *namespace, *prefix, *suffix;
5498
5499	/* We don't allow '/' character in attribute name. */
5500	if (strchr(name, '/') != NULL)
5501		return (EINVAL);
5502	/* We don't allow attribute names that start with "freebsd:" string. */
5503	if (strncmp(name, "freebsd:", 8) == 0)
5504		return (EINVAL);
5505
5506	bzero(attrname, size);
5507
5508	switch (attrnamespace) {
5509	case EXTATTR_NAMESPACE_USER:
5510#if 0
5511		prefix = "freebsd:";
5512		namespace = EXTATTR_NAMESPACE_USER_STRING;
5513		suffix = ":";
5514#else
5515		/*
5516		 * This is the default namespace by which we can access all
5517		 * attributes created on Solaris.
5518		 */
5519		prefix = namespace = suffix = "";
5520#endif
5521		break;
5522	case EXTATTR_NAMESPACE_SYSTEM:
5523		prefix = "freebsd:";
5524		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5525		suffix = ":";
5526		break;
5527	case EXTATTR_NAMESPACE_EMPTY:
5528	default:
5529		return (EINVAL);
5530	}
5531	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5532	    name) >= size) {
5533		return (ENAMETOOLONG);
5534	}
5535	return (0);
5536}
5537
5538/*
5539 * Vnode operating to retrieve a named extended attribute.
5540 */
5541static int
5542zfs_getextattr(struct vop_getextattr_args *ap)
5543/*
5544vop_getextattr {
5545	IN struct vnode *a_vp;
5546	IN int a_attrnamespace;
5547	IN const char *a_name;
5548	INOUT struct uio *a_uio;
5549	OUT size_t *a_size;
5550	IN struct ucred *a_cred;
5551	IN struct thread *a_td;
5552};
5553*/
5554{
5555	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5556	struct thread *td = ap->a_td;
5557	struct nameidata nd;
5558	char attrname[255];
5559	struct vattr va;
5560	vnode_t *xvp = NULL, *vp;
5561	int error, flags;
5562
5563	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5564	    ap->a_cred, ap->a_td, VREAD);
5565	if (error != 0)
5566		return (error);
5567
5568	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5569	    sizeof(attrname));
5570	if (error != 0)
5571		return (error);
5572
5573	ZFS_ENTER(zfsvfs);
5574
5575	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5576	    LOOKUP_XATTR);
5577	if (error != 0) {
5578		ZFS_EXIT(zfsvfs);
5579		return (error);
5580	}
5581
5582	flags = FREAD;
5583	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5584	    xvp, td);
5585	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
5586	vp = nd.ni_vp;
5587	NDFREE(&nd, NDF_ONLY_PNBUF);
5588	if (error != 0) {
5589		ZFS_EXIT(zfsvfs);
5590		if (error == ENOENT)
5591			error = ENOATTR;
5592		return (error);
5593	}
5594
5595	if (ap->a_size != NULL) {
5596		error = VOP_GETATTR(vp, &va, ap->a_cred);
5597		if (error == 0)
5598			*ap->a_size = (size_t)va.va_size;
5599	} else if (ap->a_uio != NULL)
5600		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5601
5602	VOP_UNLOCK(vp, 0);
5603	vn_close(vp, flags, ap->a_cred, td);
5604	ZFS_EXIT(zfsvfs);
5605
5606	return (error);
5607}
5608
5609/*
5610 * Vnode operation to remove a named attribute.
5611 */
5612int
5613zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5614/*
5615vop_deleteextattr {
5616	IN struct vnode *a_vp;
5617	IN int a_attrnamespace;
5618	IN const char *a_name;
5619	IN struct ucred *a_cred;
5620	IN struct thread *a_td;
5621};
5622*/
5623{
5624	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5625	struct thread *td = ap->a_td;
5626	struct nameidata nd;
5627	char attrname[255];
5628	struct vattr va;
5629	vnode_t *xvp = NULL, *vp;
5630	int error, flags;
5631
5632	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5633	    ap->a_cred, ap->a_td, VWRITE);
5634	if (error != 0)
5635		return (error);
5636
5637	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5638	    sizeof(attrname));
5639	if (error != 0)
5640		return (error);
5641
5642	ZFS_ENTER(zfsvfs);
5643
5644	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5645	    LOOKUP_XATTR);
5646	if (error != 0) {
5647		ZFS_EXIT(zfsvfs);
5648		return (error);
5649	}
5650
5651	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5652	    UIO_SYSSPACE, attrname, xvp, td);
5653	error = namei(&nd);
5654	vp = nd.ni_vp;
5655	if (error != 0) {
5656		ZFS_EXIT(zfsvfs);
5657		NDFREE(&nd, NDF_ONLY_PNBUF);
5658		if (error == ENOENT)
5659			error = ENOATTR;
5660		return (error);
5661	}
5662
5663	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5664	NDFREE(&nd, NDF_ONLY_PNBUF);
5665
5666	vput(nd.ni_dvp);
5667	if (vp == nd.ni_dvp)
5668		vrele(vp);
5669	else
5670		vput(vp);
5671	ZFS_EXIT(zfsvfs);
5672
5673	return (error);
5674}
5675
5676/*
5677 * Vnode operation to set a named attribute.
5678 */
5679static int
5680zfs_setextattr(struct vop_setextattr_args *ap)
5681/*
5682vop_setextattr {
5683	IN struct vnode *a_vp;
5684	IN int a_attrnamespace;
5685	IN const char *a_name;
5686	INOUT struct uio *a_uio;
5687	IN struct ucred *a_cred;
5688	IN struct thread *a_td;
5689};
5690*/
5691{
5692	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5693	struct thread *td = ap->a_td;
5694	struct nameidata nd;
5695	char attrname[255];
5696	struct vattr va;
5697	vnode_t *xvp = NULL, *vp;
5698	int error, flags;
5699
5700	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5701	    ap->a_cred, ap->a_td, VWRITE);
5702	if (error != 0)
5703		return (error);
5704
5705	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5706	    sizeof(attrname));
5707	if (error != 0)
5708		return (error);
5709
5710	ZFS_ENTER(zfsvfs);
5711
5712	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5713	    LOOKUP_XATTR | CREATE_XATTR_DIR);
5714	if (error != 0) {
5715		ZFS_EXIT(zfsvfs);
5716		return (error);
5717	}
5718
5719	flags = FFLAGS(O_WRONLY | O_CREAT);
5720	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5721	    xvp, td);
5722	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5723	vp = nd.ni_vp;
5724	NDFREE(&nd, NDF_ONLY_PNBUF);
5725	if (error != 0) {
5726		ZFS_EXIT(zfsvfs);
5727		return (error);
5728	}
5729
5730	VATTR_NULL(&va);
5731	va.va_size = 0;
5732	error = VOP_SETATTR(vp, &va, ap->a_cred);
5733	if (error == 0)
5734		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5735
5736	VOP_UNLOCK(vp, 0);
5737	vn_close(vp, flags, ap->a_cred, td);
5738	ZFS_EXIT(zfsvfs);
5739
5740	return (error);
5741}
5742
5743/*
5744 * Vnode operation to retrieve extended attributes on a vnode.
5745 */
5746static int
5747zfs_listextattr(struct vop_listextattr_args *ap)
5748/*
5749vop_listextattr {
5750	IN struct vnode *a_vp;
5751	IN int a_attrnamespace;
5752	INOUT struct uio *a_uio;
5753	OUT size_t *a_size;
5754	IN struct ucred *a_cred;
5755	IN struct thread *a_td;
5756};
5757*/
5758{
5759	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5760	struct thread *td = ap->a_td;
5761	struct nameidata nd;
5762	char attrprefix[16];
5763	u_char dirbuf[sizeof(struct dirent)];
5764	struct dirent *dp;
5765	struct iovec aiov;
5766	struct uio auio, *uio = ap->a_uio;
5767	size_t *sizep = ap->a_size;
5768	size_t plen;
5769	vnode_t *xvp = NULL, *vp;
5770	int done, error, eof, pos;
5771
5772	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5773	    ap->a_cred, ap->a_td, VREAD);
5774	if (error != 0)
5775		return (error);
5776
5777	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5778	    sizeof(attrprefix));
5779	if (error != 0)
5780		return (error);
5781	plen = strlen(attrprefix);
5782
5783	ZFS_ENTER(zfsvfs);
5784
5785	if (sizep != NULL)
5786		*sizep = 0;
5787
5788	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5789	    LOOKUP_XATTR);
5790	if (error != 0) {
5791		ZFS_EXIT(zfsvfs);
5792		/*
5793		 * ENOATTR means that the EA directory does not yet exist,
5794		 * i.e. there are no extended attributes there.
5795		 */
5796		if (error == ENOATTR)
5797			error = 0;
5798		return (error);
5799	}
5800
5801	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5802	    UIO_SYSSPACE, ".", xvp, td);
5803	error = namei(&nd);
5804	vp = nd.ni_vp;
5805	NDFREE(&nd, NDF_ONLY_PNBUF);
5806	if (error != 0) {
5807		ZFS_EXIT(zfsvfs);
5808		return (error);
5809	}
5810
5811	auio.uio_iov = &aiov;
5812	auio.uio_iovcnt = 1;
5813	auio.uio_segflg = UIO_SYSSPACE;
5814	auio.uio_td = td;
5815	auio.uio_rw = UIO_READ;
5816	auio.uio_offset = 0;
5817
5818	do {
5819		u_char nlen;
5820
5821		aiov.iov_base = (void *)dirbuf;
5822		aiov.iov_len = sizeof(dirbuf);
5823		auio.uio_resid = sizeof(dirbuf);
5824		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5825		done = sizeof(dirbuf) - auio.uio_resid;
5826		if (error != 0)
5827			break;
5828		for (pos = 0; pos < done;) {
5829			dp = (struct dirent *)(dirbuf + pos);
5830			pos += dp->d_reclen;
5831			/*
5832			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5833			 * is what we get when attribute was created on Solaris.
5834			 */
5835			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5836				continue;
5837			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5838				continue;
5839			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5840				continue;
5841			nlen = dp->d_namlen - plen;
5842			if (sizep != NULL)
5843				*sizep += 1 + nlen;
5844			else if (uio != NULL) {
5845				/*
5846				 * Format of extattr name entry is one byte for
5847				 * length and the rest for name.
5848				 */
5849				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5850				if (error == 0) {
5851					error = uiomove(dp->d_name + plen, nlen,
5852					    uio->uio_rw, uio);
5853				}
5854				if (error != 0)
5855					break;
5856			}
5857		}
5858	} while (!eof && error == 0);
5859
5860	vput(vp);
5861	ZFS_EXIT(zfsvfs);
5862
5863	return (error);
5864}
5865
5866int
5867zfs_freebsd_getacl(ap)
5868	struct vop_getacl_args /* {
5869		struct vnode *vp;
5870		acl_type_t type;
5871		struct acl *aclp;
5872		struct ucred *cred;
5873		struct thread *td;
5874	} */ *ap;
5875{
5876	int		error;
5877	vsecattr_t      vsecattr;
5878
5879	if (ap->a_type != ACL_TYPE_NFS4)
5880		return (EINVAL);
5881
5882	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5883	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5884		return (error);
5885
5886	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5887	if (vsecattr.vsa_aclentp != NULL)
5888		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5889
5890	return (error);
5891}
5892
5893int
5894zfs_freebsd_setacl(ap)
5895	struct vop_setacl_args /* {
5896		struct vnode *vp;
5897		acl_type_t type;
5898		struct acl *aclp;
5899		struct ucred *cred;
5900		struct thread *td;
5901	} */ *ap;
5902{
5903	int		error;
5904	vsecattr_t      vsecattr;
5905	int		aclbsize;	/* size of acl list in bytes */
5906	aclent_t	*aaclp;
5907
5908	if (ap->a_type != ACL_TYPE_NFS4)
5909		return (EINVAL);
5910
5911	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5912		return (EINVAL);
5913
5914	/*
5915	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5916	 * splitting every entry into two and appending "canonical six"
5917	 * entries at the end.  Don't allow for setting an ACL that would
5918	 * cause chmod(2) to run out of ACL entries.
5919	 */
5920	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5921		return (ENOSPC);
5922
5923	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5924	if (error != 0)
5925		return (error);
5926
5927	vsecattr.vsa_mask = VSA_ACE;
5928	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5929	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5930	aaclp = vsecattr.vsa_aclentp;
5931	vsecattr.vsa_aclentsz = aclbsize;
5932
5933	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5934	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5935	kmem_free(aaclp, aclbsize);
5936
5937	return (error);
5938}
5939
/*
 * Vnode operation to check an ACL without applying it.
 *
 * Not implemented: the arguments are ignored and EOPNOTSUPP is always
 * returned.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
/*
vop_aclcheck {
	struct vnode *a_vp;
	acl_type_t a_type;
	struct acl *a_aclp;
	struct ucred *a_cred;
	struct thread *a_td;
};
*/
{

	return (EOPNOTSUPP);
}
5953
5954static int
5955zfs_vptocnp(struct vop_vptocnp_args *ap)
5956{
5957	vnode_t *covered_vp;
5958	vnode_t *vp = ap->a_vp;;
5959	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5960	znode_t *zp = VTOZ(vp);
5961	uint64_t parent;
5962	int ltype;
5963	int error;
5964
5965	ZFS_ENTER(zfsvfs);
5966	ZFS_VERIFY_ZP(zp);
5967
5968	/*
5969	 * If we are a snapshot mounted under .zfs, run the operation
5970	 * on the covered vnode.
5971	 */
5972	if ((error = sa_lookup(zp->z_sa_hdl,
5973	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) {
5974		ZFS_EXIT(zfsvfs);
5975		return (error);
5976	}
5977
5978	if (zp->z_id != parent || zfsvfs->z_parent == zfsvfs) {
5979		ZFS_EXIT(zfsvfs);
5980		return (vop_stdvptocnp(ap));
5981	}
5982	ZFS_EXIT(zfsvfs);
5983
5984	covered_vp = vp->v_mount->mnt_vnodecovered;
5985	vhold(covered_vp);
5986	ltype = VOP_ISLOCKED(vp);
5987	VOP_UNLOCK(vp, 0);
5988	error = vget(covered_vp, LK_EXCLUSIVE, curthread);
5989	vdrop(covered_vp);
5990	if (error == 0) {
5991		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5992		    ap->a_buf, ap->a_buflen);
5993		vput(covered_vp);
5994	}
5995	vn_lock(vp, ltype | LK_RETRY);
5996	if ((vp->v_iflag & VI_DOOMED) != 0)
5997		error = SET_ERROR(ENOENT);
5998	return (error);
5999}
6000
6001#ifdef DIAGNOSTIC
6002static int
6003zfs_lock(ap)
6004	struct vop_lock1_args /* {
6005		struct vnode *a_vp;
6006		int a_flags;
6007		char *file;
6008		int line;
6009	} */ *ap;
6010{
6011	zfsvfs_t *zfsvfs;
6012	znode_t *zp;
6013	vnode_t *vp;
6014	int flags;
6015	int err;
6016
6017	vp = ap->a_vp;
6018	flags = ap->a_flags;
6019	if ((flags & LK_INTERLOCK) == 0 && (flags & LK_NOWAIT) == 0 &&
6020	    (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) {
6021		zfsvfs = zp->z_zfsvfs;
6022		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
6023	}
6024	err = vop_stdlock(ap);
6025	if ((flags & LK_INTERLOCK) != 0 && (flags & LK_NOWAIT) == 0 &&
6026	    (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) {
6027		zfsvfs = zp->z_zfsvfs;
6028		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
6029	}
6030	return (err);
6031}
6032#endif
6033
6034struct vop_vector zfs_vnodeops;
6035struct vop_vector zfs_fifoops;
6036struct vop_vector zfs_shareops;
6037
6038struct vop_vector zfs_vnodeops = {
6039	.vop_default =		&default_vnodeops,
6040	.vop_inactive =		zfs_freebsd_inactive,
6041	.vop_reclaim =		zfs_freebsd_reclaim,
6042	.vop_access =		zfs_freebsd_access,
6043	.vop_lookup =		zfs_cache_lookup,
6044	.vop_cachedlookup =	zfs_freebsd_lookup,
6045	.vop_getattr =		zfs_freebsd_getattr,
6046	.vop_setattr =		zfs_freebsd_setattr,
6047	.vop_create =		zfs_freebsd_create,
6048	.vop_mknod =		zfs_freebsd_create,
6049	.vop_mkdir =		zfs_freebsd_mkdir,
6050	.vop_readdir =		zfs_freebsd_readdir,
6051	.vop_fsync =		zfs_freebsd_fsync,
6052	.vop_open =		zfs_freebsd_open,
6053	.vop_close =		zfs_freebsd_close,
6054	.vop_rmdir =		zfs_freebsd_rmdir,
6055	.vop_ioctl =		zfs_freebsd_ioctl,
6056	.vop_link =		zfs_freebsd_link,
6057	.vop_symlink =		zfs_freebsd_symlink,
6058	.vop_readlink =		zfs_freebsd_readlink,
6059	.vop_read =		zfs_freebsd_read,
6060	.vop_write =		zfs_freebsd_write,
6061	.vop_remove =		zfs_freebsd_remove,
6062	.vop_rename =		zfs_freebsd_rename,
6063	.vop_pathconf =		zfs_freebsd_pathconf,
6064	.vop_bmap =		zfs_freebsd_bmap,
6065	.vop_fid =		zfs_freebsd_fid,
6066	.vop_getextattr =	zfs_getextattr,
6067	.vop_deleteextattr =	zfs_deleteextattr,
6068	.vop_setextattr =	zfs_setextattr,
6069	.vop_listextattr =	zfs_listextattr,
6070	.vop_getacl =		zfs_freebsd_getacl,
6071	.vop_setacl =		zfs_freebsd_setacl,
6072	.vop_aclcheck =		zfs_freebsd_aclcheck,
6073	.vop_getpages =		zfs_freebsd_getpages,
6074	.vop_putpages =		zfs_freebsd_putpages,
6075	.vop_vptocnp =		zfs_vptocnp,
6076#ifdef DIAGNOSTIC
6077	.vop_lock1 =		zfs_lock,
6078#endif
6079};
6080
6081struct vop_vector zfs_fifoops = {
6082	.vop_default =		&fifo_specops,
6083	.vop_fsync =		zfs_freebsd_fsync,
6084	.vop_access =		zfs_freebsd_access,
6085	.vop_getattr =		zfs_freebsd_getattr,
6086	.vop_inactive =		zfs_freebsd_inactive,
6087	.vop_read =		VOP_PANIC,
6088	.vop_reclaim =		zfs_freebsd_reclaim,
6089	.vop_setattr =		zfs_freebsd_setattr,
6090	.vop_write =		VOP_PANIC,
6091	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
6092	.vop_fid =		zfs_freebsd_fid,
6093	.vop_getacl =		zfs_freebsd_getacl,
6094	.vop_setacl =		zfs_freebsd_setacl,
6095	.vop_aclcheck =		zfs_freebsd_aclcheck,
6096};
6097
6098/*
6099 * special share hidden files vnode operations template
6100 */
6101struct vop_vector zfs_shareops = {
6102	.vop_default =		&default_vnodeops,
6103	.vop_access =		zfs_freebsd_access,
6104	.vop_inactive =		zfs_freebsd_inactive,
6105	.vop_reclaim =		zfs_freebsd_reclaim,
6106	.vop_fid =		zfs_freebsd_fid,
6107	.vop_pathconf =		zfs_freebsd_pathconf,
6108};
6109