zfs_vnops.c revision 326428
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28/* Portions Copyright 2007 Jeremy Teo */
29/* Portions Copyright 2010 Robert Milkowski */
30
31#include <sys/types.h>
32#include <sys/param.h>
33#include <sys/time.h>
34#include <sys/systm.h>
35#include <sys/sysmacros.h>
36#include <sys/resource.h>
37#include <sys/vfs.h>
38#include <sys/vm.h>
39#include <sys/vnode.h>
40#include <sys/file.h>
41#include <sys/stat.h>
42#include <sys/kmem.h>
43#include <sys/taskq.h>
44#include <sys/uio.h>
45#include <sys/atomic.h>
46#include <sys/namei.h>
47#include <sys/mman.h>
48#include <sys/cmn_err.h>
49#include <sys/errno.h>
50#include <sys/unistd.h>
51#include <sys/zfs_dir.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/fs/zfs.h>
54#include <sys/dmu.h>
55#include <sys/dmu_objset.h>
56#include <sys/spa.h>
57#include <sys/txg.h>
58#include <sys/dbuf.h>
59#include <sys/zap.h>
60#include <sys/sa.h>
61#include <sys/dirent.h>
62#include <sys/policy.h>
63#include <sys/sunddi.h>
64#include <sys/filio.h>
65#include <sys/sid.h>
66#include <sys/zfs_ctldir.h>
67#include <sys/zfs_fuid.h>
68#include <sys/zfs_sa.h>
69#include <sys/zfs_rlock.h>
70#include <sys/extdirent.h>
71#include <sys/kidmap.h>
72#include <sys/bio.h>
73#include <sys/buf.h>
74#include <sys/sched.h>
75#include <sys/acl.h>
76#include <vm/vm_param.h>
77
78/*
79 * Programming rules.
80 *
81 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
82 * properly lock its in-core state, create a DMU transaction, do the work,
83 * record this work in the intent log (ZIL), commit the DMU transaction,
84 * and wait for the intent log to commit if it is a synchronous operation.
85 * Moreover, the vnode ops must work in both normal and log replay context.
86 * The ordering of events is important to avoid deadlocks and references
87 * to freed memory.  The example below illustrates the following Big Rules:
88 *
89 *  (1)	A check must be made in each zfs thread for a mounted file system.
90 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
91 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
92 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
93 *	can return EIO from the calling function.
94 *
95 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
96 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
97 *	First, if it's the last reference, the vnode/znode
98 *	can be freed, so the zp may point to freed memory.  Second, the last
99 *	reference will call zfs_zinactive(), which may induce a lot of work --
100 *	pushing cached pages (which acquires range locks) and syncing out
101 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
102 *	which could deadlock the system if you were already holding one.
103 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
104 *
105 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
106 *	as they can span dmu_tx_assign() calls.
107 *
108 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
109 *      dmu_tx_assign().  This is critical because we don't want to block
110 *      while holding locks.
111 *
112 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
113 *	reduces lock contention and CPU usage when we must wait (note that if
114 *	throughput is constrained by the storage, nearly every transaction
115 *	must wait).
116 *
117 *      Note, in particular, that if a lock is sometimes acquired before
118 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
119 *      to use a non-blocking assign can deadlock the system.  The scenario:
120 *
121 *	Thread A has grabbed a lock before calling dmu_tx_assign().
122 *	Thread B is in an already-assigned tx, and blocks for this lock.
123 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
124 *	forever, because the previous txg can't quiesce until B's tx commits.
125 *
126 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
127 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
128 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
129 *	to indicate that this operation has already called dmu_tx_wait().
130 *	This will ensure that we don't retry forever, waiting a short bit
131 *	each time.
132 *
133 *  (5)	If the operation succeeded, generate the intent log entry for it
134 *	before dropping locks.  This ensures that the ordering of events
135 *	in the intent log matches the order in which they actually occurred.
136 *	During ZIL replay the zfs_log_* functions will update the sequence
137 *	number to indicate the zil transaction has replayed.
138 *
139 *  (6)	At the end of each vnode op, the DMU tx must always commit,
140 *	regardless of whether there were any errors.
141 *
142 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
143 *	to ensure that synchronous semantics are provided when necessary.
144 *
145 * In general, this is how things should be ordered in each vnode op:
146 *
147 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
148 * top:
149 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
150 *	rw_enter(...);			// grab any other locks you need
151 *	tx = dmu_tx_create(...);	// get DMU tx
152 *	dmu_tx_hold_*();		// hold each object you might modify
153 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
154 *	if (error) {
155 *		rw_exit(...);		// drop locks
156 *		zfs_dirent_unlock(dl);	// unlock directory entry
157 *		VN_RELE(...);		// release held vnodes
158 *		if (error == ERESTART) {
159 *			waited = B_TRUE;
160 *			dmu_tx_wait(tx);
161 *			dmu_tx_abort(tx);
162 *			goto top;
163 *		}
164 *		dmu_tx_abort(tx);	// abort DMU tx
165 *		ZFS_EXIT(zfsvfs);	// finished in zfs
166 *		return (error);		// really out of space
167 *	}
168 *	error = do_real_work();		// do whatever this VOP does
169 *	if (error == 0)
170 *		zfs_log_*(...);		// on success, make ZIL entry
171 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
172 *	rw_exit(...);			// drop locks
173 *	zfs_dirent_unlock(dl);		// unlock directory entry
174 *	VN_RELE(...);			// release held vnodes
175 *	zil_commit(zilog, foid);	// synchronous when necessary
176 *	ZFS_EXIT(zfsvfs);		// finished in zfs
177 *	return (error);			// done, report error
178 */
179
180/* ARGSUSED */
181static int
182zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
183{
184	znode_t	*zp = VTOZ(*vpp);
185	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
186
187	ZFS_ENTER(zfsvfs);
188	ZFS_VERIFY_ZP(zp);
189
190	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
191	    ((flag & FAPPEND) == 0)) {
192		ZFS_EXIT(zfsvfs);
193		return (SET_ERROR(EPERM));
194	}
195
196	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
197	    ZTOV(zp)->v_type == VREG &&
198	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
199		if (fs_vscan(*vpp, cr, 0) != 0) {
200			ZFS_EXIT(zfsvfs);
201			return (SET_ERROR(EACCES));
202		}
203	}
204
205	/* Keep a count of the synchronous opens in the znode */
206	if (flag & (FSYNC | FDSYNC))
207		atomic_inc_32(&zp->z_sync_cnt);
208
209	ZFS_EXIT(zfsvfs);
210	return (0);
211}
212
213/* ARGSUSED */
214static int
215zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
216    caller_context_t *ct)
217{
218	znode_t	*zp = VTOZ(vp);
219	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
220
221	/*
222	 * Clean up any locks held by this process on the vp.
223	 */
224	cleanlocks(vp, ddi_get_pid(), 0);
225	cleanshares(vp, ddi_get_pid());
226
227	ZFS_ENTER(zfsvfs);
228	ZFS_VERIFY_ZP(zp);
229
230	/* Decrement the synchronous opens in the znode */
231	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
232		atomic_dec_32(&zp->z_sync_cnt);
233
234	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
235	    ZTOV(zp)->v_type == VREG &&
236	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
237		VERIFY(fs_vscan(vp, cr, 1) == 0);
238
239	ZFS_EXIT(zfsvfs);
240	return (0);
241}
242
243/*
244 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
245 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
246 */
247static int
248zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
249{
250	znode_t	*zp = VTOZ(vp);
251	uint64_t noff = (uint64_t)*off; /* new offset */
252	uint64_t file_sz;
253	int error;
254	boolean_t hole;
255
256	file_sz = zp->z_size;
257	if (noff >= file_sz)  {
258		return (SET_ERROR(ENXIO));
259	}
260
261	if (cmd == _FIO_SEEK_HOLE)
262		hole = B_TRUE;
263	else
264		hole = B_FALSE;
265
266	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
267
268	if (error == ESRCH)
269		return (SET_ERROR(ENXIO));
270
271	/*
272	 * We could find a hole that begins after the logical end-of-file,
273	 * because dmu_offset_next() only works on whole blocks.  If the
274	 * EOF falls mid-block, then indicate that the "virtual hole"
275	 * at the end of the file begins at the logical EOF, rather than
276	 * at the end of the last block.
277	 */
278	if (noff > file_sz) {
279		ASSERT(hole);
280		noff = file_sz;
281	}
282
283	if (noff < *off)
284		return (error);
285	*off = noff;
286	return (error);
287}
288
/*
 * Dispatch the small set of ioctls ZFS handles at the vnode level:
 * filesystem flush and directio hints (accepted but faked), hole/data
 * seeking for lseek(SEEK_HOLE/SEEK_DATA), and, on illumos, filled-block
 * counting.  Anything else returns ENOTTY.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		/* ZFS needs no explicit flush; claim success. */
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		/* On FreeBSD "data" already points at kernel memory. */
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}
388
/*
 * Look up (or, failing that, try to allocate from cache) the page of
 * vp's VM object backing byte offset "start", and shared-busy it so it
 * cannot be reclaimed or exclusively busied while the caller copies
 * data into it.  Returns the busied page, or NULL when no valid page
 * is resident and none could be taken from cache.  The caller must
 * hold the object write-locked; the lock is dropped and re-taken if we
 * have to sleep on an exclusive-busy page.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp == NULL) {
			/* Not resident: try to pull a cached page, pre-busied. */
			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
			    VM_ALLOC_SBUSY);
		} else {
			/* Resident but invalid: treat as not present. */
			ASSERT(pp != NULL && !pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			/* Account an in-progress paging op on the object. */
			vm_object_pip_add(obj, 1);
			/* Write-protect mappings so userland can't race us. */
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}
449
/*
 * Undo page_busy(): release the page's shared-busy state and drop the
 * paging-in-progress count it added to the owning object.
 */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}
457
/*
 * Find the valid resident page of vp's VM object backing byte offset
 * "start" and take a hold on it (vm_page_hold) so the caller can copy
 * out of it without the page being freed.  Returns NULL when no valid
 * page is resident.  The caller must hold the object write-locked; the
 * lock is dropped and re-taken if we have to sleep on an
 * exclusive-busy page.
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}
495
/*
 * Undo page_hold(): drop the hold taken on the page, acquiring the
 * page lock around the unhold as vm_page_unhold requires.
 */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}
504
505/*
506 * When a file is memory mapped, we must keep the IO data synchronized
507 * between the DMU cache and the memory mapped pages.  What this means:
508 *
509 * On Write:	If we find a memory mapped page, we write to *both*
510 *		the page and the dmu buffer.
511 */
512static void
513update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
514    int segflg, dmu_tx_t *tx)
515{
516	vm_object_t obj;
517	struct sf_buf *sf;
518	caddr_t va;
519	int off;
520
521	ASSERT(segflg != UIO_NOCOPY);
522	ASSERT(vp->v_mount != NULL);
523	obj = vp->v_object;
524	ASSERT(obj != NULL);
525
526	off = start & PAGEOFFSET;
527	zfs_vmobject_wlock(obj);
528	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
529		vm_page_t pp;
530		int nbytes = imin(PAGESIZE - off, len);
531
532		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
533			zfs_vmobject_wunlock(obj);
534
535			va = zfs_map_page(pp, &sf);
536			(void) dmu_read(os, oid, start+off, nbytes,
537			    va+off, DMU_READ_PREFETCH);;
538			zfs_unmap_page(sf);
539
540			zfs_vmobject_wlock(obj);
541			page_unbusy(pp);
542		}
543		len -= nbytes;
544		off = 0;
545	}
546	vm_object_pip_wakeupn(obj, 0);
547	zfs_vmobject_wunlock(obj);
548}
549
550/*
551 * Read with UIO_NOCOPY flag means that sendfile(2) requests
552 * ZFS to populate a range of page cache pages with data.
553 *
554 * NOTE: this function could be optimized to pre-allocate
555 * all pages in advance, drain exclusive busy on all of them,
556 * map them into contiguous KVA region and populate them
557 * in one single dmu_read() call.
558 */
559static int
560mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
561{
562	znode_t *zp = VTOZ(vp);
563	objset_t *os = zp->z_zfsvfs->z_os;
564	struct sf_buf *sf;
565	vm_object_t obj;
566	vm_page_t pp;
567	int64_t start;
568	caddr_t va;
569	int len = nbytes;
570	int off;
571	int error = 0;
572
573	ASSERT(uio->uio_segflg == UIO_NOCOPY);
574	ASSERT(vp->v_mount != NULL);
575	obj = vp->v_object;
576	ASSERT(obj != NULL);
577	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
578
579	zfs_vmobject_wlock(obj);
580	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
581		int bytes = MIN(PAGESIZE, len);
582
583		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
584		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
585		if (pp->valid == 0) {
586			zfs_vmobject_wunlock(obj);
587			va = zfs_map_page(pp, &sf);
588			error = dmu_read(os, zp->z_id, start, bytes, va,
589			    DMU_READ_PREFETCH);
590			if (bytes != PAGESIZE && error == 0)
591				bzero(va + bytes, PAGESIZE - bytes);
592			zfs_unmap_page(sf);
593			zfs_vmobject_wlock(obj);
594			vm_page_sunbusy(pp);
595			vm_page_lock(pp);
596			if (error) {
597				if (pp->wire_count == 0 && pp->valid == 0 &&
598				    !vm_page_busied(pp))
599					vm_page_free(pp);
600			} else {
601				pp->valid = VM_PAGE_BITS_ALL;
602				vm_page_activate(pp);
603			}
604			vm_page_unlock(pp);
605		} else {
606			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
607			vm_page_sunbusy(pp);
608		}
609		if (error)
610			break;
611		uio->uio_resid -= bytes;
612		uio->uio_offset += bytes;
613		len -= bytes;
614	}
615	zfs_vmobject_wunlock(obj);
616	return (error);
617}
618
619/*
620 * When a file is memory mapped, we must keep the IO data synchronized
621 * between the DMU cache and the memory mapped pages.  What this means:
622 *
623 * On Read:	We "read" preferentially from memory mapped pages,
624 *		else we default from the dmu buffer.
625 *
626 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
627 *	 the file is memory mapped.
628 */
629static int
630mappedread(vnode_t *vp, int nbytes, uio_t *uio)
631{
632	znode_t *zp = VTOZ(vp);
633	vm_object_t obj;
634	int64_t start;
635	caddr_t va;
636	int len = nbytes;
637	int off;
638	int error = 0;
639
640	ASSERT(vp->v_mount != NULL);
641	obj = vp->v_object;
642	ASSERT(obj != NULL);
643
644	start = uio->uio_loffset;
645	off = start & PAGEOFFSET;
646	zfs_vmobject_wlock(obj);
647	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
648		vm_page_t pp;
649		uint64_t bytes = MIN(PAGESIZE - off, len);
650
651		if (pp = page_hold(vp, start)) {
652			struct sf_buf *sf;
653			caddr_t va;
654
655			zfs_vmobject_wunlock(obj);
656			va = zfs_map_page(pp, &sf);
657#ifdef illumos
658			error = uiomove(va + off, bytes, UIO_READ, uio);
659#else
660			error = vn_io_fault_uiomove(va + off, bytes, uio);
661#endif
662			zfs_unmap_page(sf);
663			zfs_vmobject_wlock(obj);
664			page_unhold(pp);
665		} else {
666			zfs_vmobject_wunlock(obj);
667			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
668			    uio, bytes);
669			zfs_vmobject_wlock(obj);
670		}
671		len -= bytes;
672		off = 0;
673		if (error)
674			break;
675	}
676	zfs_vmobject_wunlock(obj);
677	return (error);
678}
679
/* Maximum bytes read per transaction-free chunk in zfs_read(). */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Anti-virus quarantined files may not be read. */
	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	/* Clamp the request to end at EOF. */
	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	/*
	 * Zero-copy path: set up an xuio with one loaned ARC buffer
	 * per block covered by the request.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			/* A non-power-of-2 blocksize means a single block. */
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	/*
	 * Copy out in chunks of at most zfs_read_chunk_size, preferring
	 * mapped pages (mappedread) when the vnode has cached data.
	 */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}
831
832/*
833 * Write the bytes to a file.
834 *
835 *	IN:	vp	- vnode of file to be written to.
836 *		uio	- structure supplying write location, range info,
837 *			  and data buffer.
838 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
839 *			  set if in append mode.
840 *		cr	- credentials of caller.
841 *		ct	- caller context (NFS/CIFS fem monitor only)
842 *
843 *	OUT:	uio	- updated offset and range.
844 *
845 *	RETURN:	0 on success, error code on failure.
846 *
847 * Timestamps:
848 *	vp - ctime|mtime updated if byte count > 0
849 */
850
851/* ARGSUSED */
852static int
853zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
854{
855	znode_t		*zp = VTOZ(vp);
856	rlim64_t	limit = MAXOFFSET_T;
857	ssize_t		start_resid = uio->uio_resid;
858	ssize_t		tx_bytes;
859	uint64_t	end_size;
860	dmu_tx_t	*tx;
861	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
862	zilog_t		*zilog;
863	offset_t	woff;
864	ssize_t		n, nbytes;
865	rl_t		*rl;
866	int		max_blksz = zfsvfs->z_max_blksz;
867	int		error = 0;
868	arc_buf_t	*abuf;
869	iovec_t		*aiov = NULL;
870	xuio_t		*xuio = NULL;
871	int		i_iov = 0;
872	int		iovcnt = uio->uio_iovcnt;
873	iovec_t		*iovp = uio->uio_iov;
874	int		write_eof;
875	int		count = 0;
876	sa_bulk_attr_t	bulk[4];
877	uint64_t	mtime[2], ctime[2];
878
879	/*
880	 * Fasttrack empty write
881	 */
882	n = start_resid;
883	if (n == 0)
884		return (0);
885
886	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
887		limit = MAXOFFSET_T;
888
889	ZFS_ENTER(zfsvfs);
890	ZFS_VERIFY_ZP(zp);
891
892	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
893	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
894	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
895	    &zp->z_size, 8);
896	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
897	    &zp->z_pflags, 8);
898
899	/*
900	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
901	 * callers might not be able to detect properly that we are read-only,
902	 * so check it explicitly here.
903	 */
904	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
905		ZFS_EXIT(zfsvfs);
906		return (SET_ERROR(EROFS));
907	}
908
909	/*
910	 * If immutable or not appending then return EPERM
911	 */
912	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
913	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
914	    (uio->uio_loffset < zp->z_size))) {
915		ZFS_EXIT(zfsvfs);
916		return (SET_ERROR(EPERM));
917	}
918
919	zilog = zfsvfs->z_log;
920
921	/*
922	 * Validate file offset
923	 */
924	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
925	if (woff < 0) {
926		ZFS_EXIT(zfsvfs);
927		return (SET_ERROR(EINVAL));
928	}
929
930	/*
931	 * Check for mandatory locks before calling zfs_range_lock()
932	 * in order to prevent a deadlock with locks set via fcntl().
933	 */
934	if (MANDMODE((mode_t)zp->z_mode) &&
935	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
936		ZFS_EXIT(zfsvfs);
937		return (error);
938	}
939
940#ifdef illumos
941	/*
942	 * Pre-fault the pages to ensure slow (eg NFS) pages
943	 * don't hold up txg.
944	 * Skip this if uio contains loaned arc_buf.
945	 */
946	if ((uio->uio_extflg == UIO_XUIO) &&
947	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
948		xuio = (xuio_t *)uio;
949	else
950		uio_prefaultpages(MIN(n, max_blksz), uio);
951#endif
952
953	/*
954	 * If in append mode, set the io offset pointer to eof.
955	 */
956	if (ioflag & FAPPEND) {
957		/*
958		 * Obtain an appending range lock to guarantee file append
959		 * semantics.  We reset the write offset once we have the lock.
960		 */
961		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
962		woff = rl->r_off;
963		if (rl->r_len == UINT64_MAX) {
964			/*
965			 * We overlocked the file because this write will cause
966			 * the file block size to increase.
967			 * Note that zp_size cannot change with this lock held.
968			 */
969			woff = zp->z_size;
970		}
971		uio->uio_loffset = woff;
972	} else {
973		/*
974		 * Note that if the file block size will change as a result of
975		 * this write, then this range lock will lock the entire file
976		 * so that we can re-write the block safely.
977		 */
978		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
979	}
980
981	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
982		zfs_range_unlock(rl);
983		ZFS_EXIT(zfsvfs);
984		return (EFBIG);
985	}
986
987	if (woff >= limit) {
988		zfs_range_unlock(rl);
989		ZFS_EXIT(zfsvfs);
990		return (SET_ERROR(EFBIG));
991	}
992
993	if ((woff + n) > limit || woff > (limit - n))
994		n = limit - woff;
995
996	/* Will this write extend the file length? */
997	write_eof = (woff + n > zp->z_size);
998
999	end_size = MAX(zp->z_size, woff + n);
1000
1001	/*
1002	 * Write the file in reasonable size chunks.  Each chunk is written
1003	 * in a separate transaction; this keeps the intent log records small
1004	 * and allows us to do more fine-grained space accounting.
1005	 */
1006	while (n > 0) {
1007		abuf = NULL;
1008		woff = uio->uio_loffset;
1009		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1010		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1011			if (abuf != NULL)
1012				dmu_return_arcbuf(abuf);
1013			error = SET_ERROR(EDQUOT);
1014			break;
1015		}
1016
1017		if (xuio && abuf == NULL) {
1018			ASSERT(i_iov < iovcnt);
1019			aiov = &iovp[i_iov];
1020			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1021			dmu_xuio_clear(xuio, i_iov);
1022			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1023			    iovec_t *, aiov, arc_buf_t *, abuf);
1024			ASSERT((aiov->iov_base == abuf->b_data) ||
1025			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1026			    aiov->iov_len == arc_buf_size(abuf)));
1027			i_iov++;
1028		} else if (abuf == NULL && n >= max_blksz &&
1029		    woff >= zp->z_size &&
1030		    P2PHASE(woff, max_blksz) == 0 &&
1031		    zp->z_blksz == max_blksz) {
1032			/*
1033			 * This write covers a full block.  "Borrow" a buffer
1034			 * from the dmu so that we can fill it before we enter
1035			 * a transaction.  This avoids the possibility of
1036			 * holding up the transaction if the data copy hangs
1037			 * up on a pagefault (e.g., from an NFS server mapping).
1038			 */
1039			size_t cbytes;
1040
1041			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1042			    max_blksz);
1043			ASSERT(abuf != NULL);
1044			ASSERT(arc_buf_size(abuf) == max_blksz);
1045			if (error = uiocopy(abuf->b_data, max_blksz,
1046			    UIO_WRITE, uio, &cbytes)) {
1047				dmu_return_arcbuf(abuf);
1048				break;
1049			}
1050			ASSERT(cbytes == max_blksz);
1051		}
1052
1053		/*
1054		 * Start a transaction.
1055		 */
1056		tx = dmu_tx_create(zfsvfs->z_os);
1057		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1058		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1059		zfs_sa_upgrade_txholds(tx, zp);
1060		error = dmu_tx_assign(tx, TXG_WAIT);
1061		if (error) {
1062			dmu_tx_abort(tx);
1063			if (abuf != NULL)
1064				dmu_return_arcbuf(abuf);
1065			break;
1066		}
1067
1068		/*
1069		 * If zfs_range_lock() over-locked we grow the blocksize
1070		 * and then reduce the lock range.  This will only happen
1071		 * on the first iteration since zfs_range_reduce() will
1072		 * shrink down r_len to the appropriate size.
1073		 */
1074		if (rl->r_len == UINT64_MAX) {
1075			uint64_t new_blksz;
1076
1077			if (zp->z_blksz > max_blksz) {
1078				/*
1079				 * File's blocksize is already larger than the
1080				 * "recordsize" property.  Only let it grow to
1081				 * the next power of 2.
1082				 */
1083				ASSERT(!ISP2(zp->z_blksz));
1084				new_blksz = MIN(end_size,
1085				    1 << highbit64(zp->z_blksz));
1086			} else {
1087				new_blksz = MIN(end_size, max_blksz);
1088			}
1089			zfs_grow_blocksize(zp, new_blksz, tx);
1090			zfs_range_reduce(rl, woff, n);
1091		}
1092
1093		/*
1094		 * XXX - should we really limit each write to z_max_blksz?
1095		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1096		 */
1097		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1098
1099		if (woff + nbytes > zp->z_size)
1100			vnode_pager_setsize(vp, woff + nbytes);
1101
1102		if (abuf == NULL) {
1103			tx_bytes = uio->uio_resid;
1104			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1105			    uio, nbytes, tx);
1106			tx_bytes -= uio->uio_resid;
1107		} else {
1108			tx_bytes = nbytes;
1109			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1110			/*
1111			 * If this is not a full block write, but we are
1112			 * extending the file past EOF and this data starts
1113			 * block-aligned, use assign_arcbuf().  Otherwise,
1114			 * write via dmu_write().
1115			 */
1116			if (tx_bytes < max_blksz && (!write_eof ||
1117			    aiov->iov_base != abuf->b_data)) {
1118				ASSERT(xuio);
1119				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1120				    aiov->iov_len, aiov->iov_base, tx);
1121				dmu_return_arcbuf(abuf);
1122				xuio_stat_wbuf_copied();
1123			} else {
1124				ASSERT(xuio || tx_bytes == max_blksz);
1125				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1126				    woff, abuf, tx);
1127			}
1128			ASSERT(tx_bytes <= uio->uio_resid);
1129			uioskip(uio, tx_bytes);
1130		}
1131		if (tx_bytes && vn_has_cached_data(vp)) {
1132			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1133			    zp->z_id, uio->uio_segflg, tx);
1134		}
1135
1136		/*
1137		 * If we made no progress, we're done.  If we made even
1138		 * partial progress, update the znode and ZIL accordingly.
1139		 */
1140		if (tx_bytes == 0) {
1141			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1142			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1143			dmu_tx_commit(tx);
1144			ASSERT(error != 0);
1145			break;
1146		}
1147
1148		/*
1149		 * Clear Set-UID/Set-GID bits on successful write if not
1150		 * privileged and at least one of the excute bits is set.
1151		 *
1152		 * It would be nice to to this after all writes have
1153		 * been done, but that would still expose the ISUID/ISGID
1154		 * to another app after the partial write is committed.
1155		 *
1156		 * Note: we don't call zfs_fuid_map_id() here because
1157		 * user 0 is not an ephemeral uid.
1158		 */
1159		mutex_enter(&zp->z_acl_lock);
1160		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1161		    (S_IXUSR >> 6))) != 0 &&
1162		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1163		    secpolicy_vnode_setid_retain(vp, cr,
1164		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1165			uint64_t newmode;
1166			zp->z_mode &= ~(S_ISUID | S_ISGID);
1167			newmode = zp->z_mode;
1168			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1169			    (void *)&newmode, sizeof (uint64_t), tx);
1170		}
1171		mutex_exit(&zp->z_acl_lock);
1172
1173		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1174		    B_TRUE);
1175
1176		/*
1177		 * Update the file size (zp_size) if it has changed;
1178		 * account for possible concurrent updates.
1179		 */
1180		while ((end_size = zp->z_size) < uio->uio_loffset) {
1181			(void) atomic_cas_64(&zp->z_size, end_size,
1182			    uio->uio_loffset);
1183#ifdef illumos
1184			ASSERT(error == 0);
1185#else
1186			ASSERT(error == 0 || error == EFAULT);
1187#endif
1188		}
1189		/*
1190		 * If we are replaying and eof is non zero then force
1191		 * the file size to the specified eof. Note, there's no
1192		 * concurrency during replay.
1193		 */
1194		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1195			zp->z_size = zfsvfs->z_replay_eof;
1196
1197		if (error == 0)
1198			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1199		else
1200			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1201
1202		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1203		dmu_tx_commit(tx);
1204
1205		if (error != 0)
1206			break;
1207		ASSERT(tx_bytes == nbytes);
1208		n -= nbytes;
1209
1210#ifdef illumos
1211		if (!xuio && n > 0)
1212			uio_prefaultpages(MIN(n, max_blksz), uio);
1213#endif
1214	}
1215
1216	zfs_range_unlock(rl);
1217
1218	/*
1219	 * If we're in replay mode, or we made no progress, return error.
1220	 * Otherwise, it's at least a partial write, so it's successful.
1221	 */
1222	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1223		ZFS_EXIT(zfsvfs);
1224		return (error);
1225	}
1226
1227#ifdef __FreeBSD__
1228	/*
1229	 * EFAULT means that at least one page of the source buffer was not
1230	 * available.  VFS will re-try remaining I/O upon this error.
1231	 */
1232	if (error == EFAULT) {
1233		ZFS_EXIT(zfsvfs);
1234		return (error);
1235	}
1236#endif
1237
1238	if (ioflag & (FSYNC | FDSYNC) ||
1239	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1240		zil_commit(zilog, zp->z_id);
1241
1242	ZFS_EXIT(zfsvfs);
1243	return (0);
1244}
1245
1246void
1247zfs_get_done(zgd_t *zgd, int error)
1248{
1249	znode_t *zp = zgd->zgd_private;
1250	objset_t *os = zp->z_zfsvfs->z_os;
1251
1252	if (zgd->zgd_db)
1253		dmu_buf_rele(zgd->zgd_db, zgd);
1254
1255	zfs_range_unlock(zgd->zgd_rl);
1256
1257	/*
1258	 * Release the vnode asynchronously as we currently have the
1259	 * txg stopped from syncing.
1260	 */
1261	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1262
1263	if (error == 0 && zgd->zgd_bp)
1264		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1265
1266	kmem_free(zgd, sizeof (zgd_t));
1267}
1268
#ifdef DEBUG
/*
 * Fault-injection knob: when set non-zero, the next zfs_get_data()
 * indirect write fails with EIO and the flag self-clears (one-shot).
 */
static int zil_fault_io = 0;
#endif
1272
1273/*
1274 * Get data to generate a TX_WRITE intent log record.
1275 */
1276int
1277zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1278{
1279	zfsvfs_t *zfsvfs = arg;
1280	objset_t *os = zfsvfs->z_os;
1281	znode_t *zp;
1282	uint64_t object = lr->lr_foid;
1283	uint64_t offset = lr->lr_offset;
1284	uint64_t size = lr->lr_length;
1285	blkptr_t *bp = &lr->lr_blkptr;
1286	dmu_buf_t *db;
1287	zgd_t *zgd;
1288	int error = 0;
1289
1290	ASSERT(zio != NULL);
1291	ASSERT(size != 0);
1292
1293	/*
1294	 * Nothing to do if the file has been removed
1295	 */
1296	if (zfs_zget(zfsvfs, object, &zp) != 0)
1297		return (SET_ERROR(ENOENT));
1298	if (zp->z_unlinked) {
1299		/*
1300		 * Release the vnode asynchronously as we currently have the
1301		 * txg stopped from syncing.
1302		 */
1303		VN_RELE_ASYNC(ZTOV(zp),
1304		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1305		return (SET_ERROR(ENOENT));
1306	}
1307
1308	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1309	zgd->zgd_zilog = zfsvfs->z_log;
1310	zgd->zgd_private = zp;
1311
1312	/*
1313	 * Write records come in two flavors: immediate and indirect.
1314	 * For small writes it's cheaper to store the data with the
1315	 * log record (immediate); for large writes it's cheaper to
1316	 * sync the data and get a pointer to it (indirect) so that
1317	 * we don't have to write the data twice.
1318	 */
1319	if (buf != NULL) { /* immediate write */
1320		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1321		/* test for truncation needs to be done while range locked */
1322		if (offset >= zp->z_size) {
1323			error = SET_ERROR(ENOENT);
1324		} else {
1325			error = dmu_read(os, object, offset, size, buf,
1326			    DMU_READ_NO_PREFETCH);
1327		}
1328		ASSERT(error == 0 || error == ENOENT);
1329	} else { /* indirect write */
1330		/*
1331		 * Have to lock the whole block to ensure when it's
1332		 * written out and its checksum is being calculated
1333		 * that no one can change the data. We need to re-check
1334		 * blocksize after we get the lock in case it's changed!
1335		 */
1336		for (;;) {
1337			uint64_t blkoff;
1338			size = zp->z_blksz;
1339			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1340			offset -= blkoff;
1341			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1342			    RL_READER);
1343			if (zp->z_blksz == size)
1344				break;
1345			offset += blkoff;
1346			zfs_range_unlock(zgd->zgd_rl);
1347		}
1348		/* test for truncation needs to be done while range locked */
1349		if (lr->lr_offset >= zp->z_size)
1350			error = SET_ERROR(ENOENT);
1351#ifdef DEBUG
1352		if (zil_fault_io) {
1353			error = SET_ERROR(EIO);
1354			zil_fault_io = 0;
1355		}
1356#endif
1357		if (error == 0)
1358			error = dmu_buf_hold(os, object, offset, zgd, &db,
1359			    DMU_READ_NO_PREFETCH);
1360
1361		if (error == 0) {
1362			blkptr_t *obp = dmu_buf_get_blkptr(db);
1363			if (obp) {
1364				ASSERT(BP_IS_HOLE(bp));
1365				*bp = *obp;
1366			}
1367
1368			zgd->zgd_db = db;
1369			zgd->zgd_bp = bp;
1370
1371			ASSERT(db->db_offset == offset);
1372			ASSERT(db->db_size == size);
1373
1374			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1375			    zfs_get_done, zgd);
1376			ASSERT(error || lr->lr_length <= zp->z_blksz);
1377
1378			/*
1379			 * On success, we need to wait for the write I/O
1380			 * initiated by dmu_sync() to complete before we can
1381			 * release this dbuf.  We will finish everything up
1382			 * in the zfs_get_done() callback.
1383			 */
1384			if (error == 0)
1385				return (0);
1386
1387			if (error == EALREADY) {
1388				lr->lr_common.lrc_txtype = TX_WRITE2;
1389				error = 0;
1390			}
1391		}
1392	}
1393
1394	zfs_get_done(zgd, error);
1395
1396	return (error);
1397}
1398
1399/*ARGSUSED*/
1400static int
1401zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1402    caller_context_t *ct)
1403{
1404	znode_t *zp = VTOZ(vp);
1405	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1406	int error;
1407
1408	ZFS_ENTER(zfsvfs);
1409	ZFS_VERIFY_ZP(zp);
1410
1411	if (flag & V_ACE_MASK)
1412		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1413	else
1414		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1415
1416	ZFS_EXIT(zfsvfs);
1417	return (error);
1418}
1419
/*
 * vn_vget_ino_gen() callback used by zfs_lookup_lock() for the ".."
 * case: the target vnode is passed through "arg"; lock it with the
 * requested flags, dropping our reference on failure.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int err;

	*vpp = arg;
	err = vn_lock(*vpp, lkflags);
	if (err != 0)
		vrele(*vpp);
	return (err);
}
1431
/*
 * Lock the vnode "vp" produced by a directory lookup in "dvp" (which is
 * locked on entry), handling the three name cases that affect vnode
 * lock ordering:
 *  - "" or ".": vp is dvp itself; take an extra reference and convert
 *    the lock already held on dvp to the requested type if needed.
 *  - "..": vp is dvp's PARENT, so locking it directly would invert the
 *    normal parent-then-child order; defer to vn_vget_ino_gen().
 *  - any other name: an ordinary child, lock it directly.
 * On failure the reference on vp is dropped and an error is returned.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	/* Teardown lock must not be held here, except for xattr dirs. */
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		/* Ordinary child: normal lock order, lock it directly. */
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}
1496
1497/*
1498 * Lookup an entry in a directory, or an extended attribute directory.
1499 * If it exists, return a held vnode reference for it.
1500 *
1501 *	IN:	dvp	- vnode of directory to search.
1502 *		nm	- name of entry to lookup.
1503 *		pnp	- full pathname to lookup [UNUSED].
1504 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1505 *		rdir	- root directory vnode [UNUSED].
1506 *		cr	- credentials of caller.
1507 *		ct	- caller context
1508 *
1509 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1510 *
1511 *	RETURN:	0 on success, error code on failure.
1512 *
1513 * Timestamps:
1514 *	NA
1515 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	znode_t *zp;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;

	/* fast path (should be redundant with vfs namecache) */
	if (!(flags & LOOKUP_XATTR)) {
		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			/* Directory znode has been torn down (forced unmount). */
			return (SET_ERROR(EIO));
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
#endif

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			vrele(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Reject names that are not valid UTF-8 when the fs requires it. */
	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}


	/*
	 * First handle the special cases.
	 */
	if ((cnp->cn_flags & ISDOTDOT) != 0) {
		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
			struct componentname cn;
			vnode_t *zfsctl_vp;
			int ltype;

			/* Must drop dvp's lock before locking the ctldir. */
			ZFS_EXIT(zfsvfs);
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
			    &zfsctl_vp);
			if (error == 0) {
				/* Redirect the lookup to .zfs/snapshot. */
				cn.cn_nameptr = "snapshot";
				cn.cn_namelen = strlen(cn.cn_nameptr);
				cn.cn_nameiop = cnp->cn_nameiop;
				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
				cn.cn_lkflags = cnp->cn_lkflags;
				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
				vput(zfsctl_vp);
			}
			vn_lock(dvp, ltype | LK_RETRY);
			return (error);
		}
	}
	/* ".zfs" in the filesystem root resolves to the control directory. */
	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
		ZFS_EXIT(zfsvfs);
		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
			return (SET_ERROR(ENOTSUP));
		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
		return (error);
	}

	/*
	 * The loop is retry the lookup if the parent-child relationship
	 * changes during the dot-dot locking complexities.
	 */
	for (;;) {
		uint64_t parent;

		error = zfs_dirlook(zdp, nm, &zp);
		if (error == 0)
			*vpp = ZTOV(zp);

		ZFS_EXIT(zfsvfs);
		if (error != 0)
			break;

		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
		if (error != 0) {
			/*
			 * If we've got a locking error, then the vnode
			 * got reclaimed because of a force unmount.
			 * We never enter doomed vnodes into the name cache.
			 */
			*vpp = NULL;
			return (error);
		}

		/* Only the dot-dot case needs the relationship re-check. */
		if ((cnp->cn_flags & ISDOTDOT) == 0)
			break;

		ZFS_ENTER(zfsvfs);
		if (zdp->z_sa_hdl == NULL) {
			error = SET_ERROR(EIO);
		} else {
			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
			    &parent, sizeof (parent));
		}
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			vput(ZTOV(zp));
			break;
		}
		/* Still our parent?  Done.  Otherwise drop and retry. */
		if (zp->z_id == parent) {
			ZFS_EXIT(zfsvfs);
			break;
		}
		vput(ZTOV(zp));
	}

/*
 * NOTE(review): no goto targets this label; error paths reach it by
 * breaking out of the loop above.
 */
out:
	if (error != 0)
		*vpp = NULL;

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				/* EJUSTRETURN tells namei the name may be created. */
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}

	/* Insert name into cache (as non-existent) if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, NULL, cnp);

	/* Insert name into cache if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}

	return (error);
}
1720
1721/*
1722 * Attempt to create a new entry in a directory.  If the entry
1723 * already exists, truncate the file if permissible, else return
1724 * an error.  Return the vp of the created or trunc'd file.
1725 *
1726 *	IN:	dvp	- vnode of directory to put new file entry in.
1727 *		name	- name of new file entry.
1728 *		vap	- attributes of new file.
1729 *		excl	- flag indicating exclusive or non-exclusive mode.
1730 *		mode	- mode to open file with.
1731 *		cr	- credentials of caller.
1732 *		flag	- large file flag [UNUSED].
1733 *		ct	- caller context
1734 *		vsecp	- ACL to be set
1735 *
1736 *	OUT:	vpp	- vnode of created or trunc'd entry.
1737 *
1738 *	RETURN:	0 on success, error code on failure.
1739 *
1740 * Timestamps:
1741 *	dvp - ctime|mtime updated if new entry created
1742 *	 vp - ctime|mtime always, atime if new
1743 */
1744
/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t   acl_ids;
	boolean_t	fuid_dirtied;
	void		*vsecp = NULL;	/* no ACL passed through this entry point */
	int		flag = 0;
	uint64_t	txtype;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* Reject names that are not valid UTF-8 when the fs requires it. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	*vpp = NULL;

	/* Strip the sticky bit unless the caller is privileged to set it. */
	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	/* ZNEW: succeeds only if the name does NOT already exist. */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	/*
	 * Create a new file object and update the directory
	 * to reference it.
	 */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		goto out;
	}

	/*
	 * We only support the creation of regular files in
	 * extended attribute directories.
	 */

	if ((dzp->z_pflags & ZFS_XATTR) &&
	    (vap->va_type != VREG)) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap,
	    cr, vsecp, &acl_ids)) != 0)
		goto out;

	/* Quota check must happen before we commit to the create. */
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		error = SET_ERROR(EDQUOT);
		goto out;
	}

	/* Reserve a vnode before the tx; released again below. */
	getnewvnode_reserve(1);

	tx = dmu_tx_create(os);

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	/* Large ACLs spill out of the SA area and need a write hold. */
	if (!zfsvfs->z_use_sa &&
	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, acl_ids.z_aclp->z_acl_bytes);
	}
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	/* Create the znode, then link it under "name" in the directory. */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
	    vsecp, acl_ids.z_fuidp, vap);
	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

out:
	/* On success, zp was created above; hand its vnode to the caller. */
	if (error == 0) {
		*vpp = ZTOV(zp);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
1891
1892/*
1893 * Remove an entry from a directory.
1894 *
1895 *	IN:	dvp	- vnode of directory to remove entry from.
1896 *		name	- name of entry to remove.
1897 *		cr	- credentials of caller.
1898 *		ct	- caller context
1899 *		flags	- case flags
1900 *
1901 *	RETURN:	0 on success, error code on failure.
1902 *
1903 * Timestamps:
1904 *	dvp - ctime|mtime
1905 *	 vp - ctime (if nlink > 0)
1906 */
1907
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	znode_t		*xzp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	uint64_t	obj = 0;
	dmu_tx_t	*tx;
	boolean_t	unlinked, toobig = FALSE;	/* NOTE(review): toobig is never read here */
	uint64_t	txtype;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;
	/* NOTE(review): redundant; zp was already initialized to VTOZ(vp). */
	zp = VTOZ(vp);

	xattr_obj = 0;
	xzp = NULL;

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = SET_ERROR(EPERM);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this function; this only
	 * compiles because vnevent_remove() is presumably a no-op macro on
	 * FreeBSD -- verify before changing.
	 */
	vnevent_remove(vp, dvp, name, ct);

	obj = zp->z_id;

	/* are there any extended attributes? */
	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
	    &xattr_obj, sizeof (xattr_obj));
	if (error == 0 && xattr_obj) {
		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		ASSERT0(error);
	}

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);

	if (xzp) {
		/* Also hold the xattr directory; it goes away with the file. */
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
	}

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	/*
	 * Mark this transaction as typically resulting in a net free of space
	 */
	dmu_tx_mark_netfree(tx);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	if (unlinked) {
		/*
		 * Last link: defer actual deletion via the unlinked set;
		 * VV_NOSYNC avoids pointless writeback of a doomed file.
		 */
		zfs_unlinked_add(zp, tx);
		vp->v_vflag |= VV_NOSYNC;
	}

	txtype = TX_REMOVE;
	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);

	dmu_tx_commit(tx);
out:

	if (xzp)
		vrele(ZTOV(xzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
2019
2020/*
2021 * Create a new directory and insert it into dvp using the name
2022 * provided.  Return a pointer to the inserted directory.
2023 *
2024 *	IN:	dvp	- vnode of directory to add subdir to.
2025 *		dirname	- name of new directory.
2026 *		vap	- attributes of new directory.
2027 *		cr	- credentials of caller.
2028 *		ct	- caller context
2029 *		flags	- case flags
2030 *		vsecp	- ACL to be set
2031 *
2032 *	OUT:	vpp	- vnode of created directory.
2033 *
2034 *	RETURN:	0 on success, error code on failure.
2035 *
2036 * Timestamps:
2037 *	dvp - ctime|mtime updated
2038 *	 vp - ctime|mtime|atime updated
2039 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t   acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    ((vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Subdirectories are not allowed inside xattr directories. */
	if (dzp->z_pflags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* Reject names that are not valid UTF-8 when the fs requires it. */
	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
	    NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * First make sure the new directory doesn't exist.
	 *
	 * Existence is checked first to make sure we don't return
	 * EACCES instead of EEXIST which can cause some applications
	 * to fail.
	 */
	*vpp = NULL;

	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	ASSERT3P(zp, ==, NULL);

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Quota check must happen before we commit to the create. */
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * Add a new entry to the directory.
	 */
	/* Reserve a vnode before the tx; released again below. */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	/* Large ACLs spill out of the SA area and need a write hold. */
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}

	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}
2188
2189/*
2190 * Remove a directory subdir entry.  If the current working
2191 * directory is the same as the subdir to be removed, the
2192 * remove will fail.
2193 *
2194 *	IN:	dvp	- vnode of directory to remove from.
2195 *		name	- name of directory to be removed.
2196 *		cwd	- vnode of current working directory.
2197 *		cr	- credentials of caller.
2198 *		ct	- caller context
2199 *		flags	- case flags
2200 *
2201 *	RETURN:	0 on success, error code on failure.
2202 *
2203 * Timestamps:
2204 *	dvp - ctime|mtime updated
2205 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	ZFS_VERIFY_ZP(zp);
	zilog = zfsvfs->z_log;


	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/* rmdir only applies to directories. */
	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/*
	 * NOTE(review): 'ct' is not a parameter of this function; this only
	 * compiles because vnevent_rmdir() is presumably a no-op macro on
	 * FreeBSD -- verify before changing.
	 */
	vnevent_rmdir(vp, dvp, name, ct);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Invalidate namecache entries under the parent before unlinking. */
	cache_purge(dvp);

	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	cache_purge(vp);
out:
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
2267
2268/*
2269 * Read as many directory entries as will fit into the provided
2270 * buffer from the given directory cursor position (specified in
2271 * the uio structure).
2272 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected.
 *		ncookies - count of seek cookies produced (if requested).
 *		cookies	- allocated array of seek cookies, one per entry.
2282 *
2283 *	RETURN:	0 on success, error code on failure.
2284 *
2285 * Timestamps:
2286 *	vp - atime updated
2287 *
2288 * Note that the low 4 bits of the cookie returned by zap is always zero.
2289 * This allows us to use the low range for "special" directory entries:
2290 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2291 * we use the offset 2 for the '.zfs' directory.
2292 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	uint64_t	parent;
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;
	uint8_t		type;
	int		ncooks;
	u_long		*cooks = NULL;
	int		flags = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Fetch the parent object number; needed to synthesize "..". */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 * Offsets 0-3 are reserved for the synthetic ".", "..", and ".zfs"
	 * entries; any larger offset is a serialized ZAP cursor position.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * A single kernel-space iovec can be filled in place; anything else
	 * goes through a bounce buffer that is uiomove()d out at the end.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}
	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT from the cursor means end of dir. */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			/* The ZAP value packs object number and type. */
			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information
			 */
			znode_t	*ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				vrele(ZTOV(ezp));
				goto skip_entry;
			}
			vrele(ZTOV(ezp));
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
			odp->d_type = type;
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0, 0,
			    ZIO_PRIORITY_SYNC_READ);

	skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 * Synthetic entries (offset <= 2) just bump the offset;
		 * real entries advance and re-serialize the ZAP cursor.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Record the seek cookie for this entry, if requested. */
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		/* Filled the caller's buffer in place; just advance it. */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On failure, hand back no cookies (free any we allocated). */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}
2595
/*
 * Value stored in the zfs_fsyncer_key thread-specific-data slot on every
 * fsync(2) call; presumably consumed by the ZIL as a commit-batching hint
 * — TODO(review): confirm against the ZIL code.
 */
ulong_t zfs_fsync_sync_cnt = 4;
2597
2598static int
2599zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2600{
2601	znode_t	*zp = VTOZ(vp);
2602	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2603
2604	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2605
2606	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2607		ZFS_ENTER(zfsvfs);
2608		ZFS_VERIFY_ZP(zp);
2609		zil_commit(zfsvfs->z_log, zp->z_id);
2610		ZFS_EXIT(zfsvfs);
2611	}
2612	return (0);
2613}
2614
2615
2616/*
2617 * Get the requested file attributes and place them in the provided
2618 * vattr structure.
2619 *
2620 *	IN:	vp	- vnode of file.
2621 *		vap	- va_mask identifies requested attributes.
2622 *			  If AT_XVATTR set, then optional attrs are requested
2623 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2624 *		cr	- credentials of caller.
2625 *		ct	- caller context
2626 *
2627 *	OUT:	vap	- attribute values.
2628 *
2629 *	RETURN:	0 (always succeeds).
2630 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	uint64_t mtime[2], ctime[2], crtime[2], rdev;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	sa_bulk_attr_t bulk[4];
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/*
	 * Gather the timestamps (and device number, for special files)
	 * from the system-attribute layer in a single bulk lookup.
	 */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
		    &rdev, 8);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(zp->z_mode);
	vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
#else
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
#endif
	vap->va_nodeid = zp->z_id;
	/* The hidden .zfs directory counts as an extra link on the root. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = zp->z_links + 1;
	else
		links = zp->z_links;
	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
	vap->va_size = zp->z_size;
#ifdef illumos
	vap->va_rdev = vp->v_rdev;
#else
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		vap->va_rdev = zfs_cmpldev(rdev);
#endif
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_filerev = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 * Each optional attribute mirrors one ZFS_* flag in z_pflags.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			uint64_t times[2];

			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
			    times, sizeof (times));
			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}
	}

	/* Convert the on-disk timestamp encoding to timespec form. */
	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);


	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
2836
2837/*
2838 * Set the file attributes to the values contained in the
2839 * vattr structure.
2840 *
2841 *	IN:	vp	- vnode of file to be modified.
2842 *		vap	- new attribute values.
2843 *			  If AT_XVATTR set, then optional attrs are being set
2844 *		flags	- ATTR_UTIME set if non-default time values provided.
2845 *			- ATTR_NOACLCHECK (CIFS context only).
2846 *		cr	- credentials of caller.
2847 *		ct	- caller context
2848 *
2849 *	RETURN:	0 on success, error code on failure.
2850 *
2851 * Timestamps:
2852 *	vp - ctime updated, mtime updated if size changed.
2853 */
2854/* ARGSUSED */
2855static int
2856zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2857    caller_context_t *ct)
2858{
2859	znode_t		*zp = VTOZ(vp);
2860	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2861	zilog_t		*zilog;
2862	dmu_tx_t	*tx;
2863	vattr_t		oldva;
2864	xvattr_t	tmpxvattr;
2865	uint_t		mask = vap->va_mask;
2866	uint_t		saved_mask = 0;
2867	uint64_t	saved_mode;
2868	int		trim_mask = 0;
2869	uint64_t	new_mode;
2870	uint64_t	new_uid, new_gid;
2871	uint64_t	xattr_obj;
2872	uint64_t	mtime[2], ctime[2];
2873	znode_t		*attrzp;
2874	int		need_policy = FALSE;
2875	int		err, err2;
2876	zfs_fuid_info_t *fuidp = NULL;
2877	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2878	xoptattr_t	*xoap;
2879	zfs_acl_t	*aclp;
2880	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2881	boolean_t	fuid_dirtied = B_FALSE;
2882	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2883	int		count = 0, xattr_count = 0;
2884
2885	if (mask == 0)
2886		return (0);
2887
2888	if (mask & AT_NOSET)
2889		return (SET_ERROR(EINVAL));
2890
2891	ZFS_ENTER(zfsvfs);
2892	ZFS_VERIFY_ZP(zp);
2893
2894	zilog = zfsvfs->z_log;
2895
2896	/*
2897	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2898	 * that file system is at proper version level
2899	 */
2900
2901	if (zfsvfs->z_use_fuids == B_FALSE &&
2902	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2903	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2904	    (mask & AT_XVATTR))) {
2905		ZFS_EXIT(zfsvfs);
2906		return (SET_ERROR(EINVAL));
2907	}
2908
2909	if (mask & AT_SIZE && vp->v_type == VDIR) {
2910		ZFS_EXIT(zfsvfs);
2911		return (SET_ERROR(EISDIR));
2912	}
2913
2914	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2915		ZFS_EXIT(zfsvfs);
2916		return (SET_ERROR(EINVAL));
2917	}
2918
2919	/*
2920	 * If this is an xvattr_t, then get a pointer to the structure of
2921	 * optional attributes.  If this is NULL, then we have a vattr_t.
2922	 */
2923	xoap = xva_getxoptattr(xvap);
2924
2925	xva_init(&tmpxvattr);
2926
2927	/*
2928	 * Immutable files can only alter immutable bit and atime
2929	 */
2930	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2931	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2932	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2933		ZFS_EXIT(zfsvfs);
2934		return (SET_ERROR(EPERM));
2935	}
2936
2937	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2938		ZFS_EXIT(zfsvfs);
2939		return (SET_ERROR(EPERM));
2940	}
2941
2942	/*
2943	 * Verify timestamps doesn't overflow 32 bits.
2944	 * ZFS can handle large timestamps, but 32bit syscalls can't
2945	 * handle times greater than 2039.  This check should be removed
2946	 * once large timestamps are fully supported.
2947	 */
2948	if (mask & (AT_ATIME | AT_MTIME)) {
2949		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2950		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2951			ZFS_EXIT(zfsvfs);
2952			return (SET_ERROR(EOVERFLOW));
2953		}
2954	}
2955
2956	attrzp = NULL;
2957	aclp = NULL;
2958
2959	/* Can this be moved to before the top label? */
2960	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2961		ZFS_EXIT(zfsvfs);
2962		return (SET_ERROR(EROFS));
2963	}
2964
2965	/*
2966	 * First validate permissions
2967	 */
2968
2969	if (mask & AT_SIZE) {
2970		/*
2971		 * XXX - Note, we are not providing any open
2972		 * mode flags here (like FNDELAY), so we may
2973		 * block if there are locks present... this
2974		 * should be addressed in openat().
2975		 */
2976		/* XXX - would it be OK to generate a log record here? */
2977		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2978		if (err) {
2979			ZFS_EXIT(zfsvfs);
2980			return (err);
2981		}
2982	}
2983
2984	if (mask & (AT_ATIME|AT_MTIME) ||
2985	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2986	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2987	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2988	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2989	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2990	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2991	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2992		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2993		    skipaclchk, cr);
2994	}
2995
2996	if (mask & (AT_UID|AT_GID)) {
2997		int	idmask = (mask & (AT_UID|AT_GID));
2998		int	take_owner;
2999		int	take_group;
3000
3001		/*
3002		 * NOTE: even if a new mode is being set,
3003		 * we may clear S_ISUID/S_ISGID bits.
3004		 */
3005
3006		if (!(mask & AT_MODE))
3007			vap->va_mode = zp->z_mode;
3008
3009		/*
3010		 * Take ownership or chgrp to group we are a member of
3011		 */
3012
3013		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3014		take_group = (mask & AT_GID) &&
3015		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3016
3017		/*
3018		 * If both AT_UID and AT_GID are set then take_owner and
3019		 * take_group must both be set in order to allow taking
3020		 * ownership.
3021		 *
3022		 * Otherwise, send the check through secpolicy_vnode_setattr()
3023		 *
3024		 */
3025
3026		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3027		    ((idmask == AT_UID) && take_owner) ||
3028		    ((idmask == AT_GID) && take_group)) {
3029			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3030			    skipaclchk, cr) == 0) {
3031				/*
3032				 * Remove setuid/setgid for non-privileged users
3033				 */
3034				secpolicy_setid_clear(vap, vp, cr);
3035				trim_mask = (mask & (AT_UID|AT_GID));
3036			} else {
3037				need_policy =  TRUE;
3038			}
3039		} else {
3040			need_policy =  TRUE;
3041		}
3042	}
3043
3044	oldva.va_mode = zp->z_mode;
3045	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3046	if (mask & AT_XVATTR) {
3047		/*
3048		 * Update xvattr mask to include only those attributes
3049		 * that are actually changing.
3050		 *
3051		 * the bits will be restored prior to actually setting
3052		 * the attributes so the caller thinks they were set.
3053		 */
3054		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3055			if (xoap->xoa_appendonly !=
3056			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3057				need_policy = TRUE;
3058			} else {
3059				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3060				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3061			}
3062		}
3063
3064		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3065			if (xoap->xoa_nounlink !=
3066			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3067				need_policy = TRUE;
3068			} else {
3069				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3070				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3071			}
3072		}
3073
3074		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3075			if (xoap->xoa_immutable !=
3076			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3077				need_policy = TRUE;
3078			} else {
3079				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3080				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3081			}
3082		}
3083
3084		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3085			if (xoap->xoa_nodump !=
3086			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3087				need_policy = TRUE;
3088			} else {
3089				XVA_CLR_REQ(xvap, XAT_NODUMP);
3090				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3091			}
3092		}
3093
3094		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3095			if (xoap->xoa_av_modified !=
3096			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3097				need_policy = TRUE;
3098			} else {
3099				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3100				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3101			}
3102		}
3103
3104		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3105			if ((vp->v_type != VREG &&
3106			    xoap->xoa_av_quarantined) ||
3107			    xoap->xoa_av_quarantined !=
3108			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3109				need_policy = TRUE;
3110			} else {
3111				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3112				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3113			}
3114		}
3115
3116		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3117			ZFS_EXIT(zfsvfs);
3118			return (SET_ERROR(EPERM));
3119		}
3120
3121		if (need_policy == FALSE &&
3122		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3123		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3124			need_policy = TRUE;
3125		}
3126	}
3127
3128	if (mask & AT_MODE) {
3129		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3130			err = secpolicy_setid_setsticky_clear(vp, vap,
3131			    &oldva, cr);
3132			if (err) {
3133				ZFS_EXIT(zfsvfs);
3134				return (err);
3135			}
3136			trim_mask |= AT_MODE;
3137		} else {
3138			need_policy = TRUE;
3139		}
3140	}
3141
3142	if (need_policy) {
3143		/*
3144		 * If trim_mask is set then take ownership
3145		 * has been granted or write_acl is present and user
3146		 * has the ability to modify mode.  In that case remove
3147		 * UID|GID and or MODE from mask so that
3148		 * secpolicy_vnode_setattr() doesn't revoke it.
3149		 */
3150
3151		if (trim_mask) {
3152			saved_mask = vap->va_mask;
3153			vap->va_mask &= ~trim_mask;
3154			if (trim_mask & AT_MODE) {
3155				/*
3156				 * Save the mode, as secpolicy_vnode_setattr()
3157				 * will overwrite it with ova.va_mode.
3158				 */
3159				saved_mode = vap->va_mode;
3160			}
3161		}
3162		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3163		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3164		if (err) {
3165			ZFS_EXIT(zfsvfs);
3166			return (err);
3167		}
3168
3169		if (trim_mask) {
3170			vap->va_mask |= saved_mask;
3171			if (trim_mask & AT_MODE) {
3172				/*
3173				 * Recover the mode after
3174				 * secpolicy_vnode_setattr().
3175				 */
3176				vap->va_mode = saved_mode;
3177			}
3178		}
3179	}
3180
3181	/*
3182	 * secpolicy_vnode_setattr, or take ownership may have
3183	 * changed va_mask
3184	 */
3185	mask = vap->va_mask;
3186
3187	if ((mask & (AT_UID | AT_GID))) {
3188		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3189		    &xattr_obj, sizeof (xattr_obj));
3190
3191		if (err == 0 && xattr_obj) {
3192			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3193			if (err == 0) {
3194				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3195				if (err != 0)
3196					vrele(ZTOV(attrzp));
3197			}
3198			if (err)
3199				goto out2;
3200		}
3201		if (mask & AT_UID) {
3202			new_uid = zfs_fuid_create(zfsvfs,
3203			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3204			if (new_uid != zp->z_uid &&
3205			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3206				if (attrzp)
3207					vput(ZTOV(attrzp));
3208				err = SET_ERROR(EDQUOT);
3209				goto out2;
3210			}
3211		}
3212
3213		if (mask & AT_GID) {
3214			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3215			    cr, ZFS_GROUP, &fuidp);
3216			if (new_gid != zp->z_gid &&
3217			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3218				if (attrzp)
3219					vput(ZTOV(attrzp));
3220				err = SET_ERROR(EDQUOT);
3221				goto out2;
3222			}
3223		}
3224	}
3225	tx = dmu_tx_create(zfsvfs->z_os);
3226
3227	if (mask & AT_MODE) {
3228		uint64_t pmode = zp->z_mode;
3229		uint64_t acl_obj;
3230		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3231
3232		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3233		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3234			err = SET_ERROR(EPERM);
3235			goto out;
3236		}
3237
3238		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3239			goto out;
3240
3241		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3242			/*
3243			 * Are we upgrading ACL from old V0 format
3244			 * to V1 format?
3245			 */
3246			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3247			    zfs_znode_acl_version(zp) ==
3248			    ZFS_ACL_VERSION_INITIAL) {
3249				dmu_tx_hold_free(tx, acl_obj, 0,
3250				    DMU_OBJECT_END);
3251				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3252				    0, aclp->z_acl_bytes);
3253			} else {
3254				dmu_tx_hold_write(tx, acl_obj, 0,
3255				    aclp->z_acl_bytes);
3256			}
3257		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3258			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3259			    0, aclp->z_acl_bytes);
3260		}
3261		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3262	} else {
3263		if ((mask & AT_XVATTR) &&
3264		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3265			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3266		else
3267			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3268	}
3269
3270	if (attrzp) {
3271		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3272	}
3273
3274	fuid_dirtied = zfsvfs->z_fuid_dirty;
3275	if (fuid_dirtied)
3276		zfs_fuid_txhold(zfsvfs, tx);
3277
3278	zfs_sa_upgrade_txholds(tx, zp);
3279
3280	err = dmu_tx_assign(tx, TXG_WAIT);
3281	if (err)
3282		goto out;
3283
3284	count = 0;
3285	/*
3286	 * Set each attribute requested.
3287	 * We group settings according to the locks they need to acquire.
3288	 *
3289	 * Note: you cannot set ctime directly, although it will be
3290	 * updated as a side-effect of calling this function.
3291	 */
3292
3293	if (mask & (AT_UID|AT_GID|AT_MODE))
3294		mutex_enter(&zp->z_acl_lock);
3295
3296	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3297	    &zp->z_pflags, sizeof (zp->z_pflags));
3298
3299	if (attrzp) {
3300		if (mask & (AT_UID|AT_GID|AT_MODE))
3301			mutex_enter(&attrzp->z_acl_lock);
3302		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3303		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3304		    sizeof (attrzp->z_pflags));
3305	}
3306
3307	if (mask & (AT_UID|AT_GID)) {
3308
3309		if (mask & AT_UID) {
3310			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3311			    &new_uid, sizeof (new_uid));
3312			zp->z_uid = new_uid;
3313			if (attrzp) {
3314				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3315				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3316				    sizeof (new_uid));
3317				attrzp->z_uid = new_uid;
3318			}
3319		}
3320
3321		if (mask & AT_GID) {
3322			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3323			    NULL, &new_gid, sizeof (new_gid));
3324			zp->z_gid = new_gid;
3325			if (attrzp) {
3326				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3327				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3328				    sizeof (new_gid));
3329				attrzp->z_gid = new_gid;
3330			}
3331		}
3332		if (!(mask & AT_MODE)) {
3333			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3334			    NULL, &new_mode, sizeof (new_mode));
3335			new_mode = zp->z_mode;
3336		}
3337		err = zfs_acl_chown_setattr(zp);
3338		ASSERT(err == 0);
3339		if (attrzp) {
3340			err = zfs_acl_chown_setattr(attrzp);
3341			ASSERT(err == 0);
3342		}
3343	}
3344
3345	if (mask & AT_MODE) {
3346		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3347		    &new_mode, sizeof (new_mode));
3348		zp->z_mode = new_mode;
3349		ASSERT3U((uintptr_t)aclp, !=, 0);
3350		err = zfs_aclset_common(zp, aclp, cr, tx);
3351		ASSERT0(err);
3352		if (zp->z_acl_cached)
3353			zfs_acl_free(zp->z_acl_cached);
3354		zp->z_acl_cached = aclp;
3355		aclp = NULL;
3356	}
3357
3358
3359	if (mask & AT_ATIME) {
3360		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3361		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3362		    &zp->z_atime, sizeof (zp->z_atime));
3363	}
3364
3365	if (mask & AT_MTIME) {
3366		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3367		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3368		    mtime, sizeof (mtime));
3369	}
3370
3371	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3372	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3373		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3374		    NULL, mtime, sizeof (mtime));
3375		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3376		    &ctime, sizeof (ctime));
3377		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3378		    B_TRUE);
3379	} else if (mask != 0) {
3380		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3381		    &ctime, sizeof (ctime));
3382		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3383		    B_TRUE);
3384		if (attrzp) {
3385			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3386			    SA_ZPL_CTIME(zfsvfs), NULL,
3387			    &ctime, sizeof (ctime));
3388			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3389			    mtime, ctime, B_TRUE);
3390		}
3391	}
3392	/*
3393	 * Do this after setting timestamps to prevent timestamp
3394	 * update from toggling bit
3395	 */
3396
3397	if (xoap && (mask & AT_XVATTR)) {
3398
3399		/*
3400		 * restore trimmed off masks
3401		 * so that return masks can be set for caller.
3402		 */
3403
3404		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3405			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3406		}
3407		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3408			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3409		}
3410		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3411			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3412		}
3413		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3414			XVA_SET_REQ(xvap, XAT_NODUMP);
3415		}
3416		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3417			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3418		}
3419		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3420			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3421		}
3422
3423		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3424			ASSERT(vp->v_type == VREG);
3425
3426		zfs_xvattr_set(zp, xvap, tx);
3427	}
3428
3429	if (fuid_dirtied)
3430		zfs_fuid_sync(zfsvfs, tx);
3431
3432	if (mask != 0)
3433		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3434
3435	if (mask & (AT_UID|AT_GID|AT_MODE))
3436		mutex_exit(&zp->z_acl_lock);
3437
3438	if (attrzp) {
3439		if (mask & (AT_UID|AT_GID|AT_MODE))
3440			mutex_exit(&attrzp->z_acl_lock);
3441	}
3442out:
3443	if (err == 0 && attrzp) {
3444		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3445		    xattr_count, tx);
3446		ASSERT(err2 == 0);
3447	}
3448
3449	if (attrzp)
3450		vput(ZTOV(attrzp));
3451
3452	if (aclp)
3453		zfs_acl_free(aclp);
3454
3455	if (fuidp) {
3456		zfs_fuid_info_free(fuidp);
3457		fuidp = NULL;
3458	}
3459
3460	if (err) {
3461		dmu_tx_abort(tx);
3462	} else {
3463		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3464		dmu_tx_commit(tx);
3465	}
3466
3467out2:
3468	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3469		zil_commit(zilog, 0);
3470
3471	ZFS_EXIT(zfsvfs);
3472	return (err);
3473}
3474
3475/*
3476 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3477 * fail to acquire any lock in the path we will drop all held locks,
3478 * acquire the new lock in a blocking fashion, and then release it and
3479 * restart the rename.  This acquire/release step ensures that we do not
3480 * spin on a lock waiting for release.  On error release all vnode locks
3481 * and decrement references the way tmpfs_rename() would do.
3482 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/*
	 * Start every pass from a common all-unlocked state: drop tdvp and
	 * the target vnode (if any and distinct).  sdvp is assumed unlocked
	 * on entry — the blocking vn_lock() below acquires it first.
	 */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	/* The source directory lock is the only one taken blocking here. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	/*
	 * Try the target directory without blocking.  On EBUSY, drop sdvp,
	 * wait for tdvp with a blocking acquisition, immediately release it
	 * (so we never spin on a held lock), and restart the whole sequence.
	 */
	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		/* "." and ".." sources report EINVAL rather than ENOENT. */
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		/* Wait for the contended lock, release it, and restart. */
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		/* Remember the freshly resolved source vnode for the retry. */
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	vrele(*svpp);
	*svpp = nvp;

	/* Replace the caller's stale target vnode with the re-resolved one. */
	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp, 0);
			VOP_UNLOCK(tdvp, 0);
			VOP_UNLOCK(*svpp, 0);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			/* vput: drop both the lock and the reference. */
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	/* Success: sdvp, tdvp, *svpp and *tvpp (if any) are all locked. */
	return (0);

out:
	return (error);
}
3652
3653/*
3654 * Note that we must use VRELE_ASYNC in this function as it walks
3655 * up the directory tree and vrele may need to acquire an exclusive
3656 * lock if a last reference to a vnode is dropped.
3657 */
static int
zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*zp, *zp1;
	uint64_t	parent;
	int		error;

	zfsvfs = tdzp->z_zfsvfs;
	/* Renaming a directory on top of itself is never valid. */
	if (tdzp == szp)
		return (SET_ERROR(EINVAL));
	/* Same parent directory, or target parent is the root: OK. */
	if (tdzp == sdzp)
		return (0);
	if (tdzp->z_id == zfsvfs->z_root)
		return (0);
	/*
	 * Walk up from tdzp through the SA parent attribute.  If we reach
	 * szp, the source would become an ancestor of itself (e.g. moving
	 * /usr/a/b to /usr/a/b/c/d), so fail with EINVAL.  Stop at the
	 * filesystem root or at the source directory.
	 */
	zp = tdzp;
	for (;;) {
		ASSERT(!zp->z_unlinked);
		if ((error = sa_lookup(zp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			break;

		if (parent == szp->z_id) {
			error = SET_ERROR(EINVAL);
			break;
		}
		if (parent == zfsvfs->z_root)
			break;
		if (parent == sdzp->z_id)
			break;

		error = zfs_zget(zfsvfs, parent, &zp1);
		if (error != 0)
			break;

		/* Drop our hold on the previous node; tdzp is caller-owned. */
		if (zp != tdzp)
			VN_RELE_ASYNC(ZTOV(zp),
			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
		zp = zp1;
	}

	if (error == ENOTDIR)
		panic("checkpath: .. not a directory\n");
	/* Release the last node we obtained via zfs_zget(), if any. */
	if (zp != tdzp)
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
	return (error);
}
3706
3707/*
3708 * Move an entry from the provided source directory to the target
3709 * directory.  Change the entry name as indicated.
3710 *
3711 *	IN:	sdvp	- Source directory containing the "old entry".
3712 *		snm	- Old entry name.
3713 *		tdvp	- Target directory to contain the "new entry".
3714 *		tnm	- New entry name.
3715 *		cr	- credentials of caller.
3716 *		ct	- caller context
3717 *		flags	- case flags
3718 *
3719 *	RETURN:	0 on success, error code on failure.
3720 *
3721 * Timestamps:
3722 *	sdvp,tdvp - ctime|mtime updated
3723 */
3724/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	zilog_t		*zilog = NULL;
	dmu_tx_t	*tx;
	char		*snm = scnp->cn_nameptr;
	char		*tnm = tcnp->cn_nameptr;
	int		error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/* The .zfs control directory is not a legal rename target. */
	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	 * After we re-enter ZFS_ENTER() we will have to revalidate all
	 * znodes involved.
	 */
	ZFS_ENTER(zfsvfs);

	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto unlockout;
	}

	/* Refuse to move a mount point or rename over one. */
	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto unlockout;
	}

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/* Revalidate the source and target znodes as well. */
	szp = VTOZ(*svpp);
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto unlockout;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto unlockout;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			error = EINVAL;
			goto unlockout;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_check(szp, sdzp, tdzp))
			goto unlockout;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto unlockout;
			} else {
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto unlockout;
			}
		}
	}

	/*
	 * NOTE(review): `ct` is not a parameter of this function; these
	 * vnevent_*() calls presumably expand to no-op macros in the
	 * FreeBSD compat headers so the name is never evaluated — confirm.
	 */
	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Build the transaction: SA holds for every affected znode, ZAP
	 * holds for the source (remove) and target (add) directories, and
	 * the unlinked set in case the overwritten target is destroyed.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto unlockout;
	}


	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
		/* Flush name-cache entries that mention the moved vnodes. */
		if (error == 0) {
			cache_purge(*svpp);
			if (*tvpp != NULL)
				cache_purge(*tvpp);
			cache_purge_negative(tdvp);
		}
	}

	dmu_tx_commit(tx);

unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

out:				/* original two vnodes are locked */
	/* zilog is NULL only on the early EXDEV paths, where error != 0. */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp, 0);
	return (error);
}
3966
3967/*
3968 * Insert the indicated symbolic reference entry into the directory.
3969 *
3970 *	IN:	dvp	- Directory to contain new symbolic link.
3971 *		link	- Name for new symlink entry.
3972 *		vap	- Attributes of new entry.
3973 *		cr	- credentials of caller.
3974 *		ct	- caller context
3975 *		flags	- case flags
3976 *
3977 *	RETURN:	0 on success, error code on failure.
3978 *
3979 * Timestamps:
3980 *	dvp - ctime|mtime updated
3981 */
3982/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	int		flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Reject names that are not valid UTF-8 when the fs enforces it. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENAMETOOLONG));
	}

	/* Build the ACL/owner ids the new object will be created with. */
	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/* Pre-reserve a vnode so allocation cannot fail inside the tx. */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datasets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Store the link target either as an SA attribute or in the dnode. */
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);

	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 * The entry was verified absent above (ZNEW lookup), so the
	 * return value is intentionally ignored here.
	 */
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);

	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	*vpp = ZTOV(zp);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
4104
4105/*
4106 * Return, in the buffer contained in the provided uio structure,
4107 * the symbolic path referred to by vp.
4108 *
4109 *	IN:	vp	- vnode of symbolic link.
4110 *		uio	- structure to contain the link path.
4111 *		cr	- credentials of caller.
4112 *		ct	- caller context
4113 *
4114 *	OUT:	uio	- structure containing the link path.
4115 *
4116 *	RETURN:	0 on success, error code on failure.
4117 *
4118 * Timestamps:
4119 *	vp - atime updated
4120 */
4121/* ARGSUSED */
4122static int
4123zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4124{
4125	znode_t		*zp = VTOZ(vp);
4126	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4127	int		error;
4128
4129	ZFS_ENTER(zfsvfs);
4130	ZFS_VERIFY_ZP(zp);
4131
4132	if (zp->z_is_sa)
4133		error = sa_lookup_uio(zp->z_sa_hdl,
4134		    SA_ZPL_SYMLINK(zfsvfs), uio);
4135	else
4136		error = zfs_sa_readlink(zp, uio);
4137
4138	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4139
4140	ZFS_EXIT(zfsvfs);
4141	return (error);
4142}
4143
4144/*
4145 * Insert a new entry into directory tdvp referencing svp.
4146 *
4147 *	IN:	tdvp	- Directory to contain new entry.
4148 *		svp	- vnode of new entry.
4149 *		name	- name of new entry.
4150 *		cr	- credentials of caller.
4151 *		ct	- caller context
4152 *
4153 *	RETURN:	0 on success, error code on failure.
4154 *
4155 * Timestamps:
4156 *	tdvp - ctime|mtime updated
4157 *	 svp - ctime updated
4158 */
4159/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;
	uint64_t	parent;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	/* Files marked append-only, immutable or read-only cannot gain links. */
	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Reject names that are not valid UTF-8 when the fs enforces it. */
	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}


	/* Only the owner (or a caller with link privilege) may add a link. */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Transaction: source SA (link count) plus the directory ZAP entry. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, dzp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dzp, name, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
4277
4278
4279/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	/*
	 * Take z_teardown_inactive_lock as reader so the znode cannot be
	 * torn down (unmount / rollback) while we examine and update it.
	 */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	if (zp->z_unlinked) {
		/*
		 * Fast path to recycle a vnode of a removed file.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	/*
	 * Push a pending atime update to disk in its own transaction.
	 * NOTE(review): the z_unlinked == 0 test is redundant here — the
	 * z_unlinked case already returned above.
	 */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			dmu_tx_commit(tx);
		}
	}
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
4324
4325
/* Both ZFS file-handle forms must fit inside a generic struct fid. */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4328
4329/*ARGSUSED*/
/*
 * Build an NFS-style file identifier for vp: object number and generation
 * (short form), plus the objset id for non-parent filesystems (long form).
 */
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	gen64;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i, error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	gen = (uint32_t)gen64;

	/* Child filesystems (snapshots etc.) need the long, objset-qualified form. */
	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;

#ifdef illumos
	if (fidp->fid_len < size) {
		fidp->fid_len = size;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}
#else
	fidp->fid_len = size;
#endif

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Pack the object number little-endian, one byte at a time. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
4394
/*
 * Report filesystem limits and capabilities for pathconf(2)-style queries.
 * Illumos-only queries are compiled out on FreeBSD; note that zp, xzp,
 * zfsvfs and error are only used inside the illumos section.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp, *xzp;
	zfsvfs_t	*zfsvfs;
	int		error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);
#ifdef illumos
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		/* Probe for a non-empty extended-attribute directory. */
		error = zfs_dirent_lookup(zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED);
		if (error == 0) {
			if (!zfs_dirempty(xzp))
				*valp = 1;
			vrele(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACCESS_FILTERING:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
		    vp->v_type == VDIR;
		return (0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACE_ENABLED;
		return (0);
#endif	/* illumos */
	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);
#ifdef illumos
	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		return (0);
#endif
	case _PC_ACL_EXTENDED:
		/* POSIX.1e ACLs are not supported; NFSv4 ACLs are (below). */
		*valp = 0;
		return (0);

	case _PC_ACL_NFS4:
		*valp = 1;
		return (0);

	case _PC_ACL_PATH_MAX:
		*valp = ACL_MAX_ENTRIES;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}
4474
4475/*ARGSUSED*/
4476static int
4477zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4478    caller_context_t *ct)
4479{
4480	znode_t *zp = VTOZ(vp);
4481	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4482	int error;
4483	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4484
4485	ZFS_ENTER(zfsvfs);
4486	ZFS_VERIFY_ZP(zp);
4487	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4488	ZFS_EXIT(zfsvfs);
4489
4490	return (error);
4491}
4492
4493/*ARGSUSED*/
4494int
4495zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4496    caller_context_t *ct)
4497{
4498	znode_t *zp = VTOZ(vp);
4499	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4500	int error;
4501	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4502	zilog_t	*zilog = zfsvfs->z_log;
4503
4504	ZFS_ENTER(zfsvfs);
4505	ZFS_VERIFY_ZP(zp);
4506
4507	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4508
4509	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4510		zil_commit(zilog, 0);
4511
4512	ZFS_EXIT(zfsvfs);
4513	return (error);
4514}
4515
4516static int
4517ioflags(int ioflags)
4518{
4519	int flags = 0;
4520
4521	if (ioflags & IO_APPEND)
4522		flags |= FAPPEND;
4523	if (ioflags & IO_NDELAY)
4524		flags |= FNONBLOCK;
4525	if (ioflags & IO_SYNC)
4526		flags |= (FSYNC | FDSYNC | FRSYNC);
4527
4528	return (flags);
4529}
4530
/*
 * Page-in for the VM pager: read file data from the DMU into the pages
 * supplied in 'm'.  'reqpage' is the index (within 'm') of the page the
 * caller actually needs; the surrounding pages are read-ahead candidates
 * that may be freed if they fall outside the block-aligned window.
 * Returns one of the zfs_vm_pagerret_* codes.
 */
static int
zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_page_t mfirst, mlast, mreq;
	vm_object_t object;
	caddr_t va;
	struct sf_buf *sf;
	off_t startoff, endoff;
	int i, error;
	vm_pindex_t reqstart, reqend;
	int pcount, lsize, reqsize, size;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	pcount = OFF_TO_IDX(round_page(count));
	mreq = m[reqpage];
	object = mreq->object;
	error = 0;

	KASSERT(vp->v_object == object, ("mismatching object"));

	/*
	 * When the file block size exceeds the page size, widen the
	 * request to the portion of the enclosing block that is actually
	 * covered by the supplied page run; otherwise read just the
	 * requested page.  [reqstart, reqstart + reqsize) indexes into m[].
	 */
	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
		reqstart = OFF_TO_IDX(round_page(startoff));
		if (reqstart < m[0]->pindex)
			reqstart = 0;
		else
			reqstart = reqstart - m[0]->pindex;
		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
		    zp->z_blksz);
		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
		if (reqend > m[pcount - 1]->pindex)
			reqend = m[pcount - 1]->pindex;
		reqsize = reqend - m[reqstart]->pindex + 1;
		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
	} else {
		reqstart = reqpage;
		reqsize = 1;
	}
	mfirst = m[reqstart];
	mlast = m[reqstart + reqsize - 1];

	zfs_vmobject_wlock(object);

	/* Free the pages that fall outside the window we will read. */
	for (i = 0; i < reqstart; i++) {
		vm_page_lock(m[i]);
		vm_page_free(m[i]);
		vm_page_unlock(m[i]);
	}
	for (i = reqstart + reqsize; i < pcount; i++) {
		vm_page_lock(m[i]);
		vm_page_free(m[i]);
		vm_page_unlock(m[i]);
	}

	/* Requested page already (partially) valid: no DMU read needed. */
	if (mreq->valid && reqsize == 1) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		zfs_vmobject_wunlock(object);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_ok);
	}

	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, reqsize);

	/* Requested page lies entirely beyond EOF: fail the request. */
	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
		for (i = reqstart; i < reqstart + reqsize; i++) {
			if (i != reqpage) {
				vm_page_lock(m[i]);
				vm_page_free(m[i]);
				vm_page_unlock(m[i]);
			}
		}
		zfs_vmobject_wunlock(object);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_bad);
	}

	/* The last page may extend past EOF; read only the valid part. */
	lsize = PAGE_SIZE;
	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);

	zfs_vmobject_wunlock(object);

	/*
	 * Copy each page's worth of data from the DMU, zero-filling the
	 * tail of a short final page.  Stop on the first read error.
	 */
	for (i = reqstart; i < reqstart + reqsize; i++) {
		size = PAGE_SIZE;
		if (i == (reqstart + reqsize - 1))
			size = lsize;
		va = zfs_map_page(m[i], &sf);
		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
		    size, va, DMU_READ_PREFETCH);
		if (size != PAGE_SIZE)
			bzero(va + size, PAGE_SIZE - size);
		zfs_unmap_page(sf);
		if (error != 0)
			break;
	}

	zfs_vmobject_wlock(object);

	/* Mark pages valid on success; release the read-ahead pages. */
	for (i = reqstart; i < reqstart + reqsize; i++) {
		if (!error)
			m[i]->valid = VM_PAGE_BITS_ALL;
		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
		if (i != reqpage)
			vm_page_readahead_finish(m[i]);
	}

	zfs_vmobject_wunlock(object);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
}
4651
4652static int
4653zfs_freebsd_getpages(ap)
4654	struct vop_getpages_args /* {
4655		struct vnode *a_vp;
4656		vm_page_t *a_m;
4657		int a_count;
4658		int a_reqpage;
4659		vm_ooffset_t a_offset;
4660	} */ *ap;
4661{
4662
4663	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
4664}
4665
4666static int
4667zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4668    int *rtvals)
4669{
4670	znode_t		*zp = VTOZ(vp);
4671	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4672	rl_t		*rl;
4673	dmu_tx_t	*tx;
4674	struct sf_buf	*sf;
4675	vm_object_t	object;
4676	vm_page_t	m;
4677	caddr_t		va;
4678	size_t		tocopy;
4679	size_t		lo_len;
4680	vm_ooffset_t	lo_off;
4681	vm_ooffset_t	off;
4682	uint_t		blksz;
4683	int		ncount;
4684	int		pcount;
4685	int		err;
4686	int		i;
4687
4688	ZFS_ENTER(zfsvfs);
4689	ZFS_VERIFY_ZP(zp);
4690
4691	object = vp->v_object;
4692	pcount = btoc(len);
4693	ncount = pcount;
4694
4695	KASSERT(ma[0]->object == object, ("mismatching object"));
4696	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4697
4698	for (i = 0; i < pcount; i++)
4699		rtvals[i] = zfs_vm_pagerret_error;
4700
4701	off = IDX_TO_OFF(ma[0]->pindex);
4702	blksz = zp->z_blksz;
4703	lo_off = rounddown(off, blksz);
4704	lo_len = roundup(len + (off - lo_off), blksz);
4705	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
4706
4707	zfs_vmobject_wlock(object);
4708	if (len + off > object->un_pager.vnp.vnp_size) {
4709		if (object->un_pager.vnp.vnp_size > off) {
4710			int pgoff;
4711
4712			len = object->un_pager.vnp.vnp_size - off;
4713			ncount = btoc(len);
4714			if ((pgoff = (int)len & PAGE_MASK) != 0) {
4715				/*
4716				 * If the object is locked and the following
4717				 * conditions hold, then the page's dirty
4718				 * field cannot be concurrently changed by a
4719				 * pmap operation.
4720				 */
4721				m = ma[ncount - 1];
4722				vm_page_assert_sbusied(m);
4723				KASSERT(!pmap_page_is_write_mapped(m),
4724				    ("zfs_putpages: page %p is not read-only", m));
4725				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4726				    pgoff);
4727			}
4728		} else {
4729			len = 0;
4730			ncount = 0;
4731		}
4732		if (ncount < pcount) {
4733			for (i = ncount; i < pcount; i++) {
4734				rtvals[i] = zfs_vm_pagerret_bad;
4735			}
4736		}
4737	}
4738	zfs_vmobject_wunlock(object);
4739
4740	if (ncount == 0)
4741		goto out;
4742
4743	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4744	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4745		goto out;
4746	}
4747
4748	tx = dmu_tx_create(zfsvfs->z_os);
4749	dmu_tx_hold_write(tx, zp->z_id, off, len);
4750
4751	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4752	zfs_sa_upgrade_txholds(tx, zp);
4753	err = dmu_tx_assign(tx, TXG_WAIT);
4754	if (err != 0) {
4755		dmu_tx_abort(tx);
4756		goto out;
4757	}
4758
4759	if (zp->z_blksz < PAGE_SIZE) {
4760		i = 0;
4761		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4762			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4763			va = zfs_map_page(ma[i], &sf);
4764			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4765			zfs_unmap_page(sf);
4766		}
4767	} else {
4768		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4769	}
4770
4771	if (err == 0) {
4772		uint64_t mtime[2], ctime[2];
4773		sa_bulk_attr_t bulk[3];
4774		int count = 0;
4775
4776		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4777		    &mtime, 16);
4778		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4779		    &ctime, 16);
4780		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4781		    &zp->z_pflags, 8);
4782		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4783		    B_TRUE);
4784		(void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4785		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4786
4787		zfs_vmobject_wlock(object);
4788		for (i = 0; i < ncount; i++) {
4789			rtvals[i] = zfs_vm_pagerret_ok;
4790			vm_page_undirty(ma[i]);
4791		}
4792		zfs_vmobject_wunlock(object);
4793		PCPU_INC(cnt.v_vnodeout);
4794		PCPU_ADD(cnt.v_vnodepgsout, ncount);
4795	}
4796	dmu_tx_commit(tx);
4797
4798out:
4799	zfs_range_unlock(rl);
4800	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
4801	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4802		zil_commit(zfsvfs->z_log, zp->z_id);
4803	ZFS_EXIT(zfsvfs);
4804	return (rtvals[0]);
4805}
4806
4807int
4808zfs_freebsd_putpages(ap)
4809	struct vop_putpages_args /* {
4810		struct vnode *a_vp;
4811		vm_page_t *a_m;
4812		int a_count;
4813		int a_sync;
4814		int *a_rtvals;
4815		vm_ooffset_t a_offset;
4816	} */ *ap;
4817{
4818
4819	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4820	    ap->a_rtvals));
4821}
4822
4823static int
4824zfs_freebsd_bmap(ap)
4825	struct vop_bmap_args /* {
4826		struct vnode *a_vp;
4827		daddr_t  a_bn;
4828		struct bufobj **a_bop;
4829		daddr_t *a_bnp;
4830		int *a_runp;
4831		int *a_runb;
4832	} */ *ap;
4833{
4834
4835	if (ap->a_bop != NULL)
4836		*ap->a_bop = &ap->a_vp->v_bufobj;
4837	if (ap->a_bnp != NULL)
4838		*ap->a_bnp = ap->a_bn;
4839	if (ap->a_runp != NULL)
4840		*ap->a_runp = 0;
4841	if (ap->a_runb != NULL)
4842		*ap->a_runb = 0;
4843
4844	return (0);
4845}
4846
4847static int
4848zfs_freebsd_open(ap)
4849	struct vop_open_args /* {
4850		struct vnode *a_vp;
4851		int a_mode;
4852		struct ucred *a_cred;
4853		struct thread *a_td;
4854	} */ *ap;
4855{
4856	vnode_t	*vp = ap->a_vp;
4857	znode_t *zp = VTOZ(vp);
4858	int error;
4859
4860	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4861	if (error == 0)
4862		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4863	return (error);
4864}
4865
4866static int
4867zfs_freebsd_close(ap)
4868	struct vop_close_args /* {
4869		struct vnode *a_vp;
4870		int  a_fflag;
4871		struct ucred *a_cred;
4872		struct thread *a_td;
4873	} */ *ap;
4874{
4875
4876	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4877}
4878
4879static int
4880zfs_freebsd_ioctl(ap)
4881	struct vop_ioctl_args /* {
4882		struct vnode *a_vp;
4883		u_long a_command;
4884		caddr_t a_data;
4885		int a_fflag;
4886		struct ucred *cred;
4887		struct thread *td;
4888	} */ *ap;
4889{
4890
4891	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4892	    ap->a_fflag, ap->a_cred, NULL, NULL));
4893}
4894
4895static int
4896zfs_freebsd_read(ap)
4897	struct vop_read_args /* {
4898		struct vnode *a_vp;
4899		struct uio *a_uio;
4900		int a_ioflag;
4901		struct ucred *a_cred;
4902	} */ *ap;
4903{
4904
4905	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4906	    ap->a_cred, NULL));
4907}
4908
4909static int
4910zfs_freebsd_write(ap)
4911	struct vop_write_args /* {
4912		struct vnode *a_vp;
4913		struct uio *a_uio;
4914		int a_ioflag;
4915		struct ucred *a_cred;
4916	} */ *ap;
4917{
4918
4919	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4920	    ap->a_cred, NULL));
4921}
4922
4923static int
4924zfs_freebsd_access(ap)
4925	struct vop_access_args /* {
4926		struct vnode *a_vp;
4927		accmode_t a_accmode;
4928		struct ucred *a_cred;
4929		struct thread *a_td;
4930	} */ *ap;
4931{
4932	vnode_t *vp = ap->a_vp;
4933	znode_t *zp = VTOZ(vp);
4934	accmode_t accmode;
4935	int error = 0;
4936
4937	/*
4938	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4939	 */
4940	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4941	if (accmode != 0)
4942		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4943
4944	/*
4945	 * VADMIN has to be handled by vaccess().
4946	 */
4947	if (error == 0) {
4948		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4949		if (accmode != 0) {
4950			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4951			    zp->z_gid, accmode, ap->a_cred, NULL);
4952		}
4953	}
4954
4955	/*
4956	 * For VEXEC, ensure that at least one execute bit is set for
4957	 * non-directories.
4958	 */
4959	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4960	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4961		error = EACCES;
4962	}
4963
4964	return (error);
4965}
4966
4967static int
4968zfs_freebsd_lookup(ap)
4969	struct vop_lookup_args /* {
4970		struct vnode *a_dvp;
4971		struct vnode **a_vpp;
4972		struct componentname *a_cnp;
4973	} */ *ap;
4974{
4975	struct componentname *cnp = ap->a_cnp;
4976	char nm[NAME_MAX + 1];
4977
4978	ASSERT(cnp->cn_namelen < sizeof(nm));
4979	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4980
4981	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4982	    cnp->cn_cred, cnp->cn_thread, 0));
4983}
4984
4985static int
4986zfs_cache_lookup(ap)
4987	struct vop_lookup_args /* {
4988		struct vnode *a_dvp;
4989		struct vnode **a_vpp;
4990		struct componentname *a_cnp;
4991	} */ *ap;
4992{
4993	zfsvfs_t *zfsvfs;
4994
4995	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4996	if (zfsvfs->z_use_namecache)
4997		return (vfs_cache_lookup(ap));
4998	else
4999		return (zfs_freebsd_lookup(ap));
5000}
5001
5002static int
5003zfs_freebsd_create(ap)
5004	struct vop_create_args /* {
5005		struct vnode *a_dvp;
5006		struct vnode **a_vpp;
5007		struct componentname *a_cnp;
5008		struct vattr *a_vap;
5009	} */ *ap;
5010{
5011	zfsvfs_t *zfsvfs;
5012	struct componentname *cnp = ap->a_cnp;
5013	vattr_t *vap = ap->a_vap;
5014	int error, mode;
5015
5016	ASSERT(cnp->cn_flags & SAVENAME);
5017
5018	vattr_init_mask(vap);
5019	mode = vap->va_mode & ALLPERMS;
5020	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5021
5022	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5023	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
5024	if (zfsvfs->z_use_namecache &&
5025	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
5026		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
5027	return (error);
5028}
5029
5030static int
5031zfs_freebsd_remove(ap)
5032	struct vop_remove_args /* {
5033		struct vnode *a_dvp;
5034		struct vnode *a_vp;
5035		struct componentname *a_cnp;
5036	} */ *ap;
5037{
5038
5039	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5040
5041	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5042	    ap->a_cnp->cn_cred));
5043}
5044
5045static int
5046zfs_freebsd_mkdir(ap)
5047	struct vop_mkdir_args /* {
5048		struct vnode *a_dvp;
5049		struct vnode **a_vpp;
5050		struct componentname *a_cnp;
5051		struct vattr *a_vap;
5052	} */ *ap;
5053{
5054	vattr_t *vap = ap->a_vap;
5055
5056	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5057
5058	vattr_init_mask(vap);
5059
5060	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5061	    ap->a_cnp->cn_cred));
5062}
5063
5064static int
5065zfs_freebsd_rmdir(ap)
5066	struct vop_rmdir_args /* {
5067		struct vnode *a_dvp;
5068		struct vnode *a_vp;
5069		struct componentname *a_cnp;
5070	} */ *ap;
5071{
5072	struct componentname *cnp = ap->a_cnp;
5073
5074	ASSERT(cnp->cn_flags & SAVENAME);
5075
5076	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5077}
5078
5079static int
5080zfs_freebsd_readdir(ap)
5081	struct vop_readdir_args /* {
5082		struct vnode *a_vp;
5083		struct uio *a_uio;
5084		struct ucred *a_cred;
5085		int *a_eofflag;
5086		int *a_ncookies;
5087		u_long **a_cookies;
5088	} */ *ap;
5089{
5090
5091	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5092	    ap->a_ncookies, ap->a_cookies));
5093}
5094
5095static int
5096zfs_freebsd_fsync(ap)
5097	struct vop_fsync_args /* {
5098		struct vnode *a_vp;
5099		int a_waitfor;
5100		struct thread *a_td;
5101	} */ *ap;
5102{
5103
5104	vop_stdfsync(ap);
5105	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5106}
5107
/*
 * VOP_GETATTR: fetch attributes via zfs_getattr() using an extended
 * vattr (xvattr) request, then fold the returned ZFS xattr flags into
 * the FreeBSD chflags(2)-style va_flags word.
 */
static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	XVA_SET_REQ(&xvap, XAT_READONLY);
	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
	XVA_SET_REQ(&xvap, XAT_SYSTEM);
	XVA_SET_REQ(&xvap, XAT_HIDDEN);
	XVA_SET_REQ(&xvap, XAT_REPARSE);
	XVA_SET_REQ(&xvap, XAT_OFFLINE);
	XVA_SET_REQ(&xvap, XAT_SPARSE);

	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
	if (error != 0)
		return (error);

	/* Convert ZFS xattr into chflags. */
	/*
	 * FLAG_CHECK sets 'fflag' in fflags when the xattr was returned
	 * by zfs_getattr() and its value is non-zero.
	 */
#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
		fflags |= (fflag);					\
} while (0)
	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
	    xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
	    xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
	    xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
	    xvap.xva_xoptattrs.xoa_archive);
	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
	    xvap.xva_xoptattrs.xoa_nodump);
	FLAG_CHECK(UF_READONLY, XAT_READONLY,
	    xvap.xva_xoptattrs.xoa_readonly);
	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
	    xvap.xva_xoptattrs.xoa_system);
	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
	    xvap.xva_xoptattrs.xoa_hidden);
	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
	    xvap.xva_xoptattrs.xoa_reparse);
	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
	    xvap.xva_xoptattrs.xoa_offline);
	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
	    xvap.xva_xoptattrs.xoa_sparse);

#undef	FLAG_CHECK
	*vap = xvap.xva_vattr;
	vap->va_flags = fflags;
	return (0);
}
5176
5177static int
5178zfs_freebsd_setattr(ap)
5179	struct vop_setattr_args /* {
5180		struct vnode *a_vp;
5181		struct vattr *a_vap;
5182		struct ucred *a_cred;
5183	} */ *ap;
5184{
5185	vnode_t *vp = ap->a_vp;
5186	vattr_t *vap = ap->a_vap;
5187	cred_t *cred = ap->a_cred;
5188	xvattr_t xvap;
5189	u_long fflags;
5190	uint64_t zflags;
5191
5192	vattr_init_mask(vap);
5193	vap->va_mask &= ~AT_NOSET;
5194
5195	xva_init(&xvap);
5196	xvap.xva_vattr = *vap;
5197
5198	zflags = VTOZ(vp)->z_pflags;
5199
5200	if (vap->va_flags != VNOVAL) {
5201		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5202		int error;
5203
5204		if (zfsvfs->z_use_fuids == B_FALSE)
5205			return (EOPNOTSUPP);
5206
5207		fflags = vap->va_flags;
5208		/*
5209		 * XXX KDM
5210		 * We need to figure out whether it makes sense to allow
5211		 * UF_REPARSE through, since we don't really have other
5212		 * facilities to handle reparse points and zfs_setattr()
5213		 * doesn't currently allow setting that attribute anyway.
5214		 */
5215		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5216		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5217		     UF_OFFLINE|UF_SPARSE)) != 0)
5218			return (EOPNOTSUPP);
5219		/*
5220		 * Unprivileged processes are not permitted to unset system
5221		 * flags, or modify flags if any system flags are set.
5222		 * Privileged non-jail processes may not modify system flags
5223		 * if securelevel > 0 and any existing system flags are set.
5224		 * Privileged jail processes behave like privileged non-jail
5225		 * processes if the security.jail.chflags_allowed sysctl is
5226		 * is non-zero; otherwise, they behave like unprivileged
5227		 * processes.
5228		 */
5229		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5230		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5231			if (zflags &
5232			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5233				error = securelevel_gt(cred, 0);
5234				if (error != 0)
5235					return (error);
5236			}
5237		} else {
5238			/*
5239			 * Callers may only modify the file flags on objects they
5240			 * have VADMIN rights for.
5241			 */
5242			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5243				return (error);
5244			if (zflags &
5245			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5246				return (EPERM);
5247			}
5248			if (fflags &
5249			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5250				return (EPERM);
5251			}
5252		}
5253
5254#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5255	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5256	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5257		XVA_SET_REQ(&xvap, (xflag));				\
5258		(xfield) = ((fflags & (fflag)) != 0);			\
5259	}								\
5260} while (0)
5261		/* Convert chflags into ZFS-type flags. */
5262		/* XXX: what about SF_SETTABLE?. */
5263		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5264		    xvap.xva_xoptattrs.xoa_immutable);
5265		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5266		    xvap.xva_xoptattrs.xoa_appendonly);
5267		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5268		    xvap.xva_xoptattrs.xoa_nounlink);
5269		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5270		    xvap.xva_xoptattrs.xoa_archive);
5271		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5272		    xvap.xva_xoptattrs.xoa_nodump);
5273		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5274		    xvap.xva_xoptattrs.xoa_readonly);
5275		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5276		    xvap.xva_xoptattrs.xoa_system);
5277		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5278		    xvap.xva_xoptattrs.xoa_hidden);
5279		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5280		    xvap.xva_xoptattrs.xoa_hidden);
5281		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5282		    xvap.xva_xoptattrs.xoa_offline);
5283		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5284		    xvap.xva_xoptattrs.xoa_sparse);
5285#undef	FLAG_CHANGE
5286	}
5287	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5288}
5289
5290static int
5291zfs_freebsd_rename(ap)
5292	struct vop_rename_args  /* {
5293		struct vnode *a_fdvp;
5294		struct vnode *a_fvp;
5295		struct componentname *a_fcnp;
5296		struct vnode *a_tdvp;
5297		struct vnode *a_tvp;
5298		struct componentname *a_tcnp;
5299	} */ *ap;
5300{
5301	vnode_t *fdvp = ap->a_fdvp;
5302	vnode_t *fvp = ap->a_fvp;
5303	vnode_t *tdvp = ap->a_tdvp;
5304	vnode_t *tvp = ap->a_tvp;
5305	int error;
5306
5307	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5308	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5309
5310	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5311	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5312
5313	vrele(fdvp);
5314	vrele(fvp);
5315	vrele(tdvp);
5316	if (tvp != NULL)
5317		vrele(tvp);
5318
5319	return (error);
5320}
5321
5322static int
5323zfs_freebsd_symlink(ap)
5324	struct vop_symlink_args /* {
5325		struct vnode *a_dvp;
5326		struct vnode **a_vpp;
5327		struct componentname *a_cnp;
5328		struct vattr *a_vap;
5329		char *a_target;
5330	} */ *ap;
5331{
5332	struct componentname *cnp = ap->a_cnp;
5333	vattr_t *vap = ap->a_vap;
5334
5335	ASSERT(cnp->cn_flags & SAVENAME);
5336
5337	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5338	vattr_init_mask(vap);
5339
5340	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5341	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5342}
5343
5344static int
5345zfs_freebsd_readlink(ap)
5346	struct vop_readlink_args /* {
5347		struct vnode *a_vp;
5348		struct uio *a_uio;
5349		struct ucred *a_cred;
5350	} */ *ap;
5351{
5352
5353	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5354}
5355
5356static int
5357zfs_freebsd_link(ap)
5358	struct vop_link_args /* {
5359		struct vnode *a_tdvp;
5360		struct vnode *a_vp;
5361		struct componentname *a_cnp;
5362	} */ *ap;
5363{
5364	struct componentname *cnp = ap->a_cnp;
5365	vnode_t *vp = ap->a_vp;
5366	vnode_t *tdvp = ap->a_tdvp;
5367
5368	if (tdvp->v_mount != vp->v_mount)
5369		return (EXDEV);
5370
5371	ASSERT(cnp->cn_flags & SAVENAME);
5372
5373	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5374}
5375
5376static int
5377zfs_freebsd_inactive(ap)
5378	struct vop_inactive_args /* {
5379		struct vnode *a_vp;
5380		struct thread *a_td;
5381	} */ *ap;
5382{
5383	vnode_t *vp = ap->a_vp;
5384
5385	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5386	return (0);
5387}
5388
/*
 * VOP_RECLAIM: tear down the VM object and detach the znode from the
 * vnode.  The znode is either freed directly (if its SA handle is
 * already gone, i.e. the dataset was force-unmounted) or pushed through
 * zfs_zinactive() for normal teardown.
 */
static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp != NULL);

	/* Destroy the vm object and flush associated pages. */
	vnode_destroy_vobject(vp);

	/*
	 * z_teardown_inactive_lock protects from a race with
	 * zfs_znode_dmu_fini in zfsvfs_teardown during
	 * force unmount.
	 */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL)
		zfs_znode_free(zp);
	else
		zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);

	/* The vnode no longer references the znode. */
	vp->v_data = NULL;
	return (0);
}
5420
5421static int
5422zfs_freebsd_fid(ap)
5423	struct vop_fid_args /* {
5424		struct vnode *a_vp;
5425		struct fid *a_fid;
5426	} */ *ap;
5427{
5428
5429	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5430}
5431
5432static int
5433zfs_freebsd_pathconf(ap)
5434	struct vop_pathconf_args /* {
5435		struct vnode *a_vp;
5436		int a_name;
5437		register_t *a_retval;
5438	} */ *ap;
5439{
5440	ulong_t val;
5441	int error;
5442
5443	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5444	if (error == 0)
5445		*ap->a_retval = val;
5446	else if (error == EOPNOTSUPP)
5447		error = vop_stdpathconf(ap);
5448	return (error);
5449}
5450
5451static int
5452zfs_freebsd_fifo_pathconf(ap)
5453	struct vop_pathconf_args /* {
5454		struct vnode *a_vp;
5455		int a_name;
5456		register_t *a_retval;
5457	} */ *ap;
5458{
5459
5460	switch (ap->a_name) {
5461	case _PC_ACL_EXTENDED:
5462	case _PC_ACL_NFS4:
5463	case _PC_ACL_PATH_MAX:
5464	case _PC_MAC_PRESENT:
5465		return (zfs_freebsd_pathconf(ap));
5466	default:
5467		return (fifo_specops.vop_pathconf(ap));
5468	}
5469}
5470
5471/*
5472 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5473 * extended attribute name:
5474 *
5475 *	NAMESPACE	PREFIX
5476 *	system		freebsd:system:
5477 *	user		(none, can be used to access ZFS fsattr(5) attributes
5478 *			created on Solaris)
5479 */
5480static int
5481zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5482    size_t size)
5483{
5484	const char *namespace, *prefix, *suffix;
5485
5486	/* We don't allow '/' character in attribute name. */
5487	if (strchr(name, '/') != NULL)
5488		return (EINVAL);
5489	/* We don't allow attribute names that start with "freebsd:" string. */
5490	if (strncmp(name, "freebsd:", 8) == 0)
5491		return (EINVAL);
5492
5493	bzero(attrname, size);
5494
5495	switch (attrnamespace) {
5496	case EXTATTR_NAMESPACE_USER:
5497#if 0
5498		prefix = "freebsd:";
5499		namespace = EXTATTR_NAMESPACE_USER_STRING;
5500		suffix = ":";
5501#else
5502		/*
5503		 * This is the default namespace by which we can access all
5504		 * attributes created on Solaris.
5505		 */
5506		prefix = namespace = suffix = "";
5507#endif
5508		break;
5509	case EXTATTR_NAMESPACE_SYSTEM:
5510		prefix = "freebsd:";
5511		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5512		suffix = ":";
5513		break;
5514	case EXTATTR_NAMESPACE_EMPTY:
5515	default:
5516		return (EINVAL);
5517	}
5518	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5519	    name) >= size) {
5520		return (ENAMETOOLONG);
5521	}
5522	return (0);
5523}
5524
5525/*
 * Vnode operation to retrieve a named extended attribute.
5527 */
static int
zfs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
	struct thread *td = ap->a_td;
	struct nameidata nd;
	char attrname[255];
	struct vattr va;
	vnode_t *xvp = NULL, *vp;
	int error, flags;

	/* The caller must have VREAD access on the attribute namespace. */
	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);

	/* Map the FreeBSD namespace/name pair to a ZFS attribute name. */
	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
	    sizeof(attrname));
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);

	/* Find the hidden extended-attribute directory of the vnode. */
	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
	    LOOKUP_XATTR);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Open the attribute as a regular file under that directory. */
	flags = FREAD;
	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
	    xvp, td);
	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
	vp = nd.ni_vp;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		/* A missing attribute file means the attribute is absent. */
		if (error == ENOENT)
			error = ENOATTR;
		return (error);
	}

	/* Either report the attribute size or copy out the data. */
	if (ap->a_size != NULL) {
		error = VOP_GETATTR(vp, &va, ap->a_cred);
		if (error == 0)
			*ap->a_size = (size_t)va.va_size;
	} else if (ap->a_uio != NULL)
		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);

	VOP_UNLOCK(vp, 0);
	vn_close(vp, flags, ap->a_cred, td);
	ZFS_EXIT(zfsvfs);

	return (error);
}
5595
5596/*
5597 * Vnode operation to remove a named attribute.
5598 */
5599int
5600zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5601/*
5602vop_deleteextattr {
5603	IN struct vnode *a_vp;
5604	IN int a_attrnamespace;
5605	IN const char *a_name;
5606	IN struct ucred *a_cred;
5607	IN struct thread *a_td;
5608};
5609*/
5610{
5611	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5612	struct thread *td = ap->a_td;
5613	struct nameidata nd;
5614	char attrname[255];
5615	struct vattr va;
5616	vnode_t *xvp = NULL, *vp;
5617	int error, flags;
5618
5619	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5620	    ap->a_cred, ap->a_td, VWRITE);
5621	if (error != 0)
5622		return (error);
5623
5624	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5625	    sizeof(attrname));
5626	if (error != 0)
5627		return (error);
5628
5629	ZFS_ENTER(zfsvfs);
5630
5631	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5632	    LOOKUP_XATTR);
5633	if (error != 0) {
5634		ZFS_EXIT(zfsvfs);
5635		return (error);
5636	}
5637
5638	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5639	    UIO_SYSSPACE, attrname, xvp, td);
5640	error = namei(&nd);
5641	vp = nd.ni_vp;
5642	if (error != 0) {
5643		ZFS_EXIT(zfsvfs);
5644		NDFREE(&nd, NDF_ONLY_PNBUF);
5645		if (error == ENOENT)
5646			error = ENOATTR;
5647		return (error);
5648	}
5649
5650	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5651	NDFREE(&nd, NDF_ONLY_PNBUF);
5652
5653	vput(nd.ni_dvp);
5654	if (vp == nd.ni_dvp)
5655		vrele(vp);
5656	else
5657		vput(vp);
5658	ZFS_EXIT(zfsvfs);
5659
5660	return (error);
5661}
5662
5663/*
5664 * Vnode operation to set a named attribute.
5665 */
5666static int
5667zfs_setextattr(struct vop_setextattr_args *ap)
5668/*
5669vop_setextattr {
5670	IN struct vnode *a_vp;
5671	IN int a_attrnamespace;
5672	IN const char *a_name;
5673	INOUT struct uio *a_uio;
5674	IN struct ucred *a_cred;
5675	IN struct thread *a_td;
5676};
5677*/
5678{
5679	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5680	struct thread *td = ap->a_td;
5681	struct nameidata nd;
5682	char attrname[255];
5683	struct vattr va;
5684	vnode_t *xvp = NULL, *vp;
5685	int error, flags;
5686
5687	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5688	    ap->a_cred, ap->a_td, VWRITE);
5689	if (error != 0)
5690		return (error);
5691
5692	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5693	    sizeof(attrname));
5694	if (error != 0)
5695		return (error);
5696
5697	ZFS_ENTER(zfsvfs);
5698
5699	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5700	    LOOKUP_XATTR | CREATE_XATTR_DIR);
5701	if (error != 0) {
5702		ZFS_EXIT(zfsvfs);
5703		return (error);
5704	}
5705
5706	flags = FFLAGS(O_WRONLY | O_CREAT);
5707	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5708	    xvp, td);
5709	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5710	vp = nd.ni_vp;
5711	NDFREE(&nd, NDF_ONLY_PNBUF);
5712	if (error != 0) {
5713		ZFS_EXIT(zfsvfs);
5714		return (error);
5715	}
5716
5717	VATTR_NULL(&va);
5718	va.va_size = 0;
5719	error = VOP_SETATTR(vp, &va, ap->a_cred);
5720	if (error == 0)
5721		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5722
5723	VOP_UNLOCK(vp, 0);
5724	vn_close(vp, flags, ap->a_cred, td);
5725	ZFS_EXIT(zfsvfs);
5726
5727	return (error);
5728}
5729
5730/*
5731 * Vnode operation to retrieve extended attributes on a vnode.
5732 */
5733static int
5734zfs_listextattr(struct vop_listextattr_args *ap)
5735/*
5736vop_listextattr {
5737	IN struct vnode *a_vp;
5738	IN int a_attrnamespace;
5739	INOUT struct uio *a_uio;
5740	OUT size_t *a_size;
5741	IN struct ucred *a_cred;
5742	IN struct thread *a_td;
5743};
5744*/
5745{
5746	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5747	struct thread *td = ap->a_td;
5748	struct nameidata nd;
5749	char attrprefix[16];
5750	u_char dirbuf[sizeof(struct dirent)];
5751	struct dirent *dp;
5752	struct iovec aiov;
5753	struct uio auio, *uio = ap->a_uio;
5754	size_t *sizep = ap->a_size;
5755	size_t plen;
5756	vnode_t *xvp = NULL, *vp;
5757	int done, error, eof, pos;
5758
5759	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5760	    ap->a_cred, ap->a_td, VREAD);
5761	if (error != 0)
5762		return (error);
5763
5764	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5765	    sizeof(attrprefix));
5766	if (error != 0)
5767		return (error);
5768	plen = strlen(attrprefix);
5769
5770	ZFS_ENTER(zfsvfs);
5771
5772	if (sizep != NULL)
5773		*sizep = 0;
5774
5775	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5776	    LOOKUP_XATTR);
5777	if (error != 0) {
5778		ZFS_EXIT(zfsvfs);
5779		/*
5780		 * ENOATTR means that the EA directory does not yet exist,
5781		 * i.e. there are no extended attributes there.
5782		 */
5783		if (error == ENOATTR)
5784			error = 0;
5785		return (error);
5786	}
5787
5788	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5789	    UIO_SYSSPACE, ".", xvp, td);
5790	error = namei(&nd);
5791	vp = nd.ni_vp;
5792	NDFREE(&nd, NDF_ONLY_PNBUF);
5793	if (error != 0) {
5794		ZFS_EXIT(zfsvfs);
5795		return (error);
5796	}
5797
5798	auio.uio_iov = &aiov;
5799	auio.uio_iovcnt = 1;
5800	auio.uio_segflg = UIO_SYSSPACE;
5801	auio.uio_td = td;
5802	auio.uio_rw = UIO_READ;
5803	auio.uio_offset = 0;
5804
5805	do {
5806		u_char nlen;
5807
5808		aiov.iov_base = (void *)dirbuf;
5809		aiov.iov_len = sizeof(dirbuf);
5810		auio.uio_resid = sizeof(dirbuf);
5811		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5812		done = sizeof(dirbuf) - auio.uio_resid;
5813		if (error != 0)
5814			break;
5815		for (pos = 0; pos < done;) {
5816			dp = (struct dirent *)(dirbuf + pos);
5817			pos += dp->d_reclen;
5818			/*
5819			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5820			 * is what we get when attribute was created on Solaris.
5821			 */
5822			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5823				continue;
5824			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5825				continue;
5826			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5827				continue;
5828			nlen = dp->d_namlen - plen;
5829			if (sizep != NULL)
5830				*sizep += 1 + nlen;
5831			else if (uio != NULL) {
5832				/*
5833				 * Format of extattr name entry is one byte for
5834				 * length and the rest for name.
5835				 */
5836				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5837				if (error == 0) {
5838					error = uiomove(dp->d_name + plen, nlen,
5839					    uio->uio_rw, uio);
5840				}
5841				if (error != 0)
5842					break;
5843			}
5844		}
5845	} while (!eof && error == 0);
5846
5847	vput(vp);
5848	ZFS_EXIT(zfsvfs);
5849
5850	return (error);
5851}
5852
5853int
5854zfs_freebsd_getacl(ap)
5855	struct vop_getacl_args /* {
5856		struct vnode *vp;
5857		acl_type_t type;
5858		struct acl *aclp;
5859		struct ucred *cred;
5860		struct thread *td;
5861	} */ *ap;
5862{
5863	int		error;
5864	vsecattr_t      vsecattr;
5865
5866	if (ap->a_type != ACL_TYPE_NFS4)
5867		return (EINVAL);
5868
5869	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5870	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5871		return (error);
5872
5873	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5874	if (vsecattr.vsa_aclentp != NULL)
5875		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5876
5877	return (error);
5878}
5879
5880int
5881zfs_freebsd_setacl(ap)
5882	struct vop_setacl_args /* {
5883		struct vnode *vp;
5884		acl_type_t type;
5885		struct acl *aclp;
5886		struct ucred *cred;
5887		struct thread *td;
5888	} */ *ap;
5889{
5890	int		error;
5891	vsecattr_t      vsecattr;
5892	int		aclbsize;	/* size of acl list in bytes */
5893	aclent_t	*aaclp;
5894
5895	if (ap->a_type != ACL_TYPE_NFS4)
5896		return (EINVAL);
5897
5898	if (ap->a_aclp == NULL)
5899		return (EINVAL);
5900
5901	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5902		return (EINVAL);
5903
5904	/*
5905	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5906	 * splitting every entry into two and appending "canonical six"
5907	 * entries at the end.  Don't allow for setting an ACL that would
5908	 * cause chmod(2) to run out of ACL entries.
5909	 */
5910	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5911		return (ENOSPC);
5912
5913	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5914	if (error != 0)
5915		return (error);
5916
5917	vsecattr.vsa_mask = VSA_ACE;
5918	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5919	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5920	aaclp = vsecattr.vsa_aclentp;
5921	vsecattr.vsa_aclentsz = aclbsize;
5922
5923	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5924	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5925	kmem_free(aaclp, aclbsize);
5926
5927	return (error);
5928}
5929
5930int
5931zfs_freebsd_aclcheck(ap)
5932	struct vop_aclcheck_args /* {
5933		struct vnode *vp;
5934		acl_type_t type;
5935		struct acl *aclp;
5936		struct ucred *cred;
5937		struct thread *td;
5938	} */ *ap;
5939{
5940
5941	return (EOPNOTSUPP);
5942}
5943
5944static int
5945zfs_vptocnp(struct vop_vptocnp_args *ap)
5946{
5947	vnode_t *covered_vp;
5948	vnode_t *vp = ap->a_vp;;
5949	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5950	znode_t *zp = VTOZ(vp);
5951	int ltype;
5952	int error;
5953
5954	ZFS_ENTER(zfsvfs);
5955	ZFS_VERIFY_ZP(zp);
5956
5957	/*
5958	 * If we are a snapshot mounted under .zfs, run the operation
5959	 * on the covered vnode.
5960	 */
5961	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5962		char name[MAXNAMLEN + 1];
5963		znode_t *dzp;
5964		size_t len;
5965
5966		error = zfs_znode_parent_and_name(zp, &dzp, name);
5967		if (error == 0) {
5968			len = strlen(name);
5969			if (*ap->a_buflen < len)
5970				error = SET_ERROR(ENOMEM);
5971		}
5972		if (error == 0) {
5973			*ap->a_buflen -= len;
5974			bcopy(name, ap->a_buf + *ap->a_buflen, len);
5975			*ap->a_vpp = ZTOV(dzp);
5976		}
5977		ZFS_EXIT(zfsvfs);
5978		return (error);
5979	}
5980	ZFS_EXIT(zfsvfs);
5981
5982	covered_vp = vp->v_mount->mnt_vnodecovered;
5983	vhold(covered_vp);
5984	ltype = VOP_ISLOCKED(vp);
5985	VOP_UNLOCK(vp, 0);
5986	error = vget(covered_vp, LK_SHARED, curthread);
5987	vdrop(covered_vp);
5988	if (error == 0) {
5989		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5990		    ap->a_buf, ap->a_buflen);
5991		vput(covered_vp);
5992	}
5993	vn_lock(vp, ltype | LK_RETRY);
5994	if ((vp->v_iflag & VI_DOOMED) != 0)
5995		error = SET_ERROR(ENOENT);
5996	return (error);
5997}
5998
5999#ifdef DIAGNOSTIC
6000static int
6001zfs_lock(ap)
6002	struct vop_lock1_args /* {
6003		struct vnode *a_vp;
6004		int a_flags;
6005		char *file;
6006		int line;
6007	} */ *ap;
6008{
6009	vnode_t *vp;
6010	znode_t *zp;
6011	int err;
6012
6013	err = vop_stdlock(ap);
6014	if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
6015		vp = ap->a_vp;
6016		zp = vp->v_data;
6017		if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
6018		    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
6019			VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
6020	}
6021	return (err);
6022}
6023#endif
6024
/* Forward declarations of the vnode operation tables defined below. */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
6028
/*
 * Vnode operations for regular ZFS files and directories.  Unlisted
 * operations fall through to default_vnodeops.
 */
struct vop_vector zfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_access =		zfs_freebsd_access,
	.vop_lookup =		zfs_cache_lookup,
	.vop_cachedlookup =	zfs_freebsd_lookup,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		zfs_freebsd_bmap,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
	.vop_getpages =		zfs_freebsd_getpages,
	.vop_putpages =		zfs_freebsd_putpages,
	.vop_vptocnp =		zfs_vptocnp,
#ifdef DIAGNOSTIC
	/* Diagnostic lock wrapper; see zfs_lock() above. */
	.vop_lock1 =		zfs_lock,
#endif
};
6071
/*
 * Vnode operations for FIFOs stored on ZFS.  Most operations fall
 * through to fifo_specops; only metadata-related operations are
 * handled by ZFS itself.  vop_read/vop_write are set to VOP_PANIC —
 * presumably data transfer is handled elsewhere for fifos, so reaching
 * them here would be a bug (NOTE(review): confirm against fifo layer).
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};
6088
6089/*
6090 * special share hidden files vnode operations template
6091 */
6092struct vop_vector zfs_shareops = {
6093	.vop_default =		&default_vnodeops,
6094	.vop_access =		zfs_freebsd_access,
6095	.vop_inactive =		zfs_freebsd_inactive,
6096	.vop_reclaim =		zfs_freebsd_reclaim,
6097	.vop_fid =		zfs_freebsd_fid,
6098	.vop_pathconf =		zfs_freebsd_pathconf,
6099};
6100