zfs_vnops.c revision 297077
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/atomic.h>
#include <sys/namei.h>
#include <sys/mman.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sched.h>
#include <sys/acl.h>
#include <vm/vm_param.h>
#include <vm/vm_pageout.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done, while avoiding races, by using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 *	can return EIO from the calling function.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *      dmu_tx_assign().  This is critical because we don't want to block
 *      while holding locks.
 *
 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *      Note, in particular, that if a lock is sometimes acquired before
 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 *      to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz)  {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

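/*
 * Illustrative userland sketch (guarded out of the kernel build): on
 * systems where lseek(2) understands SEEK_HOLE/SEEK_DATA -- which lands
 * in the zfs_holey() logic above -- the data runs of a sparse file can
 * be walked like this.  The file name is hypothetical and error
 * handling is elided; this is a sketch, not part of this file.
 */
#if 0
#include <sys/types.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tank/sparse.dat", O_RDONLY);
	off_t data = 0, hole;

	/* lseek() returns -1 (ENXIO) once no data remains past EOF. */
	while ((data = lseek(fd, data, SEEK_DATA)) != -1) {
		hole = lseek(fd, data, SEEK_HOLE);	/* end of this run */
		printf("data: %jd..%jd\n", (intmax_t)data, (intmax_t)hole);
		data = hole;
	}
	close(fd);
	return (0);
}
#endif
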
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking them
		 * out is necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

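/*
 * Illustrative illumos-only userland sketch (guarded out of the build)
 * of the _FIO_COUNT_FILLED ioctl handled above.  The headers, the
 * availability of offset_t in userland, and the path are assumptions;
 * treat this as a sketch of the calling convention, nothing more.
 */
#if 0
#include <sys/types.h>
#include <sys/filio.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	offset_t filled;	/* matches the offset_t copied out above */
	int fd = open("/tank/file.dat", O_RDONLY);

	if (ioctl(fd, _FIO_COUNT_FILLED, &filled) == 0)
		printf("%lld filled blocks\n", (long long)filled);
	close(fd);
	return (0);
}
#endif
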
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty data.
	 * For this reason we should shrink the range to DEV_BSIZE aligned
	 * boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb");
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp == NULL) {
			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
			    VM_ALLOC_SBUSY);
		} else {
			ASSERT(pp != NULL && !pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}

static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb");
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
    int segflg, dmu_tx_t *tx)
{
	vm_object_t obj;
	struct sf_buf *sf;
	caddr_t va;
	int off;

	ASSERT(segflg != UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		int nbytes = imin(PAGESIZE - off, len);

		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
			zfs_vmobject_wunlock(obj);

			va = zfs_map_page(pp, &sf);
			(void) dmu_read(os, oid, start+off, nbytes,
			    va+off, DMU_READ_PREFETCH);
			zfs_unmap_page(sf);

			zfs_vmobject_wlock(obj);
			page_unbusy(pp);
		}
		len -= nbytes;
		off = 0;
	}
	vm_object_pip_wakeupn(obj, 0);
	zfs_vmobject_wunlock(obj);
}

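/*
 * Illustrative userland sketch (guarded out of the kernel build) of the
 * coherency the comment above promises: data stored with write(2) goes
 * through update_pages() and so is immediately visible through an
 * existing shared mapping.  File name hypothetical, error handling
 * elided; the file is assumed to be at least one page long.
 */
#if 0
#include <sys/mman.h>
#include <assert.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tank/mapped.dat", O_RDWR);
	char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

	(void) pwrite(fd, "Z", 1, 0);	/* goes through update_pages() */
	assert(p[0] == 'Z');		/* page and DMU buffer agree */
	munmap(p, 4096);
	close(fd);
	return (0);
}
#endif
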
/*
 * Read with UIO_NOCOPY flag means that sendfile(2) requests
 * ZFS to populate a range of page cache pages with data.
 *
 * NOTE: this function could be optimized to pre-allocate
 * all pages in advance, drain exclusive busy on all of them,
 * map them into contiguous KVA region and populate them
 * in one single dmu_read() call.
 */
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE, len);

		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
		if (pp->valid == 0) {
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = dmu_read(os, zp->z_id, start, bytes, va,
			    DMU_READ_PREFETCH);
			if (bytes != PAGESIZE && error == 0)
				bzero(va + bytes, PAGESIZE - bytes);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			vm_page_sunbusy(pp);
			vm_page_lock(pp);
			if (error) {
				if (pp->wire_count == 0 && pp->valid == 0 &&
				    !vm_page_busied(pp))
					vm_page_free(pp);
			} else {
				pp->valid = VM_PAGE_BITS_ALL;
				vm_page_activate(pp);
			}
			vm_page_unlock(pp);
		} else {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_sunbusy(pp);
		}
		if (error)
			break;
		uio->uio_resid -= bytes;
		uio->uio_offset += bytes;
		len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

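/*
 * The UIO_NOCOPY path above is driven by sendfile(2).  A minimal
 * FreeBSD-style caller might look like the sketch below (guarded out
 * of the build; socket setup is elided and the names are hypothetical).
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <fcntl.h>
#include <unistd.h>

static int
send_whole_file(int sock, const char *path, off_t len)
{
	int fd = open(path, O_RDONLY);
	off_t sbytes;

	/* The kernel populates the page cache via mappedread_sf(). */
	int error = sendfile(fd, sock, 0, len, NULL, &sbytes, 0);
	close(fd);
	return (error);
}
#endif
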
/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	vm_object_t obj;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_hold(vp, start)) {
			struct sf_buf *sf;
			caddr_t va;

			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = uiomove(va + off, bytes, UIO_READ, uio);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			page_unhold(pp);
		} else {
			zfs_vmobject_wunlock(obj);
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
			zfs_vmobject_wlock(obj);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *			  set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * If immutable or not appending then return EPERM
	 */
	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef illumos
	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);
#endif

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (woff + nbytes > zp->z_size)
			vnode_pager_setsize(vp, woff + nbytes);

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
			    zp->z_id, uio->uio_segflg, tx);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(vp, cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
			ASSERT(error == 0);
		}
		/*
		 * If we are replaying and eof is non zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

#ifdef illumos
		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

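/*
 * Illustrative userland sketch (guarded out of the kernel build) of the
 * FAPPEND handling above: with O_APPEND, zfs_write() takes an RL_APPEND
 * range lock and re-derives the write offset from the locked EOF, so
 * concurrent appenders never interleave within a record.  Names are
 * hypothetical and error handling is elided.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

/*
 * Two processes doing this concurrently get distinct, non-overlapping
 * file ranges: the kernel resets uio_loffset to the locked EOF before
 * copying any data in.
 */
static void
append_record(const char *path, const void *rec, size_t len)
{
	int fd = open(path, O_WRONLY | O_APPEND);

	(void) write(fd, rec, len);
	close(fd);
}
#endif
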
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

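/*
 * A worked example of the range-lock alignment in the indirect-write
 * loop above, as a standalone sketch (guarded out of the build; the
 * P2PHASE definition is copied here for illustration, values are made
 * up): an indirect write at offset 0x21000 into a file with 128K
 * (0x20000) blocks locks the whole containing block [0x20000, 0x40000).
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))

int
main(void)
{
	uint64_t offset = 0x21000, size = 0x20000;	/* blksz, power of 2 */
	uint64_t blkoff = P2PHASE(offset, size);	/* 0x1000 */

	printf("lock [%#jx, %#jx)\n", (uintmax_t)(offset - blkoff),
	    (uintmax_t)(offset - blkoff + size));
	return (0);
}
#endif
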
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * If vnode is for a device return a specfs vnode instead.
 */
static int
specvp_check(vnode_t **vpp, cred_t *cr)
{
	int error = 0;

	if (IS_DEVVP(*vpp)) {
		struct vnode *svp;

		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (svp == NULL)
			error = SET_ERROR(ENOSYS);
		*vpp = svp;
	}
	return (error);
}


/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		direntflags - directory lookup flags
 *		realpnp - returned pathname.
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;
	int *direntflags = NULL;
	void *realpnp = NULL;

	/* fast path */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else {
			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					VN_RELE(tvp);
					return (SET_ERROR(ENOENT));
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
#endif

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}
	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
		int ltype = 0;

		if (cnp->cn_flags & ISDOTDOT) {
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
		}
		ZFS_EXIT(zfsvfs);
		error = vn_lock(*vpp, cnp->cn_lkflags);
		if (cnp->cn_flags & ISDOTDOT)
			vn_lock(dvp, ltype | LK_RETRY);
		if (error != 0) {
			VN_RELE(*vpp);
			*vpp = NULL;
			return (error);
		}
	} else {
		ZFS_EXIT(zfsvfs);
	}

#ifdef FREEBSD_NAMECACHE
	/*
	 * Insert name into cache (as non-existent) if appropriate.
	 */
	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, *vpp, cnp);
	/*
	 * Insert name into cache if appropriate.
	 */
	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}
#endif

	return (error);
}

/*
 * Attempt to create a new entry in a directory.  If the entry
 * already exists, truncate the file if permissible, else return
 * an error.  Return the vp of the created or trunc'd file.
 *
 *	IN:	dvp	- vnode of directory to put new file entry in.
 *		name	- name of new file entry.
 *		vap	- attributes of new file.
 *		excl	- flag indicating exclusive or non-exclusive mode.
 *		mode	- mode to open file with.
 *		cr	- credentials of caller.
 *		flag	- large file flag [UNUSED].
 *		ct	- caller context
 *		vsecp	- ACL to be set
 *
 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	dvp - ctime|mtime updated if new entry created
 *	 vp - ctime|mtime always, atime if new
 */

/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t   acl_ids;
	boolean_t	fuid_dirtied;
	boolean_t	have_acl = B_FALSE;
	boolean_t	waited = B_FALSE;
	void		*vsecp = NULL;
	int		flag = 0;

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	getnewvnode_reserve(1);

top:
	*vpp = NULL;

	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			if (strcmp(name, "..") == 0)
				error = SET_ERROR(EISDIR);
			getnewvnode_drop_reserve();
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */

		if ((dzp->z_pflags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			if (have_acl)
				zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EINVAL);
			goto out;
		}

		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
		    cr, vsecp, &acl_ids)) != 0)
			goto out;
		have_acl = B_TRUE;

		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
			zfs_acl_ids_free(&acl_ids);
			error = SET_ERROR(EDQUOT);
			goto out;
		}

		tx = dmu_tx_create(os);

		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
		    ZFS_SA_BASE_ATTR_SIZE);

		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
		if (!zfsvfs->z_use_sa &&
		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, acl_ids.z_aclp->z_acl_bytes);
		}
		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
		if (error) {
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				waited = B_TRUE;
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			zfs_acl_ids_free(&acl_ids);
			dmu_tx_abort(tx);
			getnewvnode_drop_reserve();
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		(void) zfs_link_create(dl, zp, tx, ZNEW);
		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		if (have_acl)
			zfs_acl_ids_free(&acl_ids);
		have_acl = B_FALSE;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = SET_ERROR(EEXIST);
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = SET_ERROR(EISDIR);
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:
	getnewvnode_drop_reserve();
	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		*vpp = ZTOV(zp);
		error = specvp_check(vpp, cr);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}

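/*
 * Illustrative userland sketch (guarded out of the kernel build) of the
 * excl == EXCL branch above: it is what makes O_CREAT | O_EXCL usable
 * as a lock-file primitive, since a second creator gets EEXIST instead
 * of a truncated file.  The path and helper name are hypothetical.
 */
#if 0
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int
try_lockfile(const char *path)
{
	int fd = open(path, O_CREAT | O_EXCL | O_WRONLY, 0600);

	if (fd == -1 && errno == EEXIST)
		return (-1);		/* someone else holds it */
	return (fd);
}
#endif
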
1872/*
1873 * Remove an entry from a directory.
1874 *
1875 *	IN:	dvp	- vnode of directory to remove entry from.
1876 *		name	- name of entry to remove.
1877 *		cr	- credentials of caller.
1878 *		ct	- caller context
1879 *		flags	- case flags
1880 *
1881 *	RETURN:	0 on success, error code on failure.
1882 *
1883 * Timestamps:
1884 *	dvp - ctime|mtime
1885 *	 vp - ctime (if nlink > 0)
1886 */
1887
1888uint64_t null_xattr = 0;
1889
1890/*ARGSUSED*/
1891static int
1892zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1893    int flags)
1894{
1895	znode_t		*zp, *dzp = VTOZ(dvp);
1896	znode_t		*xzp;
1897	vnode_t		*vp;
1898	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1899	zilog_t		*zilog;
1900	uint64_t	acl_obj, xattr_obj;
1901	uint64_t	xattr_obj_unlinked = 0;
1902	uint64_t	obj = 0;
1903	zfs_dirlock_t	*dl;
1904	dmu_tx_t	*tx;
1905	boolean_t	may_delete_now, delete_now = FALSE;
1906	boolean_t	unlinked, toobig = FALSE;
1907	uint64_t	txtype;
1908	pathname_t	*realnmp = NULL;
1909	pathname_t	realnm;
1910	int		error;
1911	int		zflg = ZEXISTS;
1912	boolean_t	waited = B_FALSE;
1913
1914	ZFS_ENTER(zfsvfs);
1915	ZFS_VERIFY_ZP(dzp);
1916	zilog = zfsvfs->z_log;
1917
1918	if (flags & FIGNORECASE) {
1919		zflg |= ZCILOOK;
1920		pn_alloc(&realnm);
1921		realnmp = &realnm;
1922	}
1923
1924top:
1925	xattr_obj = 0;
1926	xzp = NULL;
1927	/*
1928	 * Attempt to lock directory; fail if entry doesn't exist.
1929	 */
1930	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1931	    NULL, realnmp)) {
1932		if (realnmp)
1933			pn_free(realnmp);
1934		ZFS_EXIT(zfsvfs);
1935		return (error);
1936	}
1937
1938	vp = ZTOV(zp);
1939
1940	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1941		goto out;
1942	}
1943
1944	/*
1945	 * Need to use rmdir for removing directories.
1946	 */
1947	if (vp->v_type == VDIR) {
1948		error = SET_ERROR(EPERM);
1949		goto out;
1950	}
1951
1952	vnevent_remove(vp, dvp, name, ct);
1953
1954	if (realnmp)
1955		dnlc_remove(dvp, realnmp->pn_buf);
1956	else
1957		dnlc_remove(dvp, name);
1958
1959	VI_LOCK(vp);
1960	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1961	VI_UNLOCK(vp);
1962
1963	/*
1964	 * We may delete the znode now, or we may put it in the unlinked set;
1965	 * it depends on whether we're the last link, and on whether there are
1966	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1967	 * allow for either case.
1968	 */
1969	obj = zp->z_id;
1970	tx = dmu_tx_create(zfsvfs->z_os);
1971	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1972	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1973	zfs_sa_upgrade_txholds(tx, zp);
1974	zfs_sa_upgrade_txholds(tx, dzp);
1975	if (may_delete_now) {
1976		toobig =
1977		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1978		/* if the file is too big, only hold_free a token amount */
1979		dmu_tx_hold_free(tx, zp->z_id, 0,
1980		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1981	}
1982
1983	/* are there any extended attributes? */
1984	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1985	    &xattr_obj, sizeof (xattr_obj));
1986	if (error == 0 && xattr_obj) {
1987		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1988		ASSERT0(error);
1989		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1990		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1991	}
1992
1993	mutex_enter(&zp->z_lock);
1994	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1995		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1996	mutex_exit(&zp->z_lock);
1997
1998	/* charge as an update -- would be nice not to charge at all */
1999	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2000
2001	/*
2002	 * Mark this transaction as typically resulting in a net free of
2003	 * space, unless object removal will be delayed indefinitely
2004	 * (due to active holds on the vnode due to the file being open).
2005	 */
2006	if (may_delete_now)
2007		dmu_tx_mark_netfree(tx);
2008
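	/*
	 * Assign without waiting first; if the open txg is full this fails
	 * with ERESTART, so drop every lock, wait for the txg outside the
	 * locks, and retry from "top" with TXG_WAITED so the retry is not
	 * starved by new transactions.
	 */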
2009	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2010	if (error) {
2011		zfs_dirent_unlock(dl);
2012		VN_RELE(vp);
2013		if (xzp)
2014			VN_RELE(ZTOV(xzp));
2015		if (error == ERESTART) {
2016			waited = B_TRUE;
2017			dmu_tx_wait(tx);
2018			dmu_tx_abort(tx);
2019			goto top;
2020		}
2021		if (realnmp)
2022			pn_free(realnmp);
2023		dmu_tx_abort(tx);
2024		ZFS_EXIT(zfsvfs);
2025		return (error);
2026	}
2027
2028	/*
2029	 * Remove the directory entry.
2030	 */
2031	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
2032
2033	if (error) {
2034		dmu_tx_commit(tx);
2035		goto out;
2036	}
2037
2038	if (unlinked) {
2039		/*
2040		 * Hold z_lock so that we can make sure that the ACL obj
2041		 * hasn't changed.  Could have been deleted due to
2042		 * zfs_sa_upgrade().
2043		 */
2044		mutex_enter(&zp->z_lock);
2045		VI_LOCK(vp);
2046		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2047		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
2048		delete_now = may_delete_now && !toobig &&
2049		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
2050		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
2051		    acl_obj;
2052		VI_UNLOCK(vp);
2053	}
2054
2055	if (delete_now) {
2056#ifdef __FreeBSD__
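		/*
		 * delete_now frees the znode while the caller still holds
		 * a reference on the vnode, which is unsafe on FreeBSD;
		 * this branch is believed unreachable here, and the panic
		 * documents that assumption.
		 */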
2057		panic("zfs_remove: delete_now branch taken");
2058#endif
2059		if (xattr_obj_unlinked) {
2060			ASSERT3U(xzp->z_links, ==, 2);
2061			mutex_enter(&xzp->z_lock);
2062			xzp->z_unlinked = 1;
2063			xzp->z_links = 0;
2064			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
2065			    &xzp->z_links, sizeof (xzp->z_links), tx);
2066			ASSERT3U(error, ==, 0);
2067			mutex_exit(&xzp->z_lock);
2068			zfs_unlinked_add(xzp, tx);
2069
2070			if (zp->z_is_sa)
2071				error = sa_remove(zp->z_sa_hdl,
2072				    SA_ZPL_XATTR(zfsvfs), tx);
2073			else
2074				error = sa_update(zp->z_sa_hdl,
2075				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
2076				    sizeof (uint64_t), tx);
2077			ASSERT0(error);
2078		}
2079		VI_LOCK(vp);
2080		vp->v_count--;
2081		ASSERT0(vp->v_count);
2082		VI_UNLOCK(vp);
2083		mutex_exit(&zp->z_lock);
2084		zfs_znode_delete(zp, tx);
2085	} else if (unlinked) {
2086		mutex_exit(&zp->z_lock);
2087		zfs_unlinked_add(zp, tx);
2088#ifdef __FreeBSD__
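		/*
		 * The file now lives on the unlinked set and its blocks
		 * are doomed, so tell the syncer not to waste I/O on it.
		 */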
2089		vp->v_vflag |= VV_NOSYNC;
2090#endif
2091	}
2092
2093	txtype = TX_REMOVE;
2094	if (flags & FIGNORECASE)
2095		txtype |= TX_CI;
2096	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2097
2098	dmu_tx_commit(tx);
2099out:
2100	if (realnmp)
2101		pn_free(realnmp);
2102
2103	zfs_dirent_unlock(dl);
2104
2105	if (!delete_now)
2106		VN_RELE(vp);
2107	if (xzp)
2108		VN_RELE(ZTOV(xzp));
2109
2110	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2111		zil_commit(zilog, 0);
2112
2113	ZFS_EXIT(zfsvfs);
2114	return (error);
2115}
2116
2117/*
2118 * Create a new directory and insert it into dvp using the name
2119 * provided.  Return a pointer to the inserted directory.
2120 *
2121 *	IN:	dvp	- vnode of directory to add subdir to.
2122 *		dirname	- name of new directory.
2123 *		vap	- attributes of new directory.
2124 *		cr	- credentials of caller.
2125 *		ct	- caller context
2126 *		flags	- case flags
2127 *		vsecp	- ACL to be set
2128 *
2129 *	OUT:	vpp	- vnode of created directory.
2130 *
2131 *	RETURN:	0 on success, error code on failure.
2132 *
2133 * Timestamps:
2134 *	dvp - ctime|mtime updated
2135 *	 vp - ctime|mtime|atime updated
2136 */
2137/*ARGSUSED*/
2138static int
2139zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
2140    caller_context_t *ct, int flags, vsecattr_t *vsecp)
2141{
2142	znode_t		*zp, *dzp = VTOZ(dvp);
2143	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2144	zilog_t		*zilog;
2145	zfs_dirlock_t	*dl;
2146	uint64_t	txtype;
2147	dmu_tx_t	*tx;
2148	int		error;
2149	int		zf = ZNEW;
2150	ksid_t		*ksid;
2151	uid_t		uid;
2152	gid_t		gid = crgetgid(cr);
2153	zfs_acl_ids_t   acl_ids;
2154	boolean_t	fuid_dirtied;
2155	boolean_t	waited = B_FALSE;
2156
2157	ASSERT(vap->va_type == VDIR);
2158
2159	/*
2160	 * If we have an ephemeral id, ACL, or XVATTR then
2161	 * make sure the file system is at the proper version.
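	 * (z_use_fuids is only set when the dataset is at ZPL version 3,
	 * the version that introduced FUIDs, or later.)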
2162	 */
2163
2164	ksid = crgetsid(cr, KSID_OWNER);
2165	if (ksid)
2166		uid = ksid_getid(ksid);
2167	else
2168		uid = crgetuid(cr);
2169	if (zfsvfs->z_use_fuids == B_FALSE &&
2170	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2171	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2172		return (SET_ERROR(EINVAL));
2173
2174	ZFS_ENTER(zfsvfs);
2175	ZFS_VERIFY_ZP(dzp);
2176	zilog = zfsvfs->z_log;
2177
2178	if (dzp->z_pflags & ZFS_XATTR) {
2179		ZFS_EXIT(zfsvfs);
2180		return (SET_ERROR(EINVAL));
2181	}
2182
2183	if (zfsvfs->z_utf8 && u8_validate(dirname,
2184	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2185		ZFS_EXIT(zfsvfs);
2186		return (SET_ERROR(EILSEQ));
2187	}
2188	if (flags & FIGNORECASE)
2189		zf |= ZCILOOK;
2190
2191	if (vap->va_mask & AT_XVATTR) {
2192		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2193		    crgetuid(cr), cr, vap->va_type)) != 0) {
2194			ZFS_EXIT(zfsvfs);
2195			return (error);
2196		}
2197	}
2198
2199	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2200	    vsecp, &acl_ids)) != 0) {
2201		ZFS_EXIT(zfsvfs);
2202		return (error);
2203	}
2204
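	/*
	 * Reserve a vnode up front so that zfs_mknode() does not have to
	 * sleep in getnewvnode() while the DMU transaction below is open.
	 */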
2205	getnewvnode_reserve(1);
2206
2207	/*
2208	 * First make sure the new directory doesn't exist.
2209	 *
2210	 * Existence is checked first to make sure we don't return
2211	 * EACCES instead of EEXIST, which can cause some applications
2212	 * to fail.
2213	 */
2214top:
2215	*vpp = NULL;
2216
2217	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2218	    NULL, NULL)) {
2219		zfs_acl_ids_free(&acl_ids);
2220		getnewvnode_drop_reserve();
2221		ZFS_EXIT(zfsvfs);
2222		return (error);
2223	}
2224
2225	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2226		zfs_acl_ids_free(&acl_ids);
2227		zfs_dirent_unlock(dl);
2228		getnewvnode_drop_reserve();
2229		ZFS_EXIT(zfsvfs);
2230		return (error);
2231	}
2232
2233	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2234		zfs_acl_ids_free(&acl_ids);
2235		zfs_dirent_unlock(dl);
2236		getnewvnode_drop_reserve();
2237		ZFS_EXIT(zfsvfs);
2238		return (SET_ERROR(EDQUOT));
2239	}
2240
2241	/*
2242	 * Add a new entry to the directory.
2243	 */
2244	tx = dmu_tx_create(zfsvfs->z_os);
2245	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2246	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2247	fuid_dirtied = zfsvfs->z_fuid_dirty;
2248	if (fuid_dirtied)
2249		zfs_fuid_txhold(zfsvfs, tx);
2250	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2251		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2252		    acl_ids.z_aclp->z_acl_bytes);
2253	}
2254
2255	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2256	    ZFS_SA_BASE_ATTR_SIZE);
2257
2258	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2259	if (error) {
2260		zfs_dirent_unlock(dl);
2261		if (error == ERESTART) {
2262			waited = B_TRUE;
2263			dmu_tx_wait(tx);
2264			dmu_tx_abort(tx);
2265			goto top;
2266		}
2267		zfs_acl_ids_free(&acl_ids);
2268		dmu_tx_abort(tx);
2269		getnewvnode_drop_reserve();
2270		ZFS_EXIT(zfsvfs);
2271		return (error);
2272	}
2273
2274	/*
2275	 * Create new node.
2276	 */
2277	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2278
2279	if (fuid_dirtied)
2280		zfs_fuid_sync(zfsvfs, tx);
2281
2282	/*
2283	 * Now put new name in parent dir.
2284	 */
2285	(void) zfs_link_create(dl, zp, tx, ZNEW);
2286
2287	*vpp = ZTOV(zp);
2288
2289	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2290	if (flags & FIGNORECASE)
2291		txtype |= TX_CI;
2292	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2293	    acl_ids.z_fuidp, vap);
2294
2295	zfs_acl_ids_free(&acl_ids);
2296
2297	dmu_tx_commit(tx);
2298
2299	getnewvnode_drop_reserve();
2300
2301	zfs_dirent_unlock(dl);
2302
2303	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2304		zil_commit(zilog, 0);
2305
2306	ZFS_EXIT(zfsvfs);
2307	return (0);
2308}
2309
2310/*
2311 * Remove a directory subdir entry.  If the current working
2312 * directory is the same as the subdir to be removed, the
2313 * remove will fail.
2314 *
2315 *	IN:	dvp	- vnode of directory to remove from.
2316 *		name	- name of directory to be removed.
2317 *		cwd	- vnode of current working directory.
2318 *		cr	- credentials of caller.
2319 *		ct	- caller context
2320 *		flags	- case flags
2321 *
2322 *	RETURN:	0 on success, error code on failure.
2323 *
2324 * Timestamps:
2325 *	dvp - ctime|mtime updated
2326 */
2327/*ARGSUSED*/
2328static int
2329zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2330    caller_context_t *ct, int flags)
2331{
2332	znode_t		*dzp = VTOZ(dvp);
2333	znode_t		*zp;
2334	vnode_t		*vp;
2335	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2336	zilog_t		*zilog;
2337	zfs_dirlock_t	*dl;
2338	dmu_tx_t	*tx;
2339	int		error;
2340	int		zflg = ZEXISTS;
2341	boolean_t	waited = B_FALSE;
2342
2343	ZFS_ENTER(zfsvfs);
2344	ZFS_VERIFY_ZP(dzp);
2345	zilog = zfsvfs->z_log;
2346
2347	if (flags & FIGNORECASE)
2348		zflg |= ZCILOOK;
2349top:
2350	zp = NULL;
2351
2352	/*
2353	 * Attempt to lock directory; fail if entry doesn't exist.
2354	 */
2355	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2356	    NULL, NULL)) {
2357		ZFS_EXIT(zfsvfs);
2358		return (error);
2359	}
2360
2361	vp = ZTOV(zp);
2362
2363	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2364		goto out;
2365	}
2366
2367	if (vp->v_type != VDIR) {
2368		error = SET_ERROR(ENOTDIR);
2369		goto out;
2370	}
2371
2372	if (vp == cwd) {
2373		error = SET_ERROR(EINVAL);
2374		goto out;
2375	}
2376
2377	vnevent_rmdir(vp, dvp, name, ct);
2378
2379	/*
2380	 * Grab a lock on the directory to make sure that no one is
2381	 * trying to add (or look up) entries while we are removing it.
2382	 */
2383	rw_enter(&zp->z_name_lock, RW_WRITER);
2384
2385	/*
2386	 * Grab a lock on the parent pointer to make sure we play well
2387	 * with the treewalk and directory rename code.
2388	 */
2389	rw_enter(&zp->z_parent_lock, RW_WRITER);
2390
2391	tx = dmu_tx_create(zfsvfs->z_os);
2392	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2393	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2394	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2395	zfs_sa_upgrade_txholds(tx, zp);
2396	zfs_sa_upgrade_txholds(tx, dzp);
2397	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2398	if (error) {
2399		rw_exit(&zp->z_parent_lock);
2400		rw_exit(&zp->z_name_lock);
2401		zfs_dirent_unlock(dl);
2402		VN_RELE(vp);
2403		if (error == ERESTART) {
2404			waited = B_TRUE;
2405			dmu_tx_wait(tx);
2406			dmu_tx_abort(tx);
2407			goto top;
2408		}
2409		dmu_tx_abort(tx);
2410		ZFS_EXIT(zfsvfs);
2411		return (error);
2412	}
2413
2414#ifdef FREEBSD_NAMECACHE
2415	cache_purge(dvp);
2416#endif
2417
2418	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2419
2420	if (error == 0) {
2421		uint64_t txtype = TX_RMDIR;
2422		if (flags & FIGNORECASE)
2423			txtype |= TX_CI;
2424		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2425	}
2426
2427	dmu_tx_commit(tx);
2428
2429	rw_exit(&zp->z_parent_lock);
2430	rw_exit(&zp->z_name_lock);
2431#ifdef FREEBSD_NAMECACHE
2432	cache_purge(vp);
2433#endif
2434out:
2435	zfs_dirent_unlock(dl);
2436
2437	VN_RELE(vp);
2438
2439	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2440		zil_commit(zilog, 0);
2441
2442	ZFS_EXIT(zfsvfs);
2443	return (error);
2444}
2445
2446/*
2447 * Read as many directory entries as will fit into the provided
2448 * buffer from the given directory cursor position (specified in
2449 * the uio structure).
2450 *
2451 *	IN:	vp	- vnode of directory to read.
2452 *		uio	- structure supplying read location, range info,
2453 *			  and return buffer.
2454 *		cr	- credentials of caller.
2455 *		ct	- caller context
2456 *		flags	- case flags
2457 *
2458 *	OUT:	uio	- updated offset and range, buffer filled.
2459 *		eofp	- set to true if end-of-file detected.
2460 *
2461 *	RETURN:	0 on success, error code on failure.
2462 *
2463 * Timestamps:
2464 *	vp - atime updated
2465 *
2466 * Note that the low 4 bits of the cookie returned by zap are always zero.
2467 * This allows us to use the low range for "special" directory entries:
2468 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2469 * we use the offset 2 for the '.zfs' directory.
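 * A serialized zap cursor in that range can only be 0, i.e. the beginning,
 * so the cursor-initialization code below treats any offset <= 3 as a
 * request to start from the beginning of the directory.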
2470 */
2471/* ARGSUSED */
2472static int
2473zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2474{
2475	znode_t		*zp = VTOZ(vp);
2476	iovec_t		*iovp;
2477	edirent_t	*eodp;
2478	dirent64_t	*odp;
2479	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2480	objset_t	*os;
2481	caddr_t		outbuf;
2482	size_t		bufsize;
2483	zap_cursor_t	zc;
2484	zap_attribute_t	zap;
2485	uint_t		bytes_wanted;
2486	uint64_t	offset; /* must be unsigned; checks for < 1 */
2487	uint64_t	parent;
2488	int		local_eof;
2489	int		outcount;
2490	int		error;
2491	uint8_t		prefetch;
2492	boolean_t	check_sysattrs;
2493	uint8_t		type;
2494	int		ncooks;
2495	u_long		*cooks = NULL;
2496	int		flags = 0;
2497
2498	ZFS_ENTER(zfsvfs);
2499	ZFS_VERIFY_ZP(zp);
2500
2501	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2502	    &parent, sizeof (parent))) != 0) {
2503		ZFS_EXIT(zfsvfs);
2504		return (error);
2505	}
2506
2507	/*
2508	 * If we are not given an eof variable,
2509	 * use a local one.
2510	 */
2511	if (eofp == NULL)
2512		eofp = &local_eof;
2513
2514	/*
2515	 * Check for valid iov_len.
2516	 */
2517	if (uio->uio_iov->iov_len <= 0) {
2518		ZFS_EXIT(zfsvfs);
2519		return (SET_ERROR(EINVAL));
2520	}
2521
2522	/*
2523	 * Quit if the directory has been removed (POSIX).
2524	 */
2525	if ((*eofp = zp->z_unlinked) != 0) {
2526		ZFS_EXIT(zfsvfs);
2527		return (0);
2528	}
2529
2530	error = 0;
2531	os = zfsvfs->z_os;
2532	offset = uio->uio_loffset;
2533	prefetch = zp->z_zn_prefetch;
2534
2535	/*
2536	 * Initialize the iterator cursor.
2537	 */
2538	if (offset <= 3) {
2539		/*
2540		 * Start iteration from the beginning of the directory.
2541		 */
2542		zap_cursor_init(&zc, os, zp->z_id);
2543	} else {
2544		/*
2545		 * The offset is a serialized cursor.
2546		 */
2547		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2548	}
2549
2550	/*
2551	 * Get space to change directory entries into fs independent format.
2552	 */
2553	iovp = uio->uio_iov;
2554	bytes_wanted = iovp->iov_len;
2555	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2556		bufsize = bytes_wanted;
2557		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2558		odp = (struct dirent64 *)outbuf;
2559	} else {
2560		bufsize = bytes_wanted;
2561		outbuf = NULL;
2562		odp = (struct dirent64 *)iovp->iov_base;
2563	}
2564	eodp = (struct edirent *)odp;
2565
2566	if (ncookies != NULL) {
2567		/*
2568		 * Minimum entry size is the dirent size plus 1 byte for a file name.
2569		 */
2570		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2571		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2572		*cookies = cooks;
2573		*ncookies = ncooks;
2574	}
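	/*
	 * ncooks is a worst-case bound: one cookie is stored per entry
	 * emitted below, and any unused slots are subtracted back out of
	 * *ncookies after the loop.
	 */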
2575	/*
2576	 * If this VFS supports the system attribute view interface; and
2577	 * we're looking at an extended attribute directory; and we care
2578	 * about normalization conflicts on this vfs; then we must check
2579	 * for normalization conflicts with the sysattr name space.
2580	 */
2581#ifdef TODO
2582	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2583	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2584	    (flags & V_RDDIR_ENTFLAGS);
2585#else
2586	check_sysattrs = 0;
2587#endif
2588
2589	/*
2590	 * Transform to file-system independent format
2591	 */
2592	outcount = 0;
2593	while (outcount < bytes_wanted) {
2594		ino64_t objnum;
2595		ushort_t reclen;
2596		off64_t *next = NULL;
2597
2598		/*
2599		 * Special case `.', `..', and `.zfs'.
2600		 */
2601		if (offset == 0) {
2602			(void) strcpy(zap.za_name, ".");
2603			zap.za_normalization_conflict = 0;
2604			objnum = zp->z_id;
2605			type = DT_DIR;
2606		} else if (offset == 1) {
2607			(void) strcpy(zap.za_name, "..");
2608			zap.za_normalization_conflict = 0;
2609			objnum = parent;
2610			type = DT_DIR;
2611		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2612			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2613			zap.za_normalization_conflict = 0;
2614			objnum = ZFSCTL_INO_ROOT;
2615			type = DT_DIR;
2616		} else {
2617			/*
2618			 * Grab next entry.
2619			 */
2620			if (error = zap_cursor_retrieve(&zc, &zap)) {
2621				if ((*eofp = (error == ENOENT)) != 0)
2622					break;
2623				else
2624					goto update;
2625			}
2626
2627			if (zap.za_integer_length != 8 ||
2628			    zap.za_num_integers != 1) {
2629				cmn_err(CE_WARN, "zap_readdir: bad directory "
2630				    "entry, obj = %lld, offset = %lld\n",
2631				    (u_longlong_t)zp->z_id,
2632				    (u_longlong_t)offset);
2633				error = SET_ERROR(ENXIO);
2634				goto update;
2635			}
2636
2637			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2638			/*
2639			 * MacOS X can extract the object type here such as:
2640			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2641			 */
2642			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2643
2644			if (check_sysattrs && !zap.za_normalization_conflict) {
2645#ifdef TODO
2646				zap.za_normalization_conflict =
2647				    xattr_sysattr_casechk(zap.za_name);
2648#else
2649				panic("%s:%u: TODO", __func__, __LINE__);
2650#endif
2651			}
2652		}
2653
2654		if (flags & V_RDDIR_ACCFILTER) {
2655			/*
2656			 * If we have no access at all, don't include
2657			 * this entry in the returned information
2658			 */
2659			znode_t	*ezp;
2660			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2661				goto skip_entry;
2662			if (!zfs_has_access(ezp, cr)) {
2663				VN_RELE(ZTOV(ezp));
2664				goto skip_entry;
2665			}
2666			VN_RELE(ZTOV(ezp));
2667		}
2668
2669		if (flags & V_RDDIR_ENTFLAGS)
2670			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2671		else
2672			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2673
2674		/*
2675		 * Will this entry fit in the buffer?
2676		 */
2677		if (outcount + reclen > bufsize) {
2678			/*
2679			 * Did we manage to fit anything in the buffer?
2680			 */
2681			if (!outcount) {
2682				error = SET_ERROR(EINVAL);
2683				goto update;
2684			}
2685			break;
2686		}
2687		if (flags & V_RDDIR_ENTFLAGS) {
2688			/*
2689			 * Add extended flag entry:
2690			 */
2691			eodp->ed_ino = objnum;
2692			eodp->ed_reclen = reclen;
2693			/* NOTE: ed_off is the offset for the *next* entry */
2694			next = &(eodp->ed_off);
2695			eodp->ed_eflags = zap.za_normalization_conflict ?
2696			    ED_CASE_CONFLICT : 0;
2697			(void) strncpy(eodp->ed_name, zap.za_name,
2698			    EDIRENT_NAMELEN(reclen));
2699			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2700		} else {
2701			/*
2702			 * Add normal entry:
2703			 */
2704			odp->d_ino = objnum;
2705			odp->d_reclen = reclen;
2706			odp->d_namlen = strlen(zap.za_name);
2707			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2708			odp->d_type = type;
2709			odp = (dirent64_t *)((intptr_t)odp + reclen);
2710		}
2711		outcount += reclen;
2712
2713		ASSERT(outcount <= bufsize);
2714
2715		/* Prefetch znode */
2716		if (prefetch)
2717			dmu_prefetch(os, objnum, 0, 0, 0,
2718			    ZIO_PRIORITY_SYNC_READ);
2719
2720	skip_entry:
2721		/*
2722		 * Move to the next entry, fill in the previous offset.
2723		 */
2724		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2725			zap_cursor_advance(&zc);
2726			offset = zap_cursor_serialize(&zc);
2727		} else {
2728			offset += 1;
2729		}
2730
2731		if (cooks != NULL) {
2732			*cooks++ = offset;
2733			ncooks--;
2734			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2735		}
2736	}
2737	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2738
2739	/* Subtract unused cookies */
2740	if (ncookies != NULL)
2741		*ncookies -= ncooks;
2742
2743	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2744		iovp->iov_base += outcount;
2745		iovp->iov_len -= outcount;
2746		uio->uio_resid -= outcount;
2747	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2748		/*
2749		 * Reset the pointer.
2750		 */
2751		offset = uio->uio_loffset;
2752	}
2753
2754update:
2755	zap_cursor_fini(&zc);
2756	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2757		kmem_free(outbuf, bufsize);
2758
2759	if (error == ENOENT)
2760		error = 0;
2761
2762	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2763
2764	uio->uio_loffset = offset;
2765	ZFS_EXIT(zfsvfs);
2766	if (error != 0 && cookies != NULL) {
2767		free(*cookies, M_TEMP);
2768		*cookies = NULL;
2769		*ncookies = 0;
2770	}
2771	return (error);
2772}
2773
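/*
 * Seed value for the zfs_fsyncer_key thread-specific counter set below.
 * A thread that calls fsync is assumed likely to call it again soon, so
 * the ZIL write-logging path decrements this counter and, while it is
 * non-zero, logs that thread's writes as synchronous itxs.
 */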
2774ulong_t zfs_fsync_sync_cnt = 4;
2775
2776static int
2777zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2778{
2779	znode_t	*zp = VTOZ(vp);
2780	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2781
2782	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2783
2784	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2785		ZFS_ENTER(zfsvfs);
2786		ZFS_VERIFY_ZP(zp);
2787		zil_commit(zfsvfs->z_log, zp->z_id);
2788		ZFS_EXIT(zfsvfs);
2789	}
2790	return (0);
2791}
2792
2793
2794/*
2795 * Get the requested file attributes and place them in the provided
2796 * vattr structure.
2797 *
2798 *	IN:	vp	- vnode of file.
2799 *		vap	- va_mask identifies requested attributes.
2800 *			  If AT_XVATTR set, then optional attrs are requested
2801 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2802 *		cr	- credentials of caller.
2803 *		ct	- caller context
2804 *
2805 *	OUT:	vap	- attribute values.
2806 *
2807 *	RETURN:	0 (always succeeds).
2808 */
2809/* ARGSUSED */
2810static int
2811zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2812    caller_context_t *ct)
2813{
2814	znode_t *zp = VTOZ(vp);
2815	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2816	int	error = 0;
2817	uint32_t blksize;
2818	u_longlong_t nblocks;
2819	uint64_t links;
2820	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2821	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2822	xoptattr_t *xoap = NULL;
2823	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2824	sa_bulk_attr_t bulk[4];
2825	int count = 0;
2826
2827	ZFS_ENTER(zfsvfs);
2828	ZFS_VERIFY_ZP(zp);
2829
2830	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2831
2832	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2833	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2834	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2835	if (vp->v_type == VBLK || vp->v_type == VCHR)
2836		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2837		    &rdev, 8);
2838
2839	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2840		ZFS_EXIT(zfsvfs);
2841		return (error);
2842	}
2843
2844	/*
2845	 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
2846	 * Also, if we are the owner, don't bother, since the owner should
2847	 * always be allowed to read the basic attributes of a file.
2848	 */
2849	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2850	    (vap->va_uid != crgetuid(cr))) {
2851		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2852		    skipaclchk, cr)) {
2853			ZFS_EXIT(zfsvfs);
2854			return (error);
2855		}
2856	}
2857
2858	/*
2859	 * Return all attributes.  It's cheaper to provide the answer
2860	 * than to determine whether we were asked the question.
2861	 */
2862
2863	mutex_enter(&zp->z_lock);
2864	vap->va_type = IFTOVT(zp->z_mode);
2865	vap->va_mode = zp->z_mode & ~S_IFMT;
2866#ifdef illumos
2867	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2868#else
2869	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2870#endif
2871	vap->va_nodeid = zp->z_id;
2872	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2873		links = zp->z_links + 1;
2874	else
2875		links = zp->z_links;
2876	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2877	vap->va_size = zp->z_size;
2878#ifdef illumos
2879	vap->va_rdev = vp->v_rdev;
2880#else
2881	if (vp->v_type == VBLK || vp->v_type == VCHR)
2882		vap->va_rdev = zfs_cmpldev(rdev);
2883#endif
2884	vap->va_seq = zp->z_seq;
2885	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2886	vap->va_filerev = zp->z_seq;
2887
2888	/*
2889	 * Add in any requested optional attributes and the create time.
2890	 * Also set the corresponding bits in the returned attribute bitmap.
2891	 */
2892	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2893		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2894			xoap->xoa_archive =
2895			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2896			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2897		}
2898
2899		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2900			xoap->xoa_readonly =
2901			    ((zp->z_pflags & ZFS_READONLY) != 0);
2902			XVA_SET_RTN(xvap, XAT_READONLY);
2903		}
2904
2905		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2906			xoap->xoa_system =
2907			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2908			XVA_SET_RTN(xvap, XAT_SYSTEM);
2909		}
2910
2911		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2912			xoap->xoa_hidden =
2913			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2914			XVA_SET_RTN(xvap, XAT_HIDDEN);
2915		}
2916
2917		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2918			xoap->xoa_nounlink =
2919			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2920			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2921		}
2922
2923		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2924			xoap->xoa_immutable =
2925			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2926			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2927		}
2928
2929		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2930			xoap->xoa_appendonly =
2931			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2932			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2933		}
2934
2935		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2936			xoap->xoa_nodump =
2937			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2938			XVA_SET_RTN(xvap, XAT_NODUMP);
2939		}
2940
2941		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2942			xoap->xoa_opaque =
2943			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2944			XVA_SET_RTN(xvap, XAT_OPAQUE);
2945		}
2946
2947		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2948			xoap->xoa_av_quarantined =
2949			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2950			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2951		}
2952
2953		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2954			xoap->xoa_av_modified =
2955			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2956			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2957		}
2958
2959		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2960		    vp->v_type == VREG) {
2961			zfs_sa_get_scanstamp(zp, xvap);
2962		}
2963
2964		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2965			uint64_t times[2];
2966
2967			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2968			    times, sizeof (times));
2969			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2970			XVA_SET_RTN(xvap, XAT_CREATETIME);
2971		}
2972
2973		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2974			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2975			XVA_SET_RTN(xvap, XAT_REPARSE);
2976		}
2977		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2978			xoap->xoa_generation = zp->z_gen;
2979			XVA_SET_RTN(xvap, XAT_GEN);
2980		}
2981
2982		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2983			xoap->xoa_offline =
2984			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2985			XVA_SET_RTN(xvap, XAT_OFFLINE);
2986		}
2987
2988		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2989			xoap->xoa_sparse =
2990			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2991			XVA_SET_RTN(xvap, XAT_SPARSE);
2992		}
2993	}
2994
2995	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2996	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2997	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2998	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2999
3000	mutex_exit(&zp->z_lock);
3001
3002	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
3003	vap->va_blksize = blksize;
3004	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
3005
3006	if (zp->z_blksz == 0) {
3007		/*
3008		 * Block size hasn't been set; suggest maximal I/O transfers.
3009		 */
3010		vap->va_blksize = zfsvfs->z_max_blksz;
3011	}
3012
3013	ZFS_EXIT(zfsvfs);
3014	return (0);
3015}
3016
3017/*
3018 * Set the file attributes to the values contained in the
3019 * vattr structure.
3020 *
3021 *	IN:	vp	- vnode of file to be modified.
3022 *		vap	- new attribute values.
3023 *			  If AT_XVATTR set, then optional attrs are being set
3024 *		flags	- ATTR_UTIME set if non-default time values provided.
3025 *			- ATTR_NOACLCHECK (CIFS context only).
3026 *		cr	- credentials of caller.
3027 *		ct	- caller context
3028 *
3029 *	RETURN:	0 on success, error code on failure.
3030 *
3031 * Timestamps:
3032 *	vp - ctime updated, mtime updated if size changed.
3033 */
3034/* ARGSUSED */
3035static int
3036zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3037    caller_context_t *ct)
3038{
3039	znode_t		*zp = VTOZ(vp);
3040	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3041	zilog_t		*zilog;
3042	dmu_tx_t	*tx;
3043	vattr_t		oldva;
3044	xvattr_t	tmpxvattr;
3045	uint_t		mask = vap->va_mask;
3046	uint_t		saved_mask = 0;
3047	uint64_t	saved_mode;
3048	int		trim_mask = 0;
3049	uint64_t	new_mode;
3050	uint64_t	new_uid, new_gid;
3051	uint64_t	xattr_obj;
3052	uint64_t	mtime[2], ctime[2];
3053	znode_t		*attrzp;
3054	int		need_policy = FALSE;
3055	int		err, err2;
3056	zfs_fuid_info_t *fuidp = NULL;
3057	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
3058	xoptattr_t	*xoap;
3059	zfs_acl_t	*aclp;
3060	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3061	boolean_t	fuid_dirtied = B_FALSE;
3062	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
3063	int		count = 0, xattr_count = 0;
3064
3065	if (mask == 0)
3066		return (0);
3067
3068	if (mask & AT_NOSET)
3069		return (SET_ERROR(EINVAL));
3070
3071	ZFS_ENTER(zfsvfs);
3072	ZFS_VERIFY_ZP(zp);
3073
3074	zilog = zfsvfs->z_log;
3075
3076	/*
3077	 * Make sure that if we have ephemeral uid/gid or xvattr specified
3078	 * that file system is at proper version level
3079	 */
3080
3081	if (zfsvfs->z_use_fuids == B_FALSE &&
3082	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3083	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3084	    (mask & AT_XVATTR))) {
3085		ZFS_EXIT(zfsvfs);
3086		return (SET_ERROR(EINVAL));
3087	}
3088
3089	if (mask & AT_SIZE && vp->v_type == VDIR) {
3090		ZFS_EXIT(zfsvfs);
3091		return (SET_ERROR(EISDIR));
3092	}
3093
3094	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3095		ZFS_EXIT(zfsvfs);
3096		return (SET_ERROR(EINVAL));
3097	}
3098
3099	/*
3100	 * If this is an xvattr_t, then get a pointer to the structure of
3101	 * optional attributes.  If this is NULL, then we have a vattr_t.
3102	 */
3103	xoap = xva_getxoptattr(xvap);
3104
3105	xva_init(&tmpxvattr);
3106
3107	/*
3108	 * Immutable files can only alter immutable bit and atime
3109	 */
3110	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3111	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3112	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3113		ZFS_EXIT(zfsvfs);
3114		return (SET_ERROR(EPERM));
3115	}
3116
3117	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
3118		ZFS_EXIT(zfsvfs);
3119		return (SET_ERROR(EPERM));
3120	}
3121
3122	/*
3123	 * Verify timestamps doesn't overflow 32 bits.
3124	 * Verify the timestamps don't overflow 32 bits.
3125	 * handle times greater than 2039.  This check should be removed
3126	 * once large timestamps are fully supported.
3127	 */
3128	if (mask & (AT_ATIME | AT_MTIME)) {
3129		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3130		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3131			ZFS_EXIT(zfsvfs);
3132			return (SET_ERROR(EOVERFLOW));
3133		}
3134	}
3135
3136top:
3137	attrzp = NULL;
3138	aclp = NULL;
3139
3140	/* Can this be moved to before the top label? */
3141	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3142		ZFS_EXIT(zfsvfs);
3143		return (SET_ERROR(EROFS));
3144	}
3145
3146	/*
3147	 * First validate permissions
3148	 */
3149
3150	if (mask & AT_SIZE) {
3151		/*
3152		 * XXX - Note, we are not providing any open
3153		 * mode flags here (like FNDELAY), so we may
3154		 * block if there are locks present... this
3155		 * should be addressed in openat().
3156		 */
3157		/* XXX - would it be OK to generate a log record here? */
3158		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3159		if (err) {
3160			ZFS_EXIT(zfsvfs);
3161			return (err);
3162		}
3163	}
3164
3165	if (mask & (AT_ATIME|AT_MTIME) ||
3166	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3167	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3168	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3169	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3170	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3171	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3172	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3173		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3174		    skipaclchk, cr);
3175	}
3176
3177	if (mask & (AT_UID|AT_GID)) {
3178		int	idmask = (mask & (AT_UID|AT_GID));
3179		int	take_owner;
3180		int	take_group;
3181
3182		/*
3183		 * NOTE: even if a new mode is being set,
3184		 * we may clear S_ISUID/S_ISGID bits.
3185		 */
3186
3187		if (!(mask & AT_MODE))
3188			vap->va_mode = zp->z_mode;
3189
3190		/*
3191		 * Take ownership or chgrp to group we are a member of
3192		 * Take ownership or chgrp to a group we are a member of.
3193
3194		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3195		take_group = (mask & AT_GID) &&
3196		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3197
3198		/*
3199		 * If both AT_UID and AT_GID are set then take_owner and
3200		 * take_group must both be set in order to allow taking
3201		 * ownership.
3202		 *
3203		 * Otherwise, send the check through secpolicy_vnode_setattr()
3204		 *
3205		 */
3206
3207		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3208		    ((idmask == AT_UID) && take_owner) ||
3209		    ((idmask == AT_GID) && take_group)) {
3210			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3211			    skipaclchk, cr) == 0) {
3212				/*
3213				 * Remove setuid/setgid for non-privileged users
3214				 */
3215				secpolicy_setid_clear(vap, vp, cr);
3216				trim_mask = (mask & (AT_UID|AT_GID));
3217			} else {
3218				need_policy = TRUE;
3219			}
3220		} else {
3221			need_policy = TRUE;
3222		}
3223	}
3224
3225	mutex_enter(&zp->z_lock);
3226	oldva.va_mode = zp->z_mode;
3227	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3228	if (mask & AT_XVATTR) {
3229		/*
3230		 * Update xvattr mask to include only those attributes
3231		 * that are actually changing.
3232		 *
3233		 * The bits will be restored prior to actually setting
3234		 * the attributes so the caller thinks they were set.
3235		 */
3236		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3237			if (xoap->xoa_appendonly !=
3238			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3239				need_policy = TRUE;
3240			} else {
3241				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3242				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3243			}
3244		}
3245
3246		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3247			if (xoap->xoa_nounlink !=
3248			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3249				need_policy = TRUE;
3250			} else {
3251				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3252				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3253			}
3254		}
3255
3256		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3257			if (xoap->xoa_immutable !=
3258			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3259				need_policy = TRUE;
3260			} else {
3261				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3262				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3263			}
3264		}
3265
3266		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3267			if (xoap->xoa_nodump !=
3268			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3269				need_policy = TRUE;
3270			} else {
3271				XVA_CLR_REQ(xvap, XAT_NODUMP);
3272				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3273			}
3274		}
3275
3276		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3277			if (xoap->xoa_av_modified !=
3278			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3279				need_policy = TRUE;
3280			} else {
3281				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3282				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3283			}
3284		}
3285
3286		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3287			if ((vp->v_type != VREG &&
3288			    xoap->xoa_av_quarantined) ||
3289			    xoap->xoa_av_quarantined !=
3290			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3291				need_policy = TRUE;
3292			} else {
3293				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3294				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3295			}
3296		}
3297
3298		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3299			mutex_exit(&zp->z_lock);
3300			ZFS_EXIT(zfsvfs);
3301			return (SET_ERROR(EPERM));
3302		}
3303
3304		if (need_policy == FALSE &&
3305		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3306		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3307			need_policy = TRUE;
3308		}
3309	}
3310
3311	mutex_exit(&zp->z_lock);
3312
3313	if (mask & AT_MODE) {
3314		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3315			err = secpolicy_setid_setsticky_clear(vp, vap,
3316			    &oldva, cr);
3317			if (err) {
3318				ZFS_EXIT(zfsvfs);
3319				return (err);
3320			}
3321			trim_mask |= AT_MODE;
3322		} else {
3323			need_policy = TRUE;
3324		}
3325	}
3326
3327	if (need_policy) {
3328		 * If trim_mask is set, then take-ownership has been
3329		 * granted or write_acl is present and the user has the
3330		 * ability to modify the mode.  In that case remove
3331		 * UID|GID and/or MODE from the mask so that
3332		 * secpolicy_vnode_setattr() doesn't revoke it.
3333		 * secpolicy_vnode_setattr() doesn't revoke it.
3334		 */
3335
3336		if (trim_mask) {
3337			saved_mask = vap->va_mask;
3338			vap->va_mask &= ~trim_mask;
3339			if (trim_mask & AT_MODE) {
3340				/*
3341				 * Save the mode, as secpolicy_vnode_setattr()
3342				 * will overwrite it with ova.va_mode.
3343				 */
3344				saved_mode = vap->va_mode;
3345			}
3346		}
3347		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3348		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3349		if (err) {
3350			ZFS_EXIT(zfsvfs);
3351			return (err);
3352		}
3353
3354		if (trim_mask) {
3355			vap->va_mask |= saved_mask;
3356			if (trim_mask & AT_MODE) {
3357				/*
3358				 * Recover the mode after
3359				 * secpolicy_vnode_setattr().
3360				 */
3361				vap->va_mode = saved_mode;
3362			}
3363		}
3364	}
3365
3366	/*
3367	 * secpolicy_vnode_setattr() or the take-ownership path may have
3368	 * changed va_mask.
3369	 */
3370	mask = vap->va_mask;
3371
3372	if ((mask & (AT_UID | AT_GID))) {
3373		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3374		    &xattr_obj, sizeof (xattr_obj));
3375
3376		if (err == 0 && xattr_obj) {
3377			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3378			if (err)
3379				goto out2;
3380		}
3381		if (mask & AT_UID) {
3382			new_uid = zfs_fuid_create(zfsvfs,
3383			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3384			if (new_uid != zp->z_uid &&
3385			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3386				if (attrzp)
3387					VN_RELE(ZTOV(attrzp));
3388				err = SET_ERROR(EDQUOT);
3389				goto out2;
3390			}
3391		}
3392
3393		if (mask & AT_GID) {
3394			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3395			    cr, ZFS_GROUP, &fuidp);
3396			if (new_gid != zp->z_gid &&
3397			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3398				if (attrzp)
3399					VN_RELE(ZTOV(attrzp));
3400				err = SET_ERROR(EDQUOT);
3401				goto out2;
3402			}
3403		}
3404	}
3405	tx = dmu_tx_create(zfsvfs->z_os);
3406
3407	if (mask & AT_MODE) {
3408		uint64_t pmode = zp->z_mode;
3409		uint64_t acl_obj;
3410		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3411
3412		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3413		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3414			err = SET_ERROR(EPERM);
3415			goto out;
3416		}
3417
3418		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3419			goto out;
3420
3421		mutex_enter(&zp->z_lock);
3422		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3423			/*
3424			 * Are we upgrading ACL from old V0 format
3425			 * to V1 format?
3426			 */
3427			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3428			    zfs_znode_acl_version(zp) ==
3429			    ZFS_ACL_VERSION_INITIAL) {
3430				dmu_tx_hold_free(tx, acl_obj, 0,
3431				    DMU_OBJECT_END);
3432				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3433				    0, aclp->z_acl_bytes);
3434			} else {
3435				dmu_tx_hold_write(tx, acl_obj, 0,
3436				    aclp->z_acl_bytes);
3437			}
3438		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3439			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3440			    0, aclp->z_acl_bytes);
3441		}
3442		mutex_exit(&zp->z_lock);
3443		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3444	} else {
3445		if ((mask & AT_XVATTR) &&
3446		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3447			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3448		else
3449			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3450	}
3451
3452	if (attrzp) {
3453		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3454	}
3455
3456	fuid_dirtied = zfsvfs->z_fuid_dirty;
3457	if (fuid_dirtied)
3458		zfs_fuid_txhold(zfsvfs, tx);
3459
3460	zfs_sa_upgrade_txholds(tx, zp);
3461
3462	err = dmu_tx_assign(tx, TXG_WAIT);
3463	if (err)
3464		goto out;
3465
3466	count = 0;
3467	/*
3468	 * Set each attribute requested.
3469	 * We group settings according to the locks they need to acquire.
3470	 *
3471	 * Note: you cannot set ctime directly, although it will be
3472	 * updated as a side-effect of calling this function.
3473	 */
3474
3475
3476	if (mask & (AT_UID|AT_GID|AT_MODE))
3477		mutex_enter(&zp->z_acl_lock);
3478	mutex_enter(&zp->z_lock);
3479
3480	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3481	    &zp->z_pflags, sizeof (zp->z_pflags));
3482
3483	if (attrzp) {
3484		if (mask & (AT_UID|AT_GID|AT_MODE))
3485			mutex_enter(&attrzp->z_acl_lock);
3486		mutex_enter(&attrzp->z_lock);
3487		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3488		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3489		    sizeof (attrzp->z_pflags));
3490	}
3491
3492	if (mask & (AT_UID|AT_GID)) {
3493
3494		if (mask & AT_UID) {
3495			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3496			    &new_uid, sizeof (new_uid));
3497			zp->z_uid = new_uid;
3498			if (attrzp) {
3499				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3500				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3501				    sizeof (new_uid));
3502				attrzp->z_uid = new_uid;
3503			}
3504		}
3505
3506		if (mask & AT_GID) {
3507			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3508			    NULL, &new_gid, sizeof (new_gid));
3509			zp->z_gid = new_gid;
3510			if (attrzp) {
3511				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3512				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3513				    sizeof (new_gid));
3514				attrzp->z_gid = new_gid;
3515			}
3516		}
3517		if (!(mask & AT_MODE)) {
3518			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3519			    NULL, &new_mode, sizeof (new_mode));
3520			new_mode = zp->z_mode;
3521		}
3522		err = zfs_acl_chown_setattr(zp);
3523		ASSERT(err == 0);
3524		if (attrzp) {
3525			err = zfs_acl_chown_setattr(attrzp);
3526			ASSERT(err == 0);
3527		}
3528	}
3529
3530	if (mask & AT_MODE) {
3531		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3532		    &new_mode, sizeof (new_mode));
3533		zp->z_mode = new_mode;
3534		ASSERT3U((uintptr_t)aclp, !=, 0);
3535		err = zfs_aclset_common(zp, aclp, cr, tx);
3536		ASSERT0(err);
3537		if (zp->z_acl_cached)
3538			zfs_acl_free(zp->z_acl_cached);
3539		zp->z_acl_cached = aclp;
3540		aclp = NULL;
3541	}
3542
3543
3544	if (mask & AT_ATIME) {
3545		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3546		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3547		    &zp->z_atime, sizeof (zp->z_atime));
3548	}
3549
3550	if (mask & AT_MTIME) {
3551		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3552		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3553		    mtime, sizeof (mtime));
3554	}
3555
3556	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3557	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3558		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3559		    NULL, mtime, sizeof (mtime));
3560		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3561		    &ctime, sizeof (ctime));
3562		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3563		    B_TRUE);
3564	} else if (mask != 0) {
3565		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3566		    &ctime, sizeof (ctime));
3567		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3568		    B_TRUE);
3569		if (attrzp) {
3570			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3571			    SA_ZPL_CTIME(zfsvfs), NULL,
3572			    &ctime, sizeof (ctime));
3573			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3574			    mtime, ctime, B_TRUE);
3575		}
3576	}
3577	/*
3578	 * Do this after setting timestamps to prevent timestamp
3579	 * update from toggling the bit.
3580	 */
3581
3582	if (xoap && (mask & AT_XVATTR)) {
3583
3584		/*
3585		 * Restore the masks that were trimmed off so that the
3586		 * return masks can be set for the caller.
3587		 */
3588
3589		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3590			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3591		}
3592		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3593			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3594		}
3595		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3596			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3597		}
3598		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3599			XVA_SET_REQ(xvap, XAT_NODUMP);
3600		}
3601		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3602			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3603		}
3604		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3605			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3606		}
3607
3608		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3609			ASSERT(vp->v_type == VREG);
3610
3611		zfs_xvattr_set(zp, xvap, tx);
3612	}
3613
3614	if (fuid_dirtied)
3615		zfs_fuid_sync(zfsvfs, tx);
3616
3617	if (mask != 0)
3618		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3619
3620	mutex_exit(&zp->z_lock);
3621	if (mask & (AT_UID|AT_GID|AT_MODE))
3622		mutex_exit(&zp->z_acl_lock);
3623
3624	if (attrzp) {
3625		if (mask & (AT_UID|AT_GID|AT_MODE))
3626			mutex_exit(&attrzp->z_acl_lock);
3627		mutex_exit(&attrzp->z_lock);
3628	}
3629out:
3630	if (err == 0 && attrzp) {
3631		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3632		    xattr_count, tx);
3633		ASSERT(err2 == 0);
3634	}
3635
3636	if (attrzp)
3637		VN_RELE(ZTOV(attrzp));
3638
3639	if (aclp)
3640		zfs_acl_free(aclp);
3641
3642	if (fuidp) {
3643		zfs_fuid_info_free(fuidp);
3644		fuidp = NULL;
3645	}
3646
3647	if (err) {
3648		dmu_tx_abort(tx);
3649		if (err == ERESTART)
3650			goto top;
3651	} else {
3652		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3653		dmu_tx_commit(tx);
3654	}
3655
3656out2:
3657	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3658		zil_commit(zilog, 0);
3659
3660	ZFS_EXIT(zfsvfs);
3661	return (err);
3662}
3663
3664typedef struct zfs_zlock {
3665	krwlock_t	*zl_rwlock;	/* lock we acquired */
3666	znode_t		*zl_znode;	/* znode we held */
3667	struct zfs_zlock *zl_next;	/* next in list */
3668} zfs_zlock_t;
3669
3670/*
3671 * Drop locks and release vnodes that were held by zfs_rename_lock().
3672 */
3673static void
3674zfs_rename_unlock(zfs_zlock_t **zlpp)
3675{
3676	zfs_zlock_t *zl;
3677
3678	while ((zl = *zlpp) != NULL) {
3679		if (zl->zl_znode != NULL)
3680			VN_RELE(ZTOV(zl->zl_znode));
3681		rw_exit(zl->zl_rwlock);
3682		*zlpp = zl->zl_next;
3683		kmem_free(zl, sizeof (*zl));
3684	}
3685}
3686
3687/*
3688 * Search back through the directory tree, using the ".." entries.
3689 * Lock each directory in the chain to prevent concurrent renames.
3690 * Fail any attempt to move a directory into one of its own descendants.
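 * (For example, renaming /a into /a/b walks the ".." chain from /a/b and
 * reaches /a itself, so the attempt fails with EINVAL.)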
3691 * XXX - z_parent_lock can overlap with map or grow locks
3692 */
3693static int
3694zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3695{
3696	zfs_zlock_t	*zl;
3697	znode_t		*zp = tdzp;
3698	uint64_t	rootid = zp->z_zfsvfs->z_root;
3699	uint64_t	oidp = zp->z_id;
3700	krwlock_t	*rwlp = &szp->z_parent_lock;
3701	krw_t		rw = RW_WRITER;
3702
3703	/*
3704	 * First pass write-locks szp and compares to zp->z_id.
3705	 * Later passes read-lock zp and compare to zp->z_parent.
3706	 */
3707	do {
3708		if (!rw_tryenter(rwlp, rw)) {
3709			/*
3710			 * Another thread is renaming in this path.
3711			 * Note that if we are a WRITER, we don't have any
3712			 * parent_locks held yet.
3713			 */
3714			if (rw == RW_READER && zp->z_id > szp->z_id) {
3715				/*
3716				 * Drop our locks and restart
3717				 */
3718				zfs_rename_unlock(&zl);
3719				*zlpp = NULL;
3720				zp = tdzp;
3721				oidp = zp->z_id;
3722				rwlp = &szp->z_parent_lock;
3723				rw = RW_WRITER;
3724				continue;
3725			} else {
3726				/*
3727				 * Wait for other thread to drop its locks
3728				 */
3729				rw_enter(rwlp, rw);
3730			}
3731		}
3732
3733		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3734		zl->zl_rwlock = rwlp;
3735		zl->zl_znode = NULL;
3736		zl->zl_next = *zlpp;
3737		*zlpp = zl;
3738
3739		if (oidp == szp->z_id)		/* We're a descendant of szp */
3740			return (SET_ERROR(EINVAL));
3741
3742		if (oidp == rootid)		/* We've hit the top */
3743			return (0);
3744
3745		if (rw == RW_READER) {		/* i.e. not the first pass */
3746			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3747			if (error)
3748				return (error);
3749			zl->zl_znode = zp;
3750		}
3751		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3752		    &oidp, sizeof (oidp));
3753		rwlp = &zp->z_parent_lock;
3754		rw = RW_READER;
3755
3756	} while (zp->z_id != sdzp->z_id);
3757
3758	return (0);
3759}
3760
3761/*
3762 * Move an entry from the provided source directory to the target
3763 * directory.  Change the entry name as indicated.
3764 *
3765 *	IN:	sdvp	- Source directory containing the "old entry".
3766 *		snm	- Old entry name.
3767 *		tdvp	- Target directory to contain the "new entry".
3768 *		tnm	- New entry name.
3769 *		cr	- credentials of caller.
3770 *		ct	- caller context
3771 *		flags	- case flags
3772 *
3773 *	RETURN:	0 on success, error code on failure.
3774 *
3775 * Timestamps:
3776 *	sdvp,tdvp - ctime|mtime updated
3777 */
3778/*ARGSUSED*/
3779static int
3780zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3781    caller_context_t *ct, int flags)
3782{
3783	znode_t		*tdzp, *sdzp, *szp, *tzp;
3784	zfsvfs_t 	*zfsvfs;
3785	zilog_t		*zilog;
3786	vnode_t		*realvp;
3787	zfs_dirlock_t	*sdl, *tdl;
3788	dmu_tx_t	*tx;
3789	zfs_zlock_t	*zl;
3790	int		cmp, serr, terr;
3791	int		error = 0;
3792	int		zflg = 0;
3793	boolean_t	waited = B_FALSE;
3794
3795	tdzp = VTOZ(tdvp);
3796	ZFS_VERIFY_ZP(tdzp);
3797	zfsvfs = tdzp->z_zfsvfs;
3798	ZFS_ENTER(zfsvfs);
3799	zilog = zfsvfs->z_log;
3800	sdzp = VTOZ(sdvp);
3801
3802	/*
3803	 * In case sdzp is not valid, let's be sure to exit from the right
3804	 * zfsvfs_t.
3805	 */
3806	if (sdzp->z_sa_hdl == NULL) {
3807		ZFS_EXIT(zfsvfs);
3808		return (SET_ERROR(EIO));
3809	}
3810
3811	/*
3812	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3813	 * ctldir appear to have the same v_vfsp.
3814	 */
3815	if (sdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3816		ZFS_EXIT(zfsvfs);
3817		return (SET_ERROR(EXDEV));
3818	}
3819
3820	if (zfsvfs->z_utf8 && u8_validate(tnm,
3821	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3822		ZFS_EXIT(zfsvfs);
3823		return (SET_ERROR(EILSEQ));
3824	}
3825
3826	if (flags & FIGNORECASE)
3827		zflg |= ZCILOOK;
3828
3829top:
3830	szp = NULL;
3831	tzp = NULL;
3832	zl = NULL;
3833
3834	/*
3835	 * This is to prevent the creation of links into attribute space
3836	 * by renaming a linked file into/out of an attribute directory.
3837	 * See the comment in zfs_link() for why this is considered bad.
3838	 */
3839	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3840		ZFS_EXIT(zfsvfs);
3841		return (SET_ERROR(EINVAL));
3842	}
3843
3844	/*
3845	 * Lock source and target directory entries.  To prevent deadlock,
3846	 * a lock ordering must be defined.  We lock the directory with
3847	 * the smallest object id first, or if it's a tie, the one with
3848	 * the lexically first name.
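	 * This gives any two renames that touch the same directories a
	 * single global order in which to take the locks, so they cannot
	 * deadlock against each other.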
3849	 */
3850	if (sdzp->z_id < tdzp->z_id) {
3851		cmp = -1;
3852	} else if (sdzp->z_id > tdzp->z_id) {
3853		cmp = 1;
3854	} else {
3855		/*
3856		 * First compare the two name arguments without
3857		 * considering any case folding.
3858		 */
3859		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3860
3861		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3862		ASSERT(error == 0 || !zfsvfs->z_utf8);
3863		if (cmp == 0) {
3864			/*
3865			 * POSIX: "If the old argument and the new argument
3866			 * both refer to links to the same existing file,
3867			 * the rename() function shall return successfully
3868			 * and perform no other action."
3869			 */
3870			ZFS_EXIT(zfsvfs);
3871			return (0);
3872		}
3873		/*
3874		 * If the file system is case-folding, then we may
3875		 * have some more checking to do.  A case-folding file
3876		 * system is either supporting mixed case sensitivity
3877		 * access or is completely case-insensitive.  Note
3878		 * that the file system is always case preserving.
3879		 *
3880		 * In mixed sensitivity mode case sensitive behavior
3881		 * is the default.  FIGNORECASE must be used to
3882		 * explicitly request case insensitive behavior.
3883		 *
3884		 * If the source and target names provided differ only
3885		 * by case (e.g., a request to rename 'tim' to 'Tim'),
3886		 * we will treat this as a special case in the
3887		 * case-insensitive mode: as long as the source name
3888		 * is an exact match, we will allow this to proceed as
3889		 * a name-change request.
3890		 */
3891		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3892		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
3893		    flags & FIGNORECASE)) &&
3894		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3895		    &error) == 0) {
3896			/*
3897			 * case preserving rename request, require exact
3898			 * name matches
3899			 */
3900			zflg |= ZCIEXACT;
3901			zflg &= ~ZCILOOK;
3902		}
3903	}
3904
3905	/*
3906	 * If the source and destination directories are the same, we should
3907	 * grab the z_name_lock of that directory only once.
3908	 */
3909	if (sdzp == tdzp) {
3910		zflg |= ZHAVELOCK;
3911		rw_enter(&sdzp->z_name_lock, RW_READER);
3912	}
3913
3914	if (cmp < 0) {
3915		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3916		    ZEXISTS | zflg, NULL, NULL);
3917		terr = zfs_dirent_lock(&tdl,
3918		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3919	} else {
3920		terr = zfs_dirent_lock(&tdl,
3921		    tdzp, tnm, &tzp, zflg, NULL, NULL);
3922		serr = zfs_dirent_lock(&sdl,
3923		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3924		    NULL, NULL);
3925	}
3926
3927	if (serr) {
3928		/*
3929		 * Source entry invalid or not there.
3930		 */
3931		if (!terr) {
3932			zfs_dirent_unlock(tdl);
3933			if (tzp)
3934				VN_RELE(ZTOV(tzp));
3935		}
3936
3937		if (sdzp == tdzp)
3938			rw_exit(&sdzp->z_name_lock);
3939
3940		/*
3941		 * FreeBSD: In OpenSolaris they only check if rename source is
3942		 * ".." here, because "." is handled in their lookup. This is
3943		 * not the case for FreeBSD, so we check for "." explicitly.
3944		 */
3945		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3946			serr = SET_ERROR(EINVAL);
3947		ZFS_EXIT(zfsvfs);
3948		return (serr);
3949	}
3950	if (terr) {
3951		zfs_dirent_unlock(sdl);
3952		VN_RELE(ZTOV(szp));
3953
3954		if (sdzp == tdzp)
3955			rw_exit(&sdzp->z_name_lock);
3956
3957		if (strcmp(tnm, "..") == 0)
3958			terr = SET_ERROR(EINVAL);
3959		ZFS_EXIT(zfsvfs);
3960		return (terr);
3961	}
3962
3963	/*
3964	 * Must have write access at the source to remove the old entry
3965	 * and write access at the target to create the new entry.
3966	 * Note that if target and source are the same, this can be
3967	 * done in a single check.
3968	 */
3969
3970	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3971		goto out;
3972
3973	if (ZTOV(szp)->v_type == VDIR) {
3974		/*
3975		 * Check to make sure rename is valid.
3976		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3977		 */
3978		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3979			goto out;
3980	}
3981
3982	/*
3983	 * Does target exist?
3984	 */
3985	if (tzp) {
3986		/*
3987		 * Source and target must be the same type.
3988		 */
3989		if (ZTOV(szp)->v_type == VDIR) {
3990			if (ZTOV(tzp)->v_type != VDIR) {
3991				error = SET_ERROR(ENOTDIR);
3992				goto out;
3993			}
3994		} else {
3995			if (ZTOV(tzp)->v_type == VDIR) {
3996				error = SET_ERROR(EISDIR);
3997				goto out;
3998			}
3999		}
4000		/*
4001		 * POSIX dictates that when the source and target
4002		 * entries refer to the same file object, rename
4003		 * must do nothing and exit without error.
4004		 */
4005		if (szp->z_id == tzp->z_id) {
4006			error = 0;
4007			goto out;
4008		}
4009	}
4010
4011	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
4012	if (tzp)
4013		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
4014
4015	/*
4016	 * Notify the target directory if it is not the same
4017	 * as the source directory.
4018	 */
4019	if (tdvp != sdvp) {
4020		vnevent_rename_dest_dir(tdvp, ct);
4021	}
4022
4023	tx = dmu_tx_create(zfsvfs->z_os);
4024	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4025	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
4026	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
4027	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
4028	if (sdzp != tdzp) {
4029		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
4030		zfs_sa_upgrade_txholds(tx, tdzp);
4031	}
4032	if (tzp) {
4033		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
4034		zfs_sa_upgrade_txholds(tx, tzp);
4035	}
4036
4037	zfs_sa_upgrade_txholds(tx, szp);
4038	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4039	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4040	if (error) {
4041		if (zl != NULL)
4042			zfs_rename_unlock(&zl);
4043		zfs_dirent_unlock(sdl);
4044		zfs_dirent_unlock(tdl);
4045
4046		if (sdzp == tdzp)
4047			rw_exit(&sdzp->z_name_lock);
4048
4049		VN_RELE(ZTOV(szp));
4050		if (tzp)
4051			VN_RELE(ZTOV(tzp));
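		/*
		 * ERESTART means the current open txg was full: wait for
		 * the next txg and retry the whole operation from 'top'.
		 * The retried dmu_tx_assign() passes TXG_WAITED so the
		 * transaction is not delayed a second time.
		 */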
4052		if (error == ERESTART) {
4053			waited = B_TRUE;
4054			dmu_tx_wait(tx);
4055			dmu_tx_abort(tx);
4056			goto top;
4057		}
4058		dmu_tx_abort(tx);
4059		ZFS_EXIT(zfsvfs);
4060		return (error);
4061	}
4062
4063	if (tzp)	/* Attempt to remove the existing target */
4064		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
4065
4066	if (error == 0) {
4067		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
4068		if (error == 0) {
4069			szp->z_pflags |= ZFS_AV_MODIFIED;
4070
4071			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
4072			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
4073			ASSERT0(error);
4074
4075			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
4076			if (error == 0) {
4077				zfs_log_rename(zilog, tx, TX_RENAME |
4078				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
4079				    sdl->dl_name, tdzp, tdl->dl_name, szp);
4080
4081				/*
4082				 * Update path information for the target vnode
4083				 */
4084				vn_renamepath(tdvp, ZTOV(szp), tnm,
4085				    strlen(tnm));
4086			} else {
4087				/*
4088				 * At this point, we have successfully created
4089				 * the target name, but have failed to remove
4090				 * the source name.  Since the create was done
4091				 * with the ZRENAMING flag, there are
4092				 * complications; for one, the link count is
4093				 * wrong.  The easiest way to deal with this
4094				 * is to remove the newly created target, and
4095				 * return the original error.  This must
4096				 * succeed; fortunately, it is very unlikely to
4097				 * fail, since we just created it.
4098				 */
4099				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
4100				    ZRENAMING, NULL), ==, 0);
4101			}
4102		}
4103#ifdef FREEBSD_NAMECACHE
4104		if (error == 0) {
4105			cache_purge(sdvp);
4106			cache_purge(tdvp);
4107			cache_purge(ZTOV(szp));
4108			if (tzp)
4109				cache_purge(ZTOV(tzp));
4110		}
4111#endif
4112	}
4113
4114	dmu_tx_commit(tx);
4115out:
4116	if (zl != NULL)
4117		zfs_rename_unlock(&zl);
4118
4119	zfs_dirent_unlock(sdl);
4120	zfs_dirent_unlock(tdl);
4121
4122	if (sdzp == tdzp)
4123		rw_exit(&sdzp->z_name_lock);
4124
4125
4126	VN_RELE(ZTOV(szp));
4127	if (tzp)
4128		VN_RELE(ZTOV(tzp));
4129
4130	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4131		zil_commit(zilog, 0);
4132
4133	ZFS_EXIT(zfsvfs);
4134
4135	return (error);
4136}
4137
4138/*
4139 * Insert the indicated symbolic reference entry into the directory.
4140 *
4141 *	IN:	dvp	- Directory to contain new symbolic link.
4142 *	IN:	dvp	- Directory to contain new symbolic link.
4143 *		name	- Name of directory entry for the new symlink.
4144 *		vap	- Attributes of new entry.
4145 *		link	- Target path of the symlink.
4146 *		cr	- credentials of caller.
4147 *		td	- calling thread
4148 *
4149 *	RETURN:	0 on success, error code on failure.
4150 * Timestamps:
4151 *	dvp - ctime|mtime updated
4152 */
4153/*ARGSUSED*/
4154static int
4155zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4156    cred_t *cr, kthread_t *td)
4157{
4158	znode_t		*zp, *dzp = VTOZ(dvp);
4159	zfs_dirlock_t	*dl;
4160	dmu_tx_t	*tx;
4161	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4162	zilog_t		*zilog;
4163	uint64_t	len = strlen(link);
4164	int		error;
4165	int		zflg = ZNEW;
4166	zfs_acl_ids_t	acl_ids;
4167	boolean_t	fuid_dirtied;
4168	uint64_t	txtype = TX_SYMLINK;
4169	boolean_t	waited = B_FALSE;
4170	int		flags = 0;
4171
4172	ASSERT(vap->va_type == VLNK);
4173
4174	ZFS_ENTER(zfsvfs);
4175	ZFS_VERIFY_ZP(dzp);
4176	zilog = zfsvfs->z_log;
4177
4178	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4179	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4180		ZFS_EXIT(zfsvfs);
4181		return (SET_ERROR(EILSEQ));
4182	}
4183	if (flags & FIGNORECASE)
4184		zflg |= ZCILOOK;
4185
4186	if (len > MAXPATHLEN) {
4187		ZFS_EXIT(zfsvfs);
4188		return (SET_ERROR(ENAMETOOLONG));
4189	}
4190
4191	if ((error = zfs_acl_ids_create(dzp, 0,
4192	    vap, cr, NULL, &acl_ids)) != 0) {
4193		ZFS_EXIT(zfsvfs);
4194		return (error);
4195	}
4196
4197	getnewvnode_reserve(1);
4198
4199top:
4200	/*
4201	 * Attempt to lock directory; fail if entry already exists.
4202	 */
4203	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4204	if (error) {
4205		zfs_acl_ids_free(&acl_ids);
4206		getnewvnode_drop_reserve();
4207		ZFS_EXIT(zfsvfs);
4208		return (error);
4209	}
4210
4211	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4212		zfs_acl_ids_free(&acl_ids);
4213		zfs_dirent_unlock(dl);
4214		getnewvnode_drop_reserve();
4215		ZFS_EXIT(zfsvfs);
4216		return (error);
4217	}
4218
4219	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4220		zfs_acl_ids_free(&acl_ids);
4221		zfs_dirent_unlock(dl);
4222		getnewvnode_drop_reserve();
4223		ZFS_EXIT(zfsvfs);
4224		return (SET_ERROR(EDQUOT));
4225	}
4226	tx = dmu_tx_create(zfsvfs->z_os);
4227	fuid_dirtied = zfsvfs->z_fuid_dirty;
4228	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4229	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4230	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4231	    ZFS_SA_BASE_ATTR_SIZE + len);
4232	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4233	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4234		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4235		    acl_ids.z_aclp->z_acl_bytes);
4236	}
4237	if (fuid_dirtied)
4238		zfs_fuid_txhold(zfsvfs, tx);
4239	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4240	if (error) {
4241		zfs_dirent_unlock(dl);
4242		if (error == ERESTART) {
4243			waited = B_TRUE;
4244			dmu_tx_wait(tx);
4245			dmu_tx_abort(tx);
4246			goto top;
4247		}
4248		zfs_acl_ids_free(&acl_ids);
4249		dmu_tx_abort(tx);
4250		getnewvnode_drop_reserve();
4251		ZFS_EXIT(zfsvfs);
4252		return (error);
4253	}
4254
4255	/*
4256	 * Create a new object for the symlink.
4257	 * For version 4 ZPL datasets the symlink will be an SA attribute.
4258	 */
4259	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4260
4261	if (fuid_dirtied)
4262		zfs_fuid_sync(zfsvfs, tx);
4263
4264	mutex_enter(&zp->z_lock);
4265	if (zp->z_is_sa)
4266		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4267		    link, len, tx);
4268	else
4269		zfs_sa_symlink(zp, link, len, tx);
4270	mutex_exit(&zp->z_lock);
4271
4272	zp->z_size = len;
4273	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4274	    &zp->z_size, sizeof (zp->z_size), tx);
4275	/*
4276	 * Insert the new object into the directory.
4277	 */
4278	(void) zfs_link_create(dl, zp, tx, ZNEW);
4279
4280	if (flags & FIGNORECASE)
4281		txtype |= TX_CI;
4282	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4283	*vpp = ZTOV(zp);
4284
4285	zfs_acl_ids_free(&acl_ids);
4286
4287	dmu_tx_commit(tx);
4288
4289	getnewvnode_drop_reserve();
4290
4291	zfs_dirent_unlock(dl);
4292
4293	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4294		zil_commit(zilog, 0);
4295
4296	ZFS_EXIT(zfsvfs);
4297	return (error);
4298}
4299
4300/*
4301 * Return, in the buffer contained in the provided uio structure,
4302 * the symbolic path referred to by vp.
4303 *
4304 *	IN:	vp	- vnode of symbolic link.
4305 *		uio	- structure to contain the link path.
4306 *		cr	- credentials of caller.
4307 *		ct	- caller context
4308 *
4309 *	OUT:	uio	- structure containing the link path.
4310 *
4311 *	RETURN:	0 on success, error code on failure.
4312 *
4313 * Timestamps:
4314 *	vp - atime updated
4315 */
4316/* ARGSUSED */
4317static int
4318zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4319{
4320	znode_t		*zp = VTOZ(vp);
4321	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4322	int		error;
4323
4324	ZFS_ENTER(zfsvfs);
4325	ZFS_VERIFY_ZP(zp);
4326
4327	mutex_enter(&zp->z_lock);
4328	if (zp->z_is_sa)
4329		error = sa_lookup_uio(zp->z_sa_hdl,
4330		    SA_ZPL_SYMLINK(zfsvfs), uio);
4331	else
4332		error = zfs_sa_readlink(zp, uio);
4333	mutex_exit(&zp->z_lock);
4334
4335	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4336
4337	ZFS_EXIT(zfsvfs);
4338	return (error);
4339}
4340
4341/*
4342 * Insert a new entry into directory tdvp referencing svp.
4343 *
4344 *	IN:	tdvp	- Directory to contain new entry.
4345 *		svp	- vnode of new entry.
4346 *		name	- name of new entry.
4347 *		cr	- credentials of caller.
4348 *		ct	- caller context
4349 *
4350 *	RETURN:	0 on success, error code on failure.
4351 *
4352 * Timestamps:
4353 *	tdvp - ctime|mtime updated
4354 *	 svp - ctime updated
4355 */
4356/* ARGSUSED */
4357static int
4358zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4359    caller_context_t *ct, int flags)
4360{
4361	znode_t		*dzp = VTOZ(tdvp);
4362	znode_t		*tzp, *szp;
4363	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4364	zilog_t		*zilog;
4365	zfs_dirlock_t	*dl;
4366	dmu_tx_t	*tx;
4367	vnode_t		*realvp;
4368	int		error;
4369	int		zf = ZNEW;
4370	uint64_t	parent;
4371	uid_t		owner;
4372	boolean_t	waited = B_FALSE;
4373
4374	ASSERT(tdvp->v_type == VDIR);
4375
4376	ZFS_ENTER(zfsvfs);
4377	ZFS_VERIFY_ZP(dzp);
4378	zilog = zfsvfs->z_log;
4379
4380	if (VOP_REALVP(svp, &realvp, ct) == 0)
4381		svp = realvp;
4382
4383	/*
4384	 * POSIX dictates that we return EPERM here.
4385	 * Better choices include ENOTSUP or EISDIR.
4386	 */
4387	if (svp->v_type == VDIR) {
4388		ZFS_EXIT(zfsvfs);
4389		return (SET_ERROR(EPERM));
4390	}
4391
4392	szp = VTOZ(svp);
4393	ZFS_VERIFY_ZP(szp);
4394
4395	/*
4396	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
4397	 * ctldir appear to have the same v_vfsp.
4398	 */
4399	if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
4400		ZFS_EXIT(zfsvfs);
4401		return (SET_ERROR(EXDEV));
4402	}
4403
4404	/* Prevent links to .zfs/shares files */
4405
4406	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4407	    &parent, sizeof (uint64_t))) != 0) {
4408		ZFS_EXIT(zfsvfs);
4409		return (error);
4410	}
4411	if (parent == zfsvfs->z_shares_dir) {
4412		ZFS_EXIT(zfsvfs);
4413		return (SET_ERROR(EPERM));
4414	}
4415
4416	if (zfsvfs->z_utf8 && u8_validate(name,
4417	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4418		ZFS_EXIT(zfsvfs);
4419		return (SET_ERROR(EILSEQ));
4420	}
4421	if (flags & FIGNORECASE)
4422		zf |= ZCILOOK;
4423
4424	/*
4425	 * We do not support links between attributes and non-attributes
4426	 * because of the potential security risk of creating links
4427	 * into "normal" file space in order to circumvent restrictions
4428	 * imposed in attribute space.
4429	 */
4430	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4431		ZFS_EXIT(zfsvfs);
4432		return (SET_ERROR(EINVAL));
4433	}
4434
4435
4436	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4437	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4438		ZFS_EXIT(zfsvfs);
4439		return (SET_ERROR(EPERM));
4440	}
4441
4442	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4443		ZFS_EXIT(zfsvfs);
4444		return (error);
4445	}
4446
4447top:
4448	/*
4449	 * Attempt to lock directory; fail if entry already exists.
4450	 */
4451	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4452	if (error) {
4453		ZFS_EXIT(zfsvfs);
4454		return (error);
4455	}
4456
4457	tx = dmu_tx_create(zfsvfs->z_os);
4458	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4459	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4460	zfs_sa_upgrade_txholds(tx, szp);
4461	zfs_sa_upgrade_txholds(tx, dzp);
4462	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4463	if (error) {
4464		zfs_dirent_unlock(dl);
4465		if (error == ERESTART) {
4466			waited = B_TRUE;
4467			dmu_tx_wait(tx);
4468			dmu_tx_abort(tx);
4469			goto top;
4470		}
4471		dmu_tx_abort(tx);
4472		ZFS_EXIT(zfsvfs);
4473		return (error);
4474	}
4475
4476	error = zfs_link_create(dl, szp, tx, 0);
4477
4478	if (error == 0) {
4479		uint64_t txtype = TX_LINK;
4480		if (flags & FIGNORECASE)
4481			txtype |= TX_CI;
4482		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4483	}
4484
4485	dmu_tx_commit(tx);
4486
4487	zfs_dirent_unlock(dl);
4488
4489	if (error == 0) {
4490		vnevent_link(svp, ct);
4491	}
4492
4493	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4494		zil_commit(zilog, 0);
4495
4496	ZFS_EXIT(zfsvfs);
4497	return (error);
4498}
4499
4500#ifdef illumos
4501/*
4502 * zfs_null_putapage() is used when the file system has been force
4503 * unmounted. It just drops the pages.
4504 */
4505/* ARGSUSED */
4506static int
4507zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4508    size_t *lenp, int flags, cred_t *cr)
4509{
4510	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4511	return (0);
4512}
4513
4514/*
4515 * Push a page out to disk, klustering if possible.
4516 *
4517 *	IN:	vp	- file to push page to.
4518 *		pp	- page to push.
4519 *		flags	- additional flags.
4520 *		cr	- credentials of caller.
4521 *
4522 *	OUT:	offp	- start of range pushed.
4523 *		lenp	- len of range pushed.
4524 *
4525 *	RETURN:	0 on success, error code on failure.
4526 *
4527 * NOTE: callers must have locked the page to be pushed.  On
4528 * exit, the page (and all other pages in the kluster) must be
4529 * unlocked.
4530 */
4531/* ARGSUSED */
4532static int
4533zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4534    size_t *lenp, int flags, cred_t *cr)
4535{
4536	znode_t		*zp = VTOZ(vp);
4537	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4538	dmu_tx_t	*tx;
4539	u_offset_t	off, koff;
4540	size_t		len, klen;
4541	int		err;
4542
4543	off = pp->p_offset;
4544	len = PAGESIZE;
4545	/*
4546	 * If our blocksize is bigger than the page size, try to kluster
4547	 * multiple pages so that we write a full block (thus avoiding
4548	 * a read-modify-write).
4549	 */
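	/*
	 * For example, assuming 4K pages and a 128K file block, klen
	 * below becomes 128K and koff rounds 'off' down to the start
	 * of the enclosing 128K block.
	 */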
4550	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4551		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4552		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4553		ASSERT(koff <= zp->z_size);
4554		if (koff + klen > zp->z_size)
4555			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4556		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4557	}
4558	ASSERT3U(btop(len), ==, btopr(len));
4559
4560	/*
4561	 * Can't push pages past end-of-file.
4562	 */
4563	if (off >= zp->z_size) {
4564		/* ignore all pages */
4565		err = 0;
4566		goto out;
4567	} else if (off + len > zp->z_size) {
4568		int npages = btopr(zp->z_size - off);
4569		page_t *trunc;
4570
4571		page_list_break(&pp, &trunc, npages);
4572		/* ignore pages past end of file */
4573		if (trunc)
4574			pvn_write_done(trunc, flags);
4575		len = zp->z_size - off;
4576	}
4577
4578	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4579	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4580		err = SET_ERROR(EDQUOT);
4581		goto out;
4582	}
4583	tx = dmu_tx_create(zfsvfs->z_os);
4584	dmu_tx_hold_write(tx, zp->z_id, off, len);
4585
4586	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4587	zfs_sa_upgrade_txholds(tx, zp);
4588	err = dmu_tx_assign(tx, TXG_WAIT);
4589	if (err != 0) {
4590		dmu_tx_abort(tx);
4591		goto out;
4592	}
4593
4594	if (zp->z_blksz <= PAGESIZE) {
4595		caddr_t va = zfs_map_page(pp, S_READ);
4596		ASSERT3U(len, <=, PAGESIZE);
4597		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4598		zfs_unmap_page(pp, va);
4599	} else {
4600		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4601	}
4602
4603	if (err == 0) {
4604		uint64_t mtime[2], ctime[2];
4605		sa_bulk_attr_t bulk[3];
4606		int count = 0;
4607
4608		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4609		    &mtime, 16);
4610		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4611		    &ctime, 16);
4612		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4613		    &zp->z_pflags, 8);
4614		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4615		    B_TRUE);
4616		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4617	}
4618	dmu_tx_commit(tx);
4619
4620out:
4621	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4622	if (offp)
4623		*offp = off;
4624	if (lenp)
4625		*lenp = len;
4626
4627	return (err);
4628}
4629
4630/*
4631 * Copy the portion of the file indicated from pages into the file.
4632 * The pages are stored in a page list attached to the file's vnode.
4633 *
4634 *	IN:	vp	- vnode of file to push page data to.
4635 *		off	- position in file to put data.
4636 *		len	- amount of data to write.
4637 *		flags	- flags to control the operation.
4638 *		cr	- credentials of caller.
4639 *		ct	- caller context.
4640 *
4641 *	RETURN:	0 on success, error code on failure.
4642 *
4643 * Timestamps:
4644 *	vp - ctime|mtime updated
4645 */
4646/*ARGSUSED*/
4647static int
4648zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4649    caller_context_t *ct)
4650{
4651	znode_t		*zp = VTOZ(vp);
4652	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4653	page_t		*pp;
4654	size_t		io_len;
4655	u_offset_t	io_off;
4656	uint_t		blksz;
4657	rl_t		*rl;
4658	int		error = 0;
4659
4660	ZFS_ENTER(zfsvfs);
4661	ZFS_VERIFY_ZP(zp);
4662
4663	/*
4664	 * Align this request to the file block size in case we kluster.
4665	 * XXX - this can result in pretty aggressive locking, which can
4666	 * impact simultaneous read/write access.  One option might be
4667	 * to break up long requests (len == 0) into block-by-block
4668	 * operations to get narrower locking.
4669	 */
4670	blksz = zp->z_blksz;
4671	if (ISP2(blksz))
4672		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4673	else
4674		io_off = 0;
4675	if (len > 0 && ISP2(blksz))
4676		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4677	else
4678		io_len = 0;
4679
4680	if (io_len == 0) {
4681		/*
4682		 * Search the entire vp list for pages >= io_off.
4683		 */
4684		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4685		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4686		goto out;
4687	}
4688	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4689
4690	if (off > zp->z_size) {
4691		/* past end of file */
4692		zfs_range_unlock(rl);
4693		ZFS_EXIT(zfsvfs);
4694		return (0);
4695	}
4696
4697	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4698
4699	for (off = io_off; io_off < off + len; io_off += io_len) {
4700		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4701			pp = page_lookup(vp, io_off,
4702			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4703		} else {
4704			pp = page_lookup_nowait(vp, io_off,
4705			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4706		}
4707
4708		if (pp != NULL && pvn_getdirty(pp, flags)) {
4709			int err;
4710
4711			/*
4712			 * Found a dirty page to push
4713			 */
4714			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4715			if (err)
4716				error = err;
4717		} else {
4718			io_len = PAGESIZE;
4719		}
4720	}
4721out:
4722	zfs_range_unlock(rl);
4723	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4724		zil_commit(zfsvfs->z_log, zp->z_id);
4725	ZFS_EXIT(zfsvfs);
4726	return (error);
4727}
4728#endif	/* illumos */
4729
4730/*ARGSUSED*/
4731void
4732zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4733{
4734	znode_t	*zp = VTOZ(vp);
4735	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4736	int error;
4737
4738	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4739	if (zp->z_sa_hdl == NULL) {
4740		/*
4741		 * The fs has been unmounted, or we did a
4742		 * suspend/resume and this file no longer exists.
4743		 */
4744		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4745		vrecycle(vp);
4746		return;
4747	}
4748
4749	mutex_enter(&zp->z_lock);
4750	if (zp->z_unlinked) {
4751		/*
4752		 * Fast path to recycle a vnode of a removed file.
4753		 */
4754		mutex_exit(&zp->z_lock);
4755		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4756		vrecycle(vp);
4757		return;
4758	}
4759	mutex_exit(&zp->z_lock);
4760
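	/*
	 * If only the access time changed while the vnode was active,
	 * write the cached atime back to the SA in its own transaction
	 * before the znode goes inactive.
	 */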
4761	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4762		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4763
4764		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4765		zfs_sa_upgrade_txholds(tx, zp);
4766		error = dmu_tx_assign(tx, TXG_WAIT);
4767		if (error) {
4768			dmu_tx_abort(tx);
4769		} else {
4770			mutex_enter(&zp->z_lock);
4771			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4772			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4773			zp->z_atime_dirty = 0;
4774			mutex_exit(&zp->z_lock);
4775			dmu_tx_commit(tx);
4776		}
4777	}
4778	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4779}
4780
4781#ifdef illumos
4782/*
4783 * Bounds-check the seek operation.
4784 *
4785 *	IN:	vp	- vnode seeking within
4786 *		ooff	- old file offset
4787 *		noffp	- pointer to new file offset
4788 *		ct	- caller context
4789 *
4790 *	RETURN:	0 on success, EINVAL if new offset invalid.
4791 */
4792/* ARGSUSED */
4793static int
4794zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4795    caller_context_t *ct)
4796{
4797	if (vp->v_type == VDIR)
4798		return (0);
4799	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4800}
4801
4802/*
4803 * Pre-filter the generic locking function to trap attempts to place
4804 * a mandatory lock on a memory mapped file.
4805 */
4806static int
4807zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4808    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4809{
4810	znode_t *zp = VTOZ(vp);
4811	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4812
4813	ZFS_ENTER(zfsvfs);
4814	ZFS_VERIFY_ZP(zp);
4815
4816	/*
4817	 * We are following the UFS semantics with respect to mapcnt
4818	 * here: If we see that the file is mapped already, then we will
4819	 * return an error, but we don't worry about races between this
4820	 * function and zfs_map().
4821	 */
4822	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4823		ZFS_EXIT(zfsvfs);
4824		return (SET_ERROR(EAGAIN));
4825	}
4826	ZFS_EXIT(zfsvfs);
4827	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4828}
4829
4830/*
4831 * If we can't find a page in the cache, we will create a new page
4832 * and fill it with file data.  For efficiency, we may try to fill
4833 * multiple pages at once (klustering) to fill up the supplied page
4834 * list.  Note that the pages to be filled are held with an exclusive
4835 * lock to prevent access by other threads while they are being filled.
4836 */
4837static int
4838zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4839    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4840{
4841	znode_t *zp = VTOZ(vp);
4842	page_t *pp, *cur_pp;
4843	objset_t *os = zp->z_zfsvfs->z_os;
4844	u_offset_t io_off, total;
4845	size_t io_len;
4846	int err;
4847
4848	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4849		/*
4850		 * We only have a single page, don't bother klustering
4851		 */
4852		io_off = off;
4853		io_len = PAGESIZE;
4854		pp = page_create_va(vp, io_off, io_len,
4855		    PG_EXCL | PG_WAIT, seg, addr);
4856	} else {
4857		/*
4858		 * Try to find enough pages to fill the page list
4859		 */
4860		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4861		    &io_len, off, plsz, 0);
4862	}
4863	if (pp == NULL) {
4864		/*
4865		 * The page already exists, nothing to do here.
4866		 */
4867		*pl = NULL;
4868		return (0);
4869	}
4870
4871	/*
4872	 * Fill the pages in the kluster.
4873	 */
4874	cur_pp = pp;
4875	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4876		caddr_t va;
4877
4878		ASSERT3U(io_off, ==, cur_pp->p_offset);
4879		va = zfs_map_page(cur_pp, S_WRITE);
4880		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4881		    DMU_READ_PREFETCH);
4882		zfs_unmap_page(cur_pp, va);
4883		if (err) {
4884			/* On error, toss the entire kluster */
4885			pvn_read_done(pp, B_ERROR);
4886			/* convert checksum errors into IO errors */
4887			if (err == ECKSUM)
4888				err = SET_ERROR(EIO);
4889			return (err);
4890		}
4891		cur_pp = cur_pp->p_next;
4892	}
4893
4894	/*
4895	 * Fill in the page list array from the kluster starting
4896	 * from the desired offset `off'.
4897	 * NOTE: the page list will always be null terminated.
4898	 */
4899	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4900	ASSERT(pl == NULL || (*pl)->p_offset == off);
4901
4902	return (0);
4903}
4904
4905/*
4906 * Return pointers to the pages for the file region [off, off + len]
4907 * in the pl array.  If plsz is greater than len, this function may
4908 * also return page pointers from after the specified region
4909 * (i.e. the region [off, off + plsz]).  These additional pages are
4910 * only returned if they are already in the cache, or were created as
4911 * part of a klustered read.
4912 *
4913 *	IN:	vp	- vnode of file to get data from.
4914 *		off	- position in file to get data from.
4915 *		len	- amount of data to retrieve.
4916 *		plsz	- length of provided page list.
4917 *		seg	- segment to obtain pages for.
4918 *		addr	- virtual address of fault.
4919 *		rw	- mode of created pages.
4920 *		cr	- credentials of caller.
4921 *		ct	- caller context.
4922 *
4923 *	OUT:	protp	- protection mode of created pages.
4924 *		pl	- list of pages created.
4925 *
4926 *	RETURN:	0 on success, error code on failure.
4927 *
4928 * Timestamps:
4929 *	vp - atime updated
4930 */
4931/* ARGSUSED */
4932static int
4933zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4934    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4935    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4936{
4937	znode_t		*zp = VTOZ(vp);
4938	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4939	page_t		**pl0 = pl;
4940	int		err = 0;
4941
4942	/* we do our own caching, faultahead is unnecessary */
4943	if (pl == NULL)
4944		return (0);
4945	else if (len > plsz)
4946		len = plsz;
4947	else
4948		len = P2ROUNDUP(len, PAGESIZE);
4949	ASSERT(plsz >= len);
4950
4951	ZFS_ENTER(zfsvfs);
4952	ZFS_VERIFY_ZP(zp);
4953
4954	if (protp)
4955		*protp = PROT_ALL;
4956
4957	/*
4958	 * Loop through the requested range [off, off + len) looking
4959	 * for pages.  If we don't find a page, we will need to create
4960	 * a new page and fill it with data from the file.
4961	 */
4962	while (len > 0) {
4963		if (*pl = page_lookup(vp, off, SE_SHARED))
4964			*(pl+1) = NULL;
4965		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4966			goto out;
4967		while (*pl) {
4968			ASSERT3U((*pl)->p_offset, ==, off);
4969			off += PAGESIZE;
4970			addr += PAGESIZE;
4971			if (len > 0) {
4972				ASSERT3U(len, >=, PAGESIZE);
4973				len -= PAGESIZE;
4974			}
4975			ASSERT3U(plsz, >=, PAGESIZE);
4976			plsz -= PAGESIZE;
4977			pl++;
4978		}
4979	}
4980
4981	/*
4982	 * Fill out the page array with any pages already in the cache.
4983	 */
4984	while (plsz > 0 &&
4985	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4986			off += PAGESIZE;
4987			plsz -= PAGESIZE;
4988	}
4989out:
4990	if (err) {
4991		/*
4992		 * Release any pages we have previously locked.
4993		 */
4994		while (pl > pl0)
4995			page_unlock(*--pl);
4996	} else {
4997		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4998	}
4999
5000	*pl = NULL;
5001
5002	ZFS_EXIT(zfsvfs);
5003	return (err);
5004}
5005
5006/*
5007 * Request a memory map for a section of a file.  This code interacts
5008 * with common code and the VM system as follows:
5009 *
5010 * - common code calls mmap(), which ends up in smmap_common()
5011 * - this calls VOP_MAP(), which takes you into (say) zfs
5012 * - zfs_map() calls as_map(), passing segvn_create() as the callback
5013 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
5014 * - zfs_addmap() updates z_mapcnt
5015 */
5016/*ARGSUSED*/
5017static int
5018zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5019    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
5020    caller_context_t *ct)
5021{
5022	znode_t *zp = VTOZ(vp);
5023	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5024	segvn_crargs_t	vn_a;
5025	int		error;
5026
5027	ZFS_ENTER(zfsvfs);
5028	ZFS_VERIFY_ZP(zp);
5029
5030	if ((prot & PROT_WRITE) && (zp->z_pflags &
5031	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
5032		ZFS_EXIT(zfsvfs);
5033		return (SET_ERROR(EPERM));
5034	}
5035
5036	if ((prot & (PROT_READ | PROT_EXEC)) &&
5037	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
5038		ZFS_EXIT(zfsvfs);
5039		return (SET_ERROR(EACCES));
5040	}
5041
5042	if (vp->v_flag & VNOMAP) {
5043		ZFS_EXIT(zfsvfs);
5044		return (SET_ERROR(ENOSYS));
5045	}
5046
5047	if (off < 0 || len > MAXOFFSET_T - off) {
5048		ZFS_EXIT(zfsvfs);
5049		return (SET_ERROR(ENXIO));
5050	}
5051
5052	if (vp->v_type != VREG) {
5053		ZFS_EXIT(zfsvfs);
5054		return (SET_ERROR(ENODEV));
5055	}
5056
5057	/*
5058	 * If file is locked, disallow mapping.
5059	 */
5060	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
5061		ZFS_EXIT(zfsvfs);
5062		return (SET_ERROR(EAGAIN));
5063	}
5064
5065	as_rangelock(as);
5066	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5067	if (error != 0) {
5068		as_rangeunlock(as);
5069		ZFS_EXIT(zfsvfs);
5070		return (error);
5071	}
5072
5073	vn_a.vp = vp;
5074	vn_a.offset = (u_offset_t)off;
5075	vn_a.type = flags & MAP_TYPE;
5076	vn_a.prot = prot;
5077	vn_a.maxprot = maxprot;
5078	vn_a.cred = cr;
5079	vn_a.amp = NULL;
5080	vn_a.flags = flags & ~MAP_TYPE;
5081	vn_a.szc = 0;
5082	vn_a.lgrp_mem_policy_flags = 0;
5083
5084	error = as_map(as, *addrp, len, segvn_create, &vn_a);
5085
5086	as_rangeunlock(as);
5087	ZFS_EXIT(zfsvfs);
5088	return (error);
5089}
5090
5091/* ARGSUSED */
5092static int
5093zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5094    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
5095    caller_context_t *ct)
5096{
5097	uint64_t pages = btopr(len);
5098
5099	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
5100	return (0);
5101}
5102
5103/*
5104 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
5105 * more accurate mtime for the associated file.  Since we don't have a way of
5106 * detecting when the data was actually modified, we have to resort to
5107 * heuristics.  If an explicit msync() is done, then we mark the mtime when the
5108 * last page is pushed.  The problem occurs when the msync() call is omitted,
5109 * which by far the most common case:
5110 *
5111 *	open()
5112 *	mmap()
5113 *	<modify memory>
5114 *	munmap()
5115 *	close()
5116 *	<time lapse>
5117 *	putpage() via fsflush
5118 *
5119 * If we wait until fsflush to come along, we can have a modification time that
5120 * is some arbitrary point in the future.  In order to prevent this in the
5121 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
5122 * torn down.
5123 */
5124/* ARGSUSED */
5125static int
5126zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5127    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
5128    caller_context_t *ct)
5129{
5130	uint64_t pages = btopr(len);
5131
5132	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
5133	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
5134
5135	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
5136	    vn_has_cached_data(vp))
5137		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
5138
5139	return (0);
5140}
5141
5142/*
5143 * Free or allocate space in a file.  Currently, this function only
5144 * supports the `F_FREESP' command.  However, this command is somewhat
5145 * misnamed, as its functionality includes the ability to allocate as
5146 * well as free space.
5147 *
5148 *	IN:	vp	- vnode of file to free data in.
5149 *		cmd	- action to take (only F_FREESP supported).
5150 *		bfp	- section of file to free/alloc.
5151 *		flag	- current file open mode flags.
5152 *		offset	- current file offset.
5153 *		cr	- credentials of caller [UNUSED].
5154 *		ct	- caller context.
5155 *
5156 *	RETURN:	0 on success, error code on failure.
5157 *
5158 * Timestamps:
5159 *	vp - ctime|mtime updated
5160 */
5161/* ARGSUSED */
5162static int
5163zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
5164    offset_t offset, cred_t *cr, caller_context_t *ct)
5165{
5166	znode_t		*zp = VTOZ(vp);
5167	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5168	uint64_t	off, len;
5169	int		error;
5170
5171	ZFS_ENTER(zfsvfs);
5172	ZFS_VERIFY_ZP(zp);
5173
5174	if (cmd != F_FREESP) {
5175		ZFS_EXIT(zfsvfs);
5176		return (SET_ERROR(EINVAL));
5177	}
5178
5179	if (error = convoff(vp, bfp, 0, offset)) {
5180		ZFS_EXIT(zfsvfs);
5181		return (error);
5182	}
5183
5184	if (bfp->l_len < 0) {
5185		ZFS_EXIT(zfsvfs);
5186		return (SET_ERROR(EINVAL));
5187	}
5188
5189	off = bfp->l_start;
5190	len = bfp->l_len; /* 0 means from off to end of file */
5191
5192	error = zfs_freesp(zp, off, len, flag, TRUE);
5193
5194	ZFS_EXIT(zfsvfs);
5195	return (error);
5196}
5197#endif	/* illumos */
5198
5199CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
5200CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
5201
5202/*ARGSUSED*/
5203static int
5204zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
5205{
5206	znode_t		*zp = VTOZ(vp);
5207	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5208	uint32_t	gen;
5209	uint64_t	gen64;
5210	uint64_t	object = zp->z_id;
5211	zfid_short_t	*zfid;
5212	int		size, i, error;
5213
5214	ZFS_ENTER(zfsvfs);
5215	ZFS_VERIFY_ZP(zp);
5216
5217	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
5218	    &gen64, sizeof (uint64_t))) != 0) {
5219		ZFS_EXIT(zfsvfs);
5220		return (error);
5221	}
5222
5223	gen = (uint32_t)gen64;
5224
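	/*
	 * Snapshots have z_parent != zfsvfs and need the long fid,
	 * which also encodes the objset id of the snapshot.
	 */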
5225	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
5226
5227#ifdef illumos
5228	if (fidp->fid_len < size) {
5229		fidp->fid_len = size;
5230		ZFS_EXIT(zfsvfs);
5231		return (SET_ERROR(ENOSPC));
5232	}
5233#else
5234	fidp->fid_len = size;
5235#endif
5236
5237	zfid = (zfid_short_t *)fidp;
5238
5239	zfid->zf_len = size;
5240
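	/*
	 * Pack the object number a byte at a time, least significant
	 * byte first; e.g. object 0x1234 stores zf_object[0] = 0x34
	 * and zf_object[1] = 0x12, with the remaining bytes zero.
	 */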
5241	for (i = 0; i < sizeof (zfid->zf_object); i++)
5242		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5243
5244	/* Must have a non-zero generation number to distinguish from .zfs */
5245	if (gen == 0)
5246		gen = 1;
5247	for (i = 0; i < sizeof (zfid->zf_gen); i++)
5248		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5249
5250	if (size == LONG_FID_LEN) {
5251		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
5252		zfid_long_t	*zlfid;
5253
5254		zlfid = (zfid_long_t *)fidp;
5255
5256		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
5257			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
5258
5259		/* XXX - this should be the generation number for the objset */
5260		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
5261			zlfid->zf_setgen[i] = 0;
5262	}
5263
5264	ZFS_EXIT(zfsvfs);
5265	return (0);
5266}
5267
5268static int
5269zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5270    caller_context_t *ct)
5271{
5272	znode_t		*zp, *xzp;
5273	zfsvfs_t	*zfsvfs;
5274	zfs_dirlock_t	*dl;
5275	int		error;
5276
5277	switch (cmd) {
5278	case _PC_LINK_MAX:
5279		*valp = INT_MAX;
5280		return (0);
5281
5282	case _PC_FILESIZEBITS:
5283		*valp = 64;
5284		return (0);
5285#ifdef illumos
5286	case _PC_XATTR_EXISTS:
5287		zp = VTOZ(vp);
5288		zfsvfs = zp->z_zfsvfs;
5289		ZFS_ENTER(zfsvfs);
5290		ZFS_VERIFY_ZP(zp);
5291		*valp = 0;
5292		error = zfs_dirent_lock(&dl, zp, "", &xzp,
5293		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
5294		if (error == 0) {
5295			zfs_dirent_unlock(dl);
5296			if (!zfs_dirempty(xzp))
5297				*valp = 1;
5298			VN_RELE(ZTOV(xzp));
5299		} else if (error == ENOENT) {
5300			/*
5301			 * If there aren't extended attributes, it's the
5302			 * same as having zero of them.
5303			 */
5304			error = 0;
5305		}
5306		ZFS_EXIT(zfsvfs);
5307		return (error);
5308
5309	case _PC_SATTR_ENABLED:
5310	case _PC_SATTR_EXISTS:
5311		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5312		    (vp->v_type == VREG || vp->v_type == VDIR);
5313		return (0);
5314
5315	case _PC_ACCESS_FILTERING:
5316		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5317		    vp->v_type == VDIR;
5318		return (0);
5319
5320	case _PC_ACL_ENABLED:
5321		*valp = _ACL_ACE_ENABLED;
5322		return (0);
5323#endif	/* illumos */
5324	case _PC_MIN_HOLE_SIZE:
5325		*valp = (int)SPA_MINBLOCKSIZE;
5326		return (0);
5327#ifdef illumos
5328	case _PC_TIMESTAMP_RESOLUTION:
5329		/* nanosecond timestamp resolution */
5330		*valp = 1L;
5331		return (0);
5332#endif
5333	case _PC_ACL_EXTENDED:
5334		*valp = 0;
5335		return (0);
5336
5337	case _PC_ACL_NFS4:
5338		*valp = 1;
5339		return (0);
5340
5341	case _PC_ACL_PATH_MAX:
5342		*valp = ACL_MAX_ENTRIES;
5343		return (0);
5344
5345	default:
5346		return (EOPNOTSUPP);
5347	}
5348}
5349
5350/*ARGSUSED*/
5351static int
5352zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5353    caller_context_t *ct)
5354{
5355	znode_t *zp = VTOZ(vp);
5356	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5357	int error;
5358	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5359
5360	ZFS_ENTER(zfsvfs);
5361	ZFS_VERIFY_ZP(zp);
5362	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5363	ZFS_EXIT(zfsvfs);
5364
5365	return (error);
5366}
5367
5368/*ARGSUSED*/
5369int
5370zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5371    caller_context_t *ct)
5372{
5373	znode_t *zp = VTOZ(vp);
5374	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5375	int error;
5376	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5377	zilog_t	*zilog = zfsvfs->z_log;
5378
5379	ZFS_ENTER(zfsvfs);
5380	ZFS_VERIFY_ZP(zp);
5381
5382	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5383
5384	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5385		zil_commit(zilog, 0);
5386
5387	ZFS_EXIT(zfsvfs);
5388	return (error);
5389}
5390
5391#ifdef illumos
5392/*
5393 * The smallest read we may consider to loan out an arcbuf.
5394 * This must be a power of 2.
5395 */
5396int zcr_blksz_min = (1 << 10);	/* 1K */
5397/*
5398 * If set to less than the file block size, allow loaning out of an
5399 * arcbuf for a partial block read.  This must be a power of 2.
5400 */
5401int zcr_blksz_max = (1 << 17);	/* 128K */
5402
5403/*ARGSUSED*/
5404static int
5405zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5406    caller_context_t *ct)
5407{
5408	znode_t	*zp = VTOZ(vp);
5409	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5410	int max_blksz = zfsvfs->z_max_blksz;
5411	uio_t *uio = &xuio->xu_uio;
5412	ssize_t size = uio->uio_resid;
5413	offset_t offset = uio->uio_loffset;
5414	int blksz;
5415	int fullblk, i;
5416	arc_buf_t *abuf;
5417	ssize_t maxsize;
5418	int preamble, postamble;
5419
5420	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5421		return (SET_ERROR(EINVAL));
5422
5423	ZFS_ENTER(zfsvfs);
5424	ZFS_VERIFY_ZP(zp);
5425	switch (ioflag) {
5426	case UIO_WRITE:
5427		/*
5428		 * Loan out an arc_buf for write if write size is bigger than
5429		 * max_blksz, and the file's block size is also max_blksz.
5430		 */
5431		blksz = max_blksz;
5432		if (size < blksz || zp->z_blksz != blksz) {
5433			ZFS_EXIT(zfsvfs);
5434			return (SET_ERROR(EINVAL));
5435		}
5436		/*
5437		 * Caller requests buffers for write before knowing where the
5438		 * write offset might be (e.g. NFS TCP write).
5439		 */
5440		if (offset == -1) {
5441			preamble = 0;
5442		} else {
5443			preamble = P2PHASE(offset, blksz);
5444			if (preamble) {
5445				preamble = blksz - preamble;
5446				size -= preamble;
5447			}
5448		}
5449
5450		postamble = P2PHASE(size, blksz);
5451		size -= postamble;
5452
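		/*
		 * For example, assuming a 128K block size, a 300K write
		 * at offset 64K splits into a 64K preamble, one full
		 * 128K block, and a 108K postamble: three arc_bufs.
		 */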
5453		fullblk = size / blksz;
5454		(void) dmu_xuio_init(xuio,
5455		    (preamble != 0) + fullblk + (postamble != 0));
5456		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5457		    int, postamble, int,
5458		    (preamble != 0) + fullblk + (postamble != 0));
5459
5460		/*
5461		 * Have to fix iov base/len for partial buffers.  They
5462		 * currently represent full arc_buf's.
5463		 */
5464		if (preamble) {
5465			/* data begins in the middle of the arc_buf */
5466			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5467			    blksz);
5468			ASSERT(abuf);
5469			(void) dmu_xuio_add(xuio, abuf,
5470			    blksz - preamble, preamble);
5471		}
5472
5473		for (i = 0; i < fullblk; i++) {
5474			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5475			    blksz);
5476			ASSERT(abuf);
5477			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5478		}
5479
5480		if (postamble) {
5481			/* data ends in the middle of the arc_buf */
5482			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5483			    blksz);
5484			ASSERT(abuf);
5485			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5486		}
5487		break;
5488	case UIO_READ:
5489		/*
5490		 * Loan out an arc_buf for read if the read size is larger than
5491		 * the current file block size.  Block alignment is not
5492		 * considered.  Partial arc_buf will be loaned out for read.
5493		 */
5494		blksz = zp->z_blksz;
5495		if (blksz < zcr_blksz_min)
5496			blksz = zcr_blksz_min;
5497		if (blksz > zcr_blksz_max)
5498			blksz = zcr_blksz_max;
5499		/* avoid potential complexity of dealing with it */
5500		if (blksz > max_blksz) {
5501			ZFS_EXIT(zfsvfs);
5502			return (SET_ERROR(EINVAL));
5503		}
5504
5505		maxsize = zp->z_size - uio->uio_loffset;
5506		if (size > maxsize)
5507			size = maxsize;
5508
5509		if (size < blksz || vn_has_cached_data(vp)) {
5510			ZFS_EXIT(zfsvfs);
5511			return (SET_ERROR(EINVAL));
5512		}
5513		break;
5514	default:
5515		ZFS_EXIT(zfsvfs);
5516		return (SET_ERROR(EINVAL));
5517	}
5518
5519	uio->uio_extflg = UIO_XUIO;
5520	XUIO_XUZC_RW(xuio) = ioflag;
5521	ZFS_EXIT(zfsvfs);
5522	return (0);
5523}
5524
5525/*ARGSUSED*/
5526static int
5527zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5528{
5529	int i;
5530	arc_buf_t *abuf;
5531	int ioflag = XUIO_XUZC_RW(xuio);
5532
5533	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5534
5535	i = dmu_xuio_cnt(xuio);
5536	while (i-- > 0) {
5537		abuf = dmu_xuio_arcbuf(xuio, i);
5538		/*
5539		 * if abuf == NULL, it must be a write buffer
5540		 * that has been returned in zfs_write().
5541		 */
5542		if (abuf)
5543			dmu_return_arcbuf(abuf);
5544		ASSERT(abuf || ioflag == UIO_WRITE);
5545	}
5546
5547	dmu_xuio_fini(xuio);
5548	return (0);
5549}
5550
5551/*
5552 * Predeclare these here so that the compiler assumes that
5553 * this is an "old style" function declaration that does
5554 * not include arguments => we won't get type mismatch errors
5555 * in the initializations that follow.
5556 */
5557static int zfs_inval();
5558static int zfs_isdir();
5559
5560static int
5561zfs_inval()
5562{
5563	return (SET_ERROR(EINVAL));
5564}
5565
5566static int
5567zfs_isdir()
5568{
5569	return (SET_ERROR(EISDIR));
5570}
5571/*
5572 * Directory vnode operations template
5573 */
5574vnodeops_t *zfs_dvnodeops;
5575const fs_operation_def_t zfs_dvnodeops_template[] = {
5576	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5577	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5578	VOPNAME_READ,		{ .error = zfs_isdir },
5579	VOPNAME_WRITE,		{ .error = zfs_isdir },
5580	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5581	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5582	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5583	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5584	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5585	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5586	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5587	VOPNAME_LINK,		{ .vop_link = zfs_link },
5588	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5589	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5590	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5591	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5592	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5593	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5594	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5595	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5596	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5597	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5598	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5599	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5600	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5601	NULL,			NULL
5602};
5603
5604/*
5605 * Regular file vnode operations template
5606 */
5607vnodeops_t *zfs_fvnodeops;
5608const fs_operation_def_t zfs_fvnodeops_template[] = {
5609	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5610	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5611	VOPNAME_READ,		{ .vop_read = zfs_read },
5612	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5613	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5614	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5615	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5616	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5617	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5618	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5619	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5620	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5621	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5622	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5623	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5624	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5625	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5626	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5627	VOPNAME_MAP,		{ .vop_map = zfs_map },
5628	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5629	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5630	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5631	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5632	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5633	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5634	VOPNAME_REQZCBUF,	{ .vop_reqzcbuf = zfs_reqzcbuf },
5635	VOPNAME_RETZCBUF,	{ .vop_retzcbuf = zfs_retzcbuf },
5636	NULL,			NULL
5637};
5638
5639/*
5640 * Symbolic link vnode operations template
5641 */
5642vnodeops_t *zfs_symvnodeops;
5643const fs_operation_def_t zfs_symvnodeops_template[] = {
5644	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5645	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5646	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5647	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5648	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5649	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5650	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5651	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5652	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5653	NULL,			NULL
5654};
5655
5656/*
5657 * special share hidden files vnode operations template
5658 */
5659vnodeops_t *zfs_sharevnodeops;
5660const fs_operation_def_t zfs_sharevnodeops_template[] = {
5661	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5662	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5663	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5664	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5665	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5666	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5667	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5668	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5669	NULL,			NULL
5670};
5671
5672/*
5673 * Extended attribute directory vnode operations template
5674 *
5675 * This template is identical to the directory vnodes
5676 * operation template except for restricted operations:
5677 *	VOP_MKDIR()
5678 *	VOP_SYMLINK()
5679 *
5680 * Note that there are other restrictions embedded in:
5681 *	zfs_create()	- restrict type to VREG
5682 *	zfs_link()	- no links into/out of attribute space
5683 *	zfs_rename()	- no moves into/out of attribute space
5684 */
5685vnodeops_t *zfs_xdvnodeops;
5686const fs_operation_def_t zfs_xdvnodeops_template[] = {
5687	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5688	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5689	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5690	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5691	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5692	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5693	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5694	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5695	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5696	VOPNAME_LINK,		{ .vop_link = zfs_link },
5697	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5698	VOPNAME_MKDIR,		{ .error = zfs_inval },
5699	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5700	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5701	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5702	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5703	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5704	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5705	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5706	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5707	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5708	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5709	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5710	NULL,			NULL
5711};
5712
5713/*
5714 * Error vnode operations template
5715 */
5716vnodeops_t *zfs_evnodeops;
5717const fs_operation_def_t zfs_evnodeops_template[] = {
5718	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5719	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5720	NULL,			NULL
5721};
5722#endif	/* illumos */
5723
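/*
 * Translate FreeBSD vnode I/O flags (IO_*) into the Solaris-style
 * file flags (F*) expected by the shared zfs_read()/zfs_write() code.
 */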
5724static int
5725ioflags(int ioflags)
5726{
5727	int flags = 0;
5728
5729	if (ioflags & IO_APPEND)
5730		flags |= FAPPEND;
5731	if (ioflags & IO_NDELAY)
5732		flags |= FNONBLOCK;
5733	if (ioflags & IO_SYNC)
5734		flags |= (FSYNC | FDSYNC | FRSYNC);
5735
5736	return (flags);
5737}
5738
5739static int
5740zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
5741{
5742	znode_t *zp = VTOZ(vp);
5743	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5744	objset_t *os = zp->z_zfsvfs->z_os;
5745	vm_page_t mfirst, mlast, mreq;
5746	vm_object_t object;
5747	caddr_t va;
5748	struct sf_buf *sf;
5749	off_t startoff, endoff;
5750	int i, error;
5751	vm_pindex_t reqstart, reqend;
5752	int pcount, lsize, reqsize, size;
5753
5754	ZFS_ENTER(zfsvfs);
5755	ZFS_VERIFY_ZP(zp);
5756
5757	pcount = OFF_TO_IDX(round_page(count));
5758	mreq = m[reqpage];
5759	object = mreq->object;
5760	error = 0;
5761
5762	KASSERT(vp->v_object == object, ("mismatching object"));
5763
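	/*
	 * If the file block size is larger than the page size, widen
	 * the requested run [reqstart, reqstart + reqsize) to whole
	 * blocks, clipped to the pages we were actually handed, so the
	 * dmu_read() loop below operates on block-aligned ranges.
	 */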
5764	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
5765		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
5766		reqstart = OFF_TO_IDX(round_page(startoff));
5767		if (reqstart < m[0]->pindex)
5768			reqstart = 0;
5769		else
5770			reqstart = reqstart - m[0]->pindex;
5771		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
5772		    zp->z_blksz);
5773		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
5774		if (reqend > m[pcount - 1]->pindex)
5775			reqend = m[pcount - 1]->pindex;
5776		reqsize = reqend - m[reqstart]->pindex + 1;
5777		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
5778		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
5779	} else {
5780		reqstart = reqpage;
5781		reqsize = 1;
5782	}
5783	mfirst = m[reqstart];
5784	mlast = m[reqstart + reqsize - 1];
5785
5786	zfs_vmobject_wlock(object);
5787
5788	for (i = 0; i < reqstart; i++) {
5789		vm_page_lock(m[i]);
5790		vm_page_free(m[i]);
5791		vm_page_unlock(m[i]);
5792	}
5793	for (i = reqstart + reqsize; i < pcount; i++) {
5794		vm_page_lock(m[i]);
5795		vm_page_free(m[i]);
5796		vm_page_unlock(m[i]);
5797	}
5798
5799	if (mreq->valid && reqsize == 1) {
5800		if (mreq->valid != VM_PAGE_BITS_ALL)
5801			vm_page_zero_invalid(mreq, TRUE);
5802		zfs_vmobject_wunlock(object);
5803		ZFS_EXIT(zfsvfs);
5804		return (zfs_vm_pagerret_ok);
5805	}
5806
5807	PCPU_INC(cnt.v_vnodein);
5808	PCPU_ADD(cnt.v_vnodepgsin, reqsize);
5809
5810	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
5811		for (i = reqstart; i < reqstart + reqsize; i++) {
5812			if (i != reqpage) {
5813				vm_page_lock(m[i]);
5814				vm_page_free(m[i]);
5815				vm_page_unlock(m[i]);
5816			}
5817		}
5818		zfs_vmobject_wunlock(object);
5819		ZFS_EXIT(zfsvfs);
5820		return (zfs_vm_pagerret_bad);
5821	}
5822
5823	lsize = PAGE_SIZE;
5824	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
5825		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
5826
5827	zfs_vmobject_wunlock(object);
5828
5829	for (i = reqstart; i < reqstart + reqsize; i++) {
5830		size = PAGE_SIZE;
5831		if (i == (reqstart + reqsize - 1))
5832			size = lsize;
5833		va = zfs_map_page(m[i], &sf);
5834		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
5835		    size, va, DMU_READ_PREFETCH);
5836		if (size != PAGE_SIZE)
5837			bzero(va + size, PAGE_SIZE - size);
5838		zfs_unmap_page(sf);
5839		if (error != 0)
5840			break;
5841	}
5842
5843	zfs_vmobject_wlock(object);
5844
5845	for (i = reqstart; i < reqstart + reqsize; i++) {
5846		if (!error)
5847			m[i]->valid = VM_PAGE_BITS_ALL;
5848		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
5849		if (i != reqpage)
5850			vm_page_readahead_finish(m[i]);
5851	}
5852
5853	zfs_vmobject_wunlock(object);
5854
5855	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5856	ZFS_EXIT(zfsvfs);
5857	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
5858}
5859
5860static int
5861zfs_freebsd_getpages(ap)
5862	struct vop_getpages_args /* {
5863		struct vnode *a_vp;
5864		vm_page_t *a_m;
5865		int a_count;
5866		int a_reqpage;
5867		vm_ooffset_t a_offset;
5868	} */ *ap;
5869{
5870
5871	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
5872}
5873
5874static int
5875zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
5876    int *rtvals)
5877{
5878	znode_t		*zp = VTOZ(vp);
5879	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5880	rl_t		*rl;
5881	dmu_tx_t	*tx;
5882	struct sf_buf	*sf;
5883	vm_object_t	object;
5884	vm_page_t	m;
5885	caddr_t		va;
5886	size_t		tocopy;
5887	size_t		lo_len;
5888	vm_ooffset_t	lo_off;
5889	vm_ooffset_t	off;
5890	uint_t		blksz;
5891	int		ncount;
5892	int		pcount;
5893	int		err;
5894	int		i;
5895
5896	ZFS_ENTER(zfsvfs);
5897	ZFS_VERIFY_ZP(zp);
5898
5899	object = vp->v_object;
5900	pcount = btoc(len);
5901	ncount = pcount;
5902
5903	KASSERT(ma[0]->object == object, ("mismatching object"));
5904	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
5905
5906	for (i = 0; i < pcount; i++)
5907		rtvals[i] = zfs_vm_pagerret_error;
5908
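	/*
	 * Range-lock the region rounded out to whole blocks so the
	 * underlying data cannot change beneath us; e.g., assuming an
	 * 8K block size, a single 4K page at offset 12K locks [8K, 16K).
	 */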
5909	off = IDX_TO_OFF(ma[0]->pindex);
5910	blksz = zp->z_blksz;
5911	lo_off = rounddown(off, blksz);
5912	lo_len = roundup(len + (off - lo_off), blksz);
5913	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
5914
5915	zfs_vmobject_wlock(object);
5916	if (len + off > object->un_pager.vnp.vnp_size) {
5917		if (object->un_pager.vnp.vnp_size > off) {
5918			int pgoff;
5919
5920			len = object->un_pager.vnp.vnp_size - off;
5921			ncount = btoc(len);
5922			if ((pgoff = (int)len & PAGE_MASK) != 0) {
5923				/*
5924				 * If the object is locked and the following
5925				 * conditions hold, then the page's dirty
5926				 * field cannot be concurrently changed by a
5927				 * pmap operation.
5928				 */
5929				m = ma[ncount - 1];
5930				vm_page_assert_sbusied(m);
5931				KASSERT(!pmap_page_is_write_mapped(m),
5932				    ("zfs_putpages: page %p is not read-only", m));
5933				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
5934				    pgoff);
5935			}
5936		} else {
5937			len = 0;
5938			ncount = 0;
5939		}
5940		if (ncount < pcount) {
5941			for (i = ncount; i < pcount; i++) {
5942				rtvals[i] = zfs_vm_pagerret_bad;
5943			}
5944		}
5945	}
5946	zfs_vmobject_wunlock(object);
5947
5948	if (ncount == 0)
5949		goto out;
5950
5951	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
5952	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
5953		goto out;
5954	}
5955
5956top:
5957	tx = dmu_tx_create(zfsvfs->z_os);
5958	dmu_tx_hold_write(tx, zp->z_id, off, len);
5959
5960	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
5961	zfs_sa_upgrade_txholds(tx, zp);
5962	err = dmu_tx_assign(tx, TXG_NOWAIT);
5963	if (err != 0) {
5964		if (err == ERESTART) {
5965			dmu_tx_wait(tx);
5966			dmu_tx_abort(tx);
5967			goto top;
5968		}
5969		dmu_tx_abort(tx);
5970		goto out;
5971	}
5972
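	/*
	 * With sub-page file blocks, copy the data out page by page
	 * through transient sf_buf mappings; otherwise let the DMU
	 * write directly from the wired pages.
	 */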
5973	if (zp->z_blksz < PAGE_SIZE) {
5975		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
5976			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
5977			va = zfs_map_page(ma[i], &sf);
5978			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
5979			zfs_unmap_page(sf);
5980		}
5981	} else {
5982		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
5983	}
5984
5985	if (err == 0) {
5986		uint64_t mtime[2], ctime[2];
5987		sa_bulk_attr_t bulk[3];
5988		int count = 0;
5989
5990		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
5991		    &mtime, 16);
5992		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
5993		    &ctime, 16);
5994		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
5995		    &zp->z_pflags, 8);
5996		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
5997		    B_TRUE);
5998		(void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
5999		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
6000
6001		zfs_vmobject_wlock(object);
6002		for (i = 0; i < ncount; i++) {
6003			rtvals[i] = zfs_vm_pagerret_ok;
6004			vm_page_undirty(ma[i]);
6005		}
6006		zfs_vmobject_wunlock(object);
6007		PCPU_INC(cnt.v_vnodeout);
6008		PCPU_ADD(cnt.v_vnodepgsout, ncount);
6009	}
6010	dmu_tx_commit(tx);
6011
6012out:
6013	zfs_range_unlock(rl);
6014	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
6015	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
6016		zil_commit(zfsvfs->z_log, zp->z_id);
6017	ZFS_EXIT(zfsvfs);
6018	return (rtvals[0]);
6019}
6020
6021int
6022zfs_freebsd_putpages(ap)
6023	struct vop_putpages_args /* {
6024		struct vnode *a_vp;
6025		vm_page_t *a_m;
6026		int a_count;
6027		int a_sync;
6028		int *a_rtvals;
6029		vm_ooffset_t a_offset;
6030	} */ *ap;
6031{
6032
6033	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
6034	    ap->a_rtvals));
6035}
6036
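/*
 * ZFS file data does not go through the buffer cache, so VOP_BMAP()
 * simply reports an identity logical-to-physical mapping and zero
 * read-ahead/read-behind runs; clustered I/O through the buffer cache
 * is effectively disabled for these vnodes.
 */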
6037static int
6038zfs_freebsd_bmap(ap)
6039	struct vop_bmap_args /* {
6040		struct vnode *a_vp;
6041		daddr_t  a_bn;
6042		struct bufobj **a_bop;
6043		daddr_t *a_bnp;
6044		int *a_runp;
6045		int *a_runb;
6046	} */ *ap;
6047{
6048
6049	if (ap->a_bop != NULL)
6050		*ap->a_bop = &ap->a_vp->v_bufobj;
6051	if (ap->a_bnp != NULL)
6052		*ap->a_bnp = ap->a_bn;
6053	if (ap->a_runp != NULL)
6054		*ap->a_runp = 0;
6055	if (ap->a_runb != NULL)
6056		*ap->a_runb = 0;
6057
6058	return (0);
6059}
6060
6061static int
6062zfs_freebsd_open(ap)
6063	struct vop_open_args /* {
6064		struct vnode *a_vp;
6065		int a_mode;
6066		struct ucred *a_cred;
6067		struct thread *a_td;
6068	} */ *ap;
6069{
6070	vnode_t	*vp = ap->a_vp;
6071	znode_t *zp = VTOZ(vp);
6072	int error;
6073
6074	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
6075	if (error == 0)
6076		vnode_create_vobject(vp, zp->z_size, ap->a_td);
6077	return (error);
6078}
6079
6080static int
6081zfs_freebsd_close(ap)
6082	struct vop_close_args /* {
6083		struct vnode *a_vp;
6084		int  a_fflag;
6085		struct ucred *a_cred;
6086		struct thread *a_td;
6087	} */ *ap;
6088{
6089
6090	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
6091}
6092
6093static int
6094zfs_freebsd_ioctl(ap)
6095	struct vop_ioctl_args /* {
6096		struct vnode *a_vp;
6097		u_long a_command;
6098		caddr_t a_data;
6099		int a_fflag;
6100		struct ucred *cred;
6101		struct thread *td;
6102	} */ *ap;
6103{
6104
6105	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
6106	    ap->a_fflag, ap->a_cred, NULL, NULL));
6107}
6108
6109static int
6110zfs_freebsd_read(ap)
6111	struct vop_read_args /* {
6112		struct vnode *a_vp;
6113		struct uio *a_uio;
6114		int a_ioflag;
6115		struct ucred *a_cred;
6116	} */ *ap;
6117{
6118
6119	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
6120	    ap->a_cred, NULL));
6121}
6122
6123static int
6124zfs_freebsd_write(ap)
6125	struct vop_write_args /* {
6126		struct vnode *a_vp;
6127		struct uio *a_uio;
6128		int a_ioflag;
6129		struct ucred *a_cred;
6130	} */ *ap;
6131{
6132
6133	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
6134	    ap->a_cred, NULL));
6135}
6136
6137static int
6138zfs_freebsd_access(ap)
6139	struct vop_access_args /* {
6140		struct vnode *a_vp;
6141		accmode_t a_accmode;
6142		struct ucred *a_cred;
6143		struct thread *a_td;
6144	} */ *ap;
6145{
6146	vnode_t *vp = ap->a_vp;
6147	znode_t *zp = VTOZ(vp);
6148	accmode_t accmode;
6149	int error = 0;
6150
6151	/*
6152	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
6153	 */
6154	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
6155	if (accmode != 0)
6156		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
6157
6158	/*
6159	 * VADMIN has to be handled by vaccess().
6160	 */
6161	if (error == 0) {
6162		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
6163		if (accmode != 0) {
6164			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
6165			    zp->z_gid, accmode, ap->a_cred, NULL);
6166		}
6167	}
6168
6169	/*
6170	 * For VEXEC, ensure that at least one execute bit is set for
6171	 * non-directories.
6172	 */
6173	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
6174	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
6175		error = EACCES;
6176	}
6177
6178	return (error);
6179}
6180
6181static int
6182zfs_freebsd_lookup(ap)
6183	struct vop_lookup_args /* {
6184		struct vnode *a_dvp;
6185		struct vnode **a_vpp;
6186		struct componentname *a_cnp;
6187	} */ *ap;
6188{
6189	struct componentname *cnp = ap->a_cnp;
6190	char nm[NAME_MAX + 1];
6191
6192	ASSERT(cnp->cn_namelen < sizeof(nm));
6193	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
6194
6195	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
6196	    cnp->cn_cred, cnp->cn_thread, 0));
6197}
6198
6199static int
6200zfs_freebsd_create(ap)
6201	struct vop_create_args /* {
6202		struct vnode *a_dvp;
6203		struct vnode **a_vpp;
6204		struct componentname *a_cnp;
6205		struct vattr *a_vap;
6206	} */ *ap;
6207{
6208	struct componentname *cnp = ap->a_cnp;
6209	vattr_t *vap = ap->a_vap;
6210	int error, mode;
6211
6212	ASSERT(cnp->cn_flags & SAVENAME);
6213
6214	vattr_init_mask(vap);
6215	mode = vap->va_mode & ALLPERMS;
6216
6217	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
6218	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
6219#ifdef FREEBSD_NAMECACHE
6220	if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
6221		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
6222#endif
6223	return (error);
6224}
6225
6226static int
6227zfs_freebsd_remove(ap)
6228	struct vop_remove_args /* {
6229		struct vnode *a_dvp;
6230		struct vnode *a_vp;
6231		struct componentname *a_cnp;
6232	} */ *ap;
6233{
6234
6235	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
6236
6237	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
6238	    ap->a_cnp->cn_cred, NULL, 0));
6239}
6240
6241static int
6242zfs_freebsd_mkdir(ap)
6243	struct vop_mkdir_args /* {
6244		struct vnode *a_dvp;
6245		struct vnode **a_vpp;
6246		struct componentname *a_cnp;
6247		struct vattr *a_vap;
6248	} */ *ap;
6249{
6250	vattr_t *vap = ap->a_vap;
6251
6252	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
6253
6254	vattr_init_mask(vap);
6255
6256	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
6257	    ap->a_cnp->cn_cred, NULL, 0, NULL));
6258}
6259
6260static int
6261zfs_freebsd_rmdir(ap)
6262	struct vop_rmdir_args /* {
6263		struct vnode *a_dvp;
6264		struct vnode *a_vp;
6265		struct componentname *a_cnp;
6266	} */ *ap;
6267{
6268	struct componentname *cnp = ap->a_cnp;
6269
6270	ASSERT(cnp->cn_flags & SAVENAME);
6271
6272	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
6273}
6274
6275static int
6276zfs_freebsd_readdir(ap)
6277	struct vop_readdir_args /* {
6278		struct vnode *a_vp;
6279		struct uio *a_uio;
6280		struct ucred *a_cred;
6281		int *a_eofflag;
6282		int *a_ncookies;
6283		u_long **a_cookies;
6284	} */ *ap;
6285{
6286
6287	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
6288	    ap->a_ncookies, ap->a_cookies));
6289}
6290
6291static int
6292zfs_freebsd_fsync(ap)
6293	struct vop_fsync_args /* {
6294		struct vnode *a_vp;
6295		int a_waitfor;
6296		struct thread *a_td;
6297	} */ *ap;
6298{
6299
6300	vop_stdfsync(ap);
6301	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
6302}
6303
6304static int
6305zfs_freebsd_getattr(ap)
6306	struct vop_getattr_args /* {
6307		struct vnode *a_vp;
6308		struct vattr *a_vap;
6309		struct ucred *a_cred;
6310	} */ *ap;
6311{
6312	vattr_t *vap = ap->a_vap;
6313	xvattr_t xvap;
6314	u_long fflags = 0;
6315	int error;
6316
6317	xva_init(&xvap);
6318	xvap.xva_vattr = *vap;
6319	xvap.xva_vattr.va_mask |= AT_XVATTR;
6320
6321	/* Convert chflags into ZFS-type flags. */
6322	/* XXX: what about SF_SETTABLE? */
6323	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
6324	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
6325	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
6326	XVA_SET_REQ(&xvap, XAT_NODUMP);
6327	XVA_SET_REQ(&xvap, XAT_READONLY);
6328	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
6329	XVA_SET_REQ(&xvap, XAT_SYSTEM);
6330	XVA_SET_REQ(&xvap, XAT_HIDDEN);
6331	XVA_SET_REQ(&xvap, XAT_REPARSE);
6332	XVA_SET_REQ(&xvap, XAT_OFFLINE);
6333	XVA_SET_REQ(&xvap, XAT_SPARSE);
6334
6335	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
6336	if (error != 0)
6337		return (error);
6338
6339	/* Convert ZFS xattr into chflags. */
6340#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
6341	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
6342		fflags |= (fflag);					\
6343} while (0)
6344	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
6345	    xvap.xva_xoptattrs.xoa_immutable);
6346	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
6347	    xvap.xva_xoptattrs.xoa_appendonly);
6348	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
6349	    xvap.xva_xoptattrs.xoa_nounlink);
6350	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
6351	    xvap.xva_xoptattrs.xoa_archive);
6352	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
6353	    xvap.xva_xoptattrs.xoa_nodump);
6354	FLAG_CHECK(UF_READONLY, XAT_READONLY,
6355	    xvap.xva_xoptattrs.xoa_readonly);
6356	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
6357	    xvap.xva_xoptattrs.xoa_system);
6358	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
6359	    xvap.xva_xoptattrs.xoa_hidden);
6360	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
6361	    xvap.xva_xoptattrs.xoa_reparse);
6362	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
6363	    xvap.xva_xoptattrs.xoa_offline);
6364	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
6365	    xvap.xva_xoptattrs.xoa_sparse);
6366
6367#undef	FLAG_CHECK
6368	*vap = xvap.xva_vattr;
6369	vap->va_flags = fflags;
6370	return (0);
6371}
6372
6373static int
6374zfs_freebsd_setattr(ap)
6375	struct vop_setattr_args /* {
6376		struct vnode *a_vp;
6377		struct vattr *a_vap;
6378		struct ucred *a_cred;
6379	} */ *ap;
6380{
6381	vnode_t *vp = ap->a_vp;
6382	vattr_t *vap = ap->a_vap;
6383	cred_t *cred = ap->a_cred;
6384	xvattr_t xvap;
6385	u_long fflags;
6386	uint64_t zflags;
6387
6388	vattr_init_mask(vap);
6389	vap->va_mask &= ~AT_NOSET;
6390
6391	xva_init(&xvap);
6392	xvap.xva_vattr = *vap;
6393
6394	zflags = VTOZ(vp)->z_pflags;
6395
6396	if (vap->va_flags != VNOVAL) {
6397		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
6398		int error;
6399
6400		if (zfsvfs->z_use_fuids == B_FALSE)
6401			return (EOPNOTSUPP);
6402
6403		fflags = vap->va_flags;
6404		/*
6405		 * XXX KDM
6406		 * We need to figure out whether it makes sense to allow
6407		 * UF_REPARSE through, since we don't really have other
6408		 * facilities to handle reparse points and zfs_setattr()
6409		 * doesn't currently allow setting that attribute anyway.
6410		 */
6411		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
6412		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
6413		     UF_OFFLINE|UF_SPARSE)) != 0)
6414			return (EOPNOTSUPP);
6415		/*
6416		 * Unprivileged processes are not permitted to unset system
6417		 * flags, or modify flags if any system flags are set.
6418		 * Privileged non-jail processes may not modify system flags
6419		 * if securelevel > 0 and any existing system flags are set.
6420		 * Privileged jail processes behave like privileged non-jail
6421		 * processes if the security.jail.chflags_allowed sysctl is
6422		 * non-zero; otherwise, they behave like unprivileged
6423		 * processes.
6424		 */
6425		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
6426		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
6427			if (zflags &
6428			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6429				error = securelevel_gt(cred, 0);
6430				if (error != 0)
6431					return (error);
6432			}
6433		} else {
6434			/*
6435			 * Callers may only modify the file flags on objects they
6436			 * have VADMIN rights for.
6437			 */
6438			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
6439				return (error);
6440			if (zflags &
6441			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6442				return (EPERM);
6443			}
6444			if (fflags &
6445			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
6446				return (EPERM);
6447			}
6448		}
6449
6450#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
6451	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
6452	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
6453		XVA_SET_REQ(&xvap, (xflag));				\
6454		(xfield) = ((fflags & (fflag)) != 0);			\
6455	}								\
6456} while (0)
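		/*
		 * FLAG_CHANGE() requests an attribute update only for
		 * flags whose requested state (fflags) differs from the
		 * current ZFS state (zflags); the condition is an
		 * exclusive-or test per flag.
		 */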
6457		/* Convert chflags into ZFS-type flags. */
6458		/* XXX: what about SF_SETTABLE? */
6459		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
6460		    xvap.xva_xoptattrs.xoa_immutable);
6461		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
6462		    xvap.xva_xoptattrs.xoa_appendonly);
6463		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
6464		    xvap.xva_xoptattrs.xoa_nounlink);
6465		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
6466		    xvap.xva_xoptattrs.xoa_archive);
6467		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
6468		    xvap.xva_xoptattrs.xoa_nodump);
6469		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
6470		    xvap.xva_xoptattrs.xoa_readonly);
6471		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
6472		    xvap.xva_xoptattrs.xoa_system);
6473		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
6474		    xvap.xva_xoptattrs.xoa_hidden);
6475		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
6476		    xvap.xva_xoptattrs.xoa_reparse);
6477		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
6478		    xvap.xva_xoptattrs.xoa_offline);
6479		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
6480		    xvap.xva_xoptattrs.xoa_sparse);
6481#undef	FLAG_CHANGE
6482	}
6483	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
6484}
6485
6486static int
6487zfs_freebsd_rename(ap)
6488	struct vop_rename_args  /* {
6489		struct vnode *a_fdvp;
6490		struct vnode *a_fvp;
6491		struct componentname *a_fcnp;
6492		struct vnode *a_tdvp;
6493		struct vnode *a_tvp;
6494		struct componentname *a_tcnp;
6495	} */ *ap;
6496{
6497	vnode_t *fdvp = ap->a_fdvp;
6498	vnode_t *fvp = ap->a_fvp;
6499	vnode_t *tdvp = ap->a_tdvp;
6500	vnode_t *tvp = ap->a_tvp;
6501	int error;
6502
6503	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
6504	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
6505
6506	/*
6507	 * Check for cross-device rename.
6508	 */
6509	if ((fdvp->v_mount != tdvp->v_mount) ||
6510	    (tvp && (fdvp->v_mount != tvp->v_mount)))
6511		error = EXDEV;
6512	else
6513		error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
6514		    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
6515	if (tdvp == tvp)
6516		VN_RELE(tdvp);
6517	else
6518		VN_URELE(tdvp);
6519	if (tvp)
6520		VN_URELE(tvp);
6521	VN_RELE(fdvp);
6522	VN_RELE(fvp);
6523
6524	return (error);
6525}
6526
6527static int
6528zfs_freebsd_symlink(ap)
6529	struct vop_symlink_args /* {
6530		struct vnode *a_dvp;
6531		struct vnode **a_vpp;
6532		struct componentname *a_cnp;
6533		struct vattr *a_vap;
6534		char *a_target;
6535	} */ *ap;
6536{
6537	struct componentname *cnp = ap->a_cnp;
6538	vattr_t *vap = ap->a_vap;
6539
6540	ASSERT(cnp->cn_flags & SAVENAME);
6541
6542	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
6543	vattr_init_mask(vap);
6544
6545	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
6546	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
6547}
6548
6549static int
6550zfs_freebsd_readlink(ap)
6551	struct vop_readlink_args /* {
6552		struct vnode *a_vp;
6553		struct uio *a_uio;
6554		struct ucred *a_cred;
6555	} */ *ap;
6556{
6557
6558	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
6559}
6560
6561static int
6562zfs_freebsd_link(ap)
6563	struct vop_link_args /* {
6564		struct vnode *a_tdvp;
6565		struct vnode *a_vp;
6566		struct componentname *a_cnp;
6567	} */ *ap;
6568{
6569	struct componentname *cnp = ap->a_cnp;
6570	vnode_t *vp = ap->a_vp;
6571	vnode_t *tdvp = ap->a_tdvp;
6572
6573	if (tdvp->v_mount != vp->v_mount)
6574		return (EXDEV);
6575
6576	ASSERT(cnp->cn_flags & SAVENAME);
6577
6578	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
6579}
6580
6581static int
6582zfs_freebsd_inactive(ap)
6583	struct vop_inactive_args /* {
6584		struct vnode *a_vp;
6585		struct thread *a_td;
6586	} */ *ap;
6587{
6588	vnode_t *vp = ap->a_vp;
6589
6590	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
6591	return (0);
6592}
6593
6594static int
6595zfs_freebsd_reclaim(ap)
6596	struct vop_reclaim_args /* {
6597		struct vnode *a_vp;
6598		struct thread *a_td;
6599	} */ *ap;
6600{
6601	vnode_t	*vp = ap->a_vp;
6602	znode_t	*zp = VTOZ(vp);
6603	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6604
6605	ASSERT(zp != NULL);
6606
6607	/* Destroy the vm object and flush associated pages. */
6608	vnode_destroy_vobject(vp);
6609
6610	/*
6611	 * z_teardown_inactive_lock protects from a race with
6612	 * zfs_znode_dmu_fini in zfsvfs_teardown during
6613	 * force unmount.
6614	 */
6615	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6616	if (zp->z_sa_hdl == NULL)
6617		zfs_znode_free(zp);
6618	else
6619		zfs_zinactive(zp);
6620	rw_exit(&zfsvfs->z_teardown_inactive_lock);
6621
6622	vp->v_data = NULL;
6623	return (0);
6624}
6625
6626static int
6627zfs_freebsd_fid(ap)
6628	struct vop_fid_args /* {
6629		struct vnode *a_vp;
6630		struct fid *a_fid;
6631	} */ *ap;
6632{
6633
6634	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
6635}
6636
6637static int
6638zfs_freebsd_pathconf(ap)
6639	struct vop_pathconf_args /* {
6640		struct vnode *a_vp;
6641		int a_name;
6642		register_t *a_retval;
6643	} */ *ap;
6644{
6645	ulong_t val;
6646	int error;
6647
6648	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
6649	if (error == 0)
6650		*ap->a_retval = val;
6651	else if (error == EOPNOTSUPP)
6652		error = vop_stdpathconf(ap);
6653	return (error);
6654}
6655
6656static int
6657zfs_freebsd_fifo_pathconf(ap)
6658	struct vop_pathconf_args /* {
6659		struct vnode *a_vp;
6660		int a_name;
6661		register_t *a_retval;
6662	} */ *ap;
6663{
6664
6665	switch (ap->a_name) {
6666	case _PC_ACL_EXTENDED:
6667	case _PC_ACL_NFS4:
6668	case _PC_ACL_PATH_MAX:
6669	case _PC_MAC_PRESENT:
6670		return (zfs_freebsd_pathconf(ap));
6671	default:
6672		return (fifo_specops.vop_pathconf(ap));
6673	}
6674}
6675
6676/*
6677 * FreeBSD's extended attribute namespaces are mapped onto file name prefixes
6678 * for the underlying ZFS extended attribute names:
6679 *
6680 *	NAMESPACE	PREFIX
6681 *	system		freebsd:system:
6682 *	user		(none, can be used to access ZFS fsattr(5) attributes
6683 *			created on Solaris)
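 *
 *	For example (an illustrative attribute name, not from the source):
 *	a system-namespace attribute "md5" is stored under the ZFS name
 *	"freebsd:system:md5", while a user-namespace "md5" is stored as
 *	plain "md5".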
6684 */
6685static int
6686zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
6687    size_t size)
6688{
6689	const char *namespace, *prefix, *suffix;
6690
6691	/* We don't allow '/' character in attribute name. */
6692	if (strchr(name, '/') != NULL)
6693		return (EINVAL);
6694	/* We don't allow attribute names that start with "freebsd:" string. */
6695	if (strncmp(name, "freebsd:", 8) == 0)
6696		return (EINVAL);
6697
6698	bzero(attrname, size);
6699
6700	switch (attrnamespace) {
6701	case EXTATTR_NAMESPACE_USER:
6702#if 0
6703		prefix = "freebsd:";
6704		namespace = EXTATTR_NAMESPACE_USER_STRING;
6705		suffix = ":";
6706#else
6707		/*
6708		 * This is the default namespace by which we can access all
6709		 * attributes created on Solaris.
6710		 */
6711		prefix = namespace = suffix = "";
6712#endif
6713		break;
6714	case EXTATTR_NAMESPACE_SYSTEM:
6715		prefix = "freebsd:";
6716		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
6717		suffix = ":";
6718		break;
6719	case EXTATTR_NAMESPACE_EMPTY:
6720	default:
6721		return (EINVAL);
6722	}
6723	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
6724	    name) >= size) {
6725		return (ENAMETOOLONG);
6726	}
6727	return (0);
6728}
6729
6730/*
6731 * Vnode operation to retrieve a named extended attribute.
6732 */
6733static int
6734zfs_getextattr(struct vop_getextattr_args *ap)
6735/*
6736vop_getextattr {
6737	IN struct vnode *a_vp;
6738	IN int a_attrnamespace;
6739	IN const char *a_name;
6740	INOUT struct uio *a_uio;
6741	OUT size_t *a_size;
6742	IN struct ucred *a_cred;
6743	IN struct thread *a_td;
6744};
6745*/
6746{
6747	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6748	struct thread *td = ap->a_td;
6749	struct nameidata nd;
6750	char attrname[255];
6751	struct vattr va;
6752	vnode_t *xvp = NULL, *vp;
6753	int error, flags;
6754
6755	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6756	    ap->a_cred, ap->a_td, VREAD);
6757	if (error != 0)
6758		return (error);
6759
6760	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6761	    sizeof(attrname));
6762	if (error != 0)
6763		return (error);
6764
6765	ZFS_ENTER(zfsvfs);
6766
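	/*
	 * LOOKUP_XATTR resolves the hidden extended attribute directory
	 * attached to this vnode; the named attribute itself is then
	 * opened below as a regular file inside that directory.
	 */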
6767	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6768	    LOOKUP_XATTR);
6769	if (error != 0) {
6770		ZFS_EXIT(zfsvfs);
6771		return (error);
6772	}
6773
6774	flags = FREAD;
6775	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6776	    xvp, td);
6777	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
6778	vp = nd.ni_vp;
6779	NDFREE(&nd, NDF_ONLY_PNBUF);
6780	if (error != 0) {
6781		ZFS_EXIT(zfsvfs);
6782		if (error == ENOENT)
6783			error = ENOATTR;
6784		return (error);
6785	}
6786
6787	if (ap->a_size != NULL) {
6788		error = VOP_GETATTR(vp, &va, ap->a_cred);
6789		if (error == 0)
6790			*ap->a_size = (size_t)va.va_size;
6791	} else if (ap->a_uio != NULL)
6792		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6793
6794	VOP_UNLOCK(vp, 0);
6795	vn_close(vp, flags, ap->a_cred, td);
6796	ZFS_EXIT(zfsvfs);
6797
6798	return (error);
6799}
6800
6801/*
6802 * Vnode operation to remove a named attribute.
6803 */
6804int
6805zfs_deleteextattr(struct vop_deleteextattr_args *ap)
6806/*
6807vop_deleteextattr {
6808	IN struct vnode *a_vp;
6809	IN int a_attrnamespace;
6810	IN const char *a_name;
6811	IN struct ucred *a_cred;
6812	IN struct thread *a_td;
6813};
6814*/
6815{
6816	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6817	struct thread *td = ap->a_td;
6818	struct nameidata nd;
6819	char attrname[255];
6820	vnode_t *xvp = NULL, *vp;
6821	int error;
6823
6824	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6825	    ap->a_cred, ap->a_td, VWRITE);
6826	if (error != 0)
6827		return (error);
6828
6829	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6830	    sizeof(attrname));
6831	if (error != 0)
6832		return (error);
6833
6834	ZFS_ENTER(zfsvfs);
6835
6836	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6837	    LOOKUP_XATTR);
6838	if (error != 0) {
6839		ZFS_EXIT(zfsvfs);
6840		return (error);
6841	}
6842
6843	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
6844	    UIO_SYSSPACE, attrname, xvp, td);
6845	error = namei(&nd);
6846	vp = nd.ni_vp;
6847	if (error != 0) {
6848		ZFS_EXIT(zfsvfs);
6849		NDFREE(&nd, NDF_ONLY_PNBUF);
6850		if (error == ENOENT)
6851			error = ENOATTR;
6852		return (error);
6853	}
6854
6855	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
6856	NDFREE(&nd, NDF_ONLY_PNBUF);
6857
6858	vput(nd.ni_dvp);
6859	if (vp == nd.ni_dvp)
6860		vrele(vp);
6861	else
6862		vput(vp);
6863	ZFS_EXIT(zfsvfs);
6864
6865	return (error);
6866}
6867
6868/*
6869 * Vnode operation to set a named attribute.
6870 */
6871static int
6872zfs_setextattr(struct vop_setextattr_args *ap)
6873/*
6874vop_setextattr {
6875	IN struct vnode *a_vp;
6876	IN int a_attrnamespace;
6877	IN const char *a_name;
6878	INOUT struct uio *a_uio;
6879	IN struct ucred *a_cred;
6880	IN struct thread *a_td;
6881};
6882*/
6883{
6884	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6885	struct thread *td = ap->a_td;
6886	struct nameidata nd;
6887	char attrname[255];
6888	struct vattr va;
6889	vnode_t *xvp = NULL, *vp;
6890	int error, flags;
6891
6892	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6893	    ap->a_cred, ap->a_td, VWRITE);
6894	if (error != 0)
6895		return (error);
6896
6897	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6898	    sizeof(attrname));
6899	if (error != 0)
6900		return (error);
6901
6902	ZFS_ENTER(zfsvfs);
6903
6904	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6905	    LOOKUP_XATTR | CREATE_XATTR_DIR);
6906	if (error != 0) {
6907		ZFS_EXIT(zfsvfs);
6908		return (error);
6909	}
6910
6911	flags = FFLAGS(O_WRONLY | O_CREAT);
6912	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6913	    xvp, td);
6914	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
6915	vp = nd.ni_vp;
6916	NDFREE(&nd, NDF_ONLY_PNBUF);
6917	if (error != 0) {
6918		ZFS_EXIT(zfsvfs);
6919		return (error);
6920	}
6921
6922	VATTR_NULL(&va);
6923	va.va_size = 0;
6924	error = VOP_SETATTR(vp, &va, ap->a_cred);
6925	if (error == 0)
6926		error = VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6927
6928	VOP_UNLOCK(vp, 0);
6929	vn_close(vp, flags, ap->a_cred, td);
6930	ZFS_EXIT(zfsvfs);
6931
6932	return (error);
6933}
6934
6935/*
6936 * Vnode operation to list the extended attributes of a vnode.
6937 */
6938static int
6939zfs_listextattr(struct vop_listextattr_args *ap)
6940/*
6941vop_listextattr {
6942	IN struct vnode *a_vp;
6943	IN int a_attrnamespace;
6944	INOUT struct uio *a_uio;
6945	OUT size_t *a_size;
6946	IN struct ucred *a_cred;
6947	IN struct thread *a_td;
6948};
6949*/
6950{
6951	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6952	struct thread *td = ap->a_td;
6953	struct nameidata nd;
6954	char attrprefix[16];
6955	u_char dirbuf[sizeof(struct dirent)];
6956	struct dirent *dp;
6957	struct iovec aiov;
6958	struct uio auio, *uio = ap->a_uio;
6959	size_t *sizep = ap->a_size;
6960	size_t plen;
6961	vnode_t *xvp = NULL, *vp;
6962	int done, error, eof, pos;
6963
6964	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6965	    ap->a_cred, ap->a_td, VREAD);
6966	if (error != 0)
6967		return (error);
6968
6969	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
6970	    sizeof(attrprefix));
6971	if (error != 0)
6972		return (error);
6973	plen = strlen(attrprefix);
6974
6975	ZFS_ENTER(zfsvfs);
6976
6977	if (sizep != NULL)
6978		*sizep = 0;
6979
6980	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6981	    LOOKUP_XATTR);
6982	if (error != 0) {
6983		ZFS_EXIT(zfsvfs);
6984		/*
6985		 * ENOATTR means that the EA directory does not yet exist,
6986		 * i.e. there are no extended attributes there.
6987		 */
6988		if (error == ENOATTR)
6989			error = 0;
6990		return (error);
6991	}
6992
6993	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
6994	    UIO_SYSSPACE, ".", xvp, td);
6995	error = namei(&nd);
6996	vp = nd.ni_vp;
6997	NDFREE(&nd, NDF_ONLY_PNBUF);
6998	if (error != 0) {
6999		ZFS_EXIT(zfsvfs);
7000		return (error);
7001	}
7002
7003	auio.uio_iov = &aiov;
7004	auio.uio_iovcnt = 1;
7005	auio.uio_segflg = UIO_SYSSPACE;
7006	auio.uio_td = td;
7007	auio.uio_rw = UIO_READ;
7008	auio.uio_offset = 0;
7009
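	/*
	 * Scan the extended attribute directory with VOP_READDIR() in
	 * dirbuf-sized chunks until EOF, reporting each regular entry
	 * that carries the prefix of the requested namespace.
	 */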
7010	do {
7011		u_char nlen;
7012
7013		aiov.iov_base = (void *)dirbuf;
7014		aiov.iov_len = sizeof(dirbuf);
7015		auio.uio_resid = sizeof(dirbuf);
7016		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
7017		done = sizeof(dirbuf) - auio.uio_resid;
7018		if (error != 0)
7019			break;
7020		for (pos = 0; pos < done;) {
7021			dp = (struct dirent *)(dirbuf + pos);
7022			pos += dp->d_reclen;
7023			/*
7024			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
7025			 * is what we get when an attribute was created on Solaris.
7026			 */
7027			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
7028				continue;
7029			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
7030				continue;
7031			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
7032				continue;
7033			nlen = dp->d_namlen - plen;
7034			if (sizep != NULL)
7035				*sizep += 1 + nlen;
7036			else if (uio != NULL) {
7037				/*
7038				 * Format of extattr name entry is one byte for
7039				 * length and the rest for name.
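				 * For example, an attribute name "foo"
				 * (prefix already stripped) is emitted
				 * as the length byte 0x03 followed by
				 * the three characters "foo".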
7040				 */
7041				error = uiomove(&nlen, 1, uio->uio_rw, uio);
7042				if (error == 0) {
7043					error = uiomove(dp->d_name + plen, nlen,
7044					    uio->uio_rw, uio);
7045				}
7046				if (error != 0)
7047					break;
7048			}
7049		}
7050	} while (!eof && error == 0);
7051
7052	vput(vp);
7053	ZFS_EXIT(zfsvfs);
7054
7055	return (error);
7056}
7057
7058int
7059zfs_freebsd_getacl(ap)
7060	struct vop_getacl_args /* {
7061		struct vnode *vp;
7062		acl_type_t type;
7063		struct acl *aclp;
7064		struct ucred *cred;
7065		struct thread *td;
7066	} */ *ap;
7067{
7068	int		error;
7069	vsecattr_t      vsecattr;
7070
7071	if (ap->a_type != ACL_TYPE_NFS4)
7072		return (EINVAL);
7073
7074	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
7075	if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) != 0)
7076		return (error);
7077
7078	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
7079	if (vsecattr.vsa_aclentp != NULL)
7080		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
7081
7082	return (error);
7083}
7084
7085int
7086zfs_freebsd_setacl(ap)
7087	struct vop_setacl_args /* {
7088		struct vnode *vp;
7089		acl_type_t type;
7090		struct acl *aclp;
7091		struct ucred *cred;
7092		struct thread *td;
7093	} */ *ap;
7094{
7095	int		error;
7096	vsecattr_t      vsecattr;
7097	int		aclbsize;	/* size of acl list in bytes */
7098	aclent_t	*aaclp;
7099
7100	if (ap->a_type != ACL_TYPE_NFS4)
7101		return (EINVAL);
7102
7103	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
7104		return (EINVAL);
7105
7106	/*
7107	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
7108	 * splitting every entry into two and appending "canonical six"
7109	 * entries at the end.  Don't allow for setting an ACL that would
7110	 * cause chmod(2) to run out of ACL entries.
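	 * For example, with FreeBSD's ACL_MAX_ENTRIES of 254 this caps
	 * the caller at 124 entries (124 * 2 + 6 == 254).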
7111	 */
7112	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
7113		return (ENOSPC);
7114
7115	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
7116	if (error != 0)
7117		return (error);
7118
7119	vsecattr.vsa_mask = VSA_ACE;
7120	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
7121	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
7122	aaclp = vsecattr.vsa_aclentp;
7123	vsecattr.vsa_aclentsz = aclbsize;
7124
7125	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
7126	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
7127	kmem_free(aaclp, aclbsize);
7128
7129	return (error);
7130}
7131
7132int
7133zfs_freebsd_aclcheck(ap)
7134	struct vop_aclcheck_args /* {
7135		struct vnode *vp;
7136		acl_type_t type;
7137		struct acl *aclp;
7138		struct ucred *cred;
7139		struct thread *td;
7140	} */ *ap;
7141{
7142
7143	return (EOPNOTSUPP);
7144}
7145
7146struct vop_vector zfs_vnodeops;
7147struct vop_vector zfs_fifoops;
7148struct vop_vector zfs_shareops;
7149
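/*
 * Operations not listed in these templates fall through to the
 * vop_default vector, so zfs_vnodeops inherits the standard FreeBSD
 * implementations from default_vnodeops.
 */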
7150struct vop_vector zfs_vnodeops = {
7151	.vop_default =		&default_vnodeops,
7152	.vop_inactive =		zfs_freebsd_inactive,
7153	.vop_reclaim =		zfs_freebsd_reclaim,
7154	.vop_access =		zfs_freebsd_access,
7155#ifdef FREEBSD_NAMECACHE
7156	.vop_lookup =		vfs_cache_lookup,
7157	.vop_cachedlookup =	zfs_freebsd_lookup,
7158#else
7159	.vop_lookup =		zfs_freebsd_lookup,
7160#endif
7161	.vop_getattr =		zfs_freebsd_getattr,
7162	.vop_setattr =		zfs_freebsd_setattr,
7163	.vop_create =		zfs_freebsd_create,
7164	.vop_mknod =		zfs_freebsd_create,
7165	.vop_mkdir =		zfs_freebsd_mkdir,
7166	.vop_readdir =		zfs_freebsd_readdir,
7167	.vop_fsync =		zfs_freebsd_fsync,
7168	.vop_open =		zfs_freebsd_open,
7169	.vop_close =		zfs_freebsd_close,
7170	.vop_rmdir =		zfs_freebsd_rmdir,
7171	.vop_ioctl =		zfs_freebsd_ioctl,
7172	.vop_link =		zfs_freebsd_link,
7173	.vop_symlink =		zfs_freebsd_symlink,
7174	.vop_readlink =		zfs_freebsd_readlink,
7175	.vop_read =		zfs_freebsd_read,
7176	.vop_write =		zfs_freebsd_write,
7177	.vop_remove =		zfs_freebsd_remove,
7178	.vop_rename =		zfs_freebsd_rename,
7179	.vop_pathconf =		zfs_freebsd_pathconf,
7180	.vop_bmap =		zfs_freebsd_bmap,
7181	.vop_fid =		zfs_freebsd_fid,
7182	.vop_getextattr =	zfs_getextattr,
7183	.vop_deleteextattr =	zfs_deleteextattr,
7184	.vop_setextattr =	zfs_setextattr,
7185	.vop_listextattr =	zfs_listextattr,
7186	.vop_getacl =		zfs_freebsd_getacl,
7187	.vop_setacl =		zfs_freebsd_setacl,
7188	.vop_aclcheck =		zfs_freebsd_aclcheck,
7189	.vop_getpages =		zfs_freebsd_getpages,
7190	.vop_putpages =		zfs_freebsd_putpages,
7191};
7192
7193struct vop_vector zfs_fifoops = {
7194	.vop_default =		&fifo_specops,
7195	.vop_fsync =		zfs_freebsd_fsync,
7196	.vop_access =		zfs_freebsd_access,
7197	.vop_getattr =		zfs_freebsd_getattr,
7198	.vop_inactive =		zfs_freebsd_inactive,
7199	.vop_read =		VOP_PANIC,
7200	.vop_reclaim =		zfs_freebsd_reclaim,
7201	.vop_setattr =		zfs_freebsd_setattr,
7202	.vop_write =		VOP_PANIC,
7203	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
7204	.vop_fid =		zfs_freebsd_fid,
7205	.vop_getacl =		zfs_freebsd_getacl,
7206	.vop_setacl =		zfs_freebsd_setacl,
7207	.vop_aclcheck =		zfs_freebsd_aclcheck,
7208};
7209
7210/*
7211 * Vnode operations template for the special hidden "share" files.
7212 */
7213struct vop_vector zfs_shareops = {
7214	.vop_default =		&default_vnodeops,
7215	.vop_access =		zfs_freebsd_access,
7216	.vop_inactive =		zfs_freebsd_inactive,
7217	.vop_reclaim =		zfs_freebsd_reclaim,
7218	.vop_fid =		zfs_freebsd_fid,
7219	.vop_pathconf =		zfs_freebsd_pathconf,
7220};
7221