zfs_vnops.c revision 304121
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28/* Portions Copyright 2007 Jeremy Teo */
29/* Portions Copyright 2010 Robert Milkowski */
30
31#include <sys/types.h>
32#include <sys/param.h>
33#include <sys/time.h>
34#include <sys/systm.h>
35#include <sys/sysmacros.h>
36#include <sys/resource.h>
37#include <sys/vfs.h>
38#include <sys/vm.h>
39#include <sys/vnode.h>
40#include <sys/file.h>
41#include <sys/stat.h>
42#include <sys/kmem.h>
43#include <sys/taskq.h>
44#include <sys/uio.h>
45#include <sys/atomic.h>
46#include <sys/namei.h>
47#include <sys/mman.h>
48#include <sys/cmn_err.h>
49#include <sys/errno.h>
50#include <sys/unistd.h>
51#include <sys/zfs_dir.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/fs/zfs.h>
54#include <sys/dmu.h>
55#include <sys/dmu_objset.h>
56#include <sys/spa.h>
57#include <sys/txg.h>
58#include <sys/dbuf.h>
59#include <sys/zap.h>
60#include <sys/sa.h>
61#include <sys/dirent.h>
62#include <sys/policy.h>
63#include <sys/sunddi.h>
64#include <sys/filio.h>
65#include <sys/sid.h>
66#include <sys/zfs_ctldir.h>
67#include <sys/zfs_fuid.h>
68#include <sys/zfs_sa.h>
69#include <sys/dnlc.h>
70#include <sys/zfs_rlock.h>
71#include <sys/extdirent.h>
72#include <sys/kidmap.h>
73#include <sys/bio.h>
74#include <sys/buf.h>
75#include <sys/sched.h>
76#include <sys/acl.h>
77#include <vm/vm_param.h>
78
79/*
80 * Programming rules.
81 *
82 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
83 * properly lock its in-core state, create a DMU transaction, do the work,
84 * record this work in the intent log (ZIL), commit the DMU transaction,
85 * and wait for the intent log to commit if it is a synchronous operation.
86 * Moreover, the vnode ops must work in both normal and log replay context.
87 * The ordering of events is important to avoid deadlocks and references
88 * to freed memory.  The example below illustrates the following Big Rules:
89 *
90 *  (1)	A check must be made in each zfs thread for a mounted file system.
91 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
92 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
93 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
94 *	can return EIO from the calling function.
95 *
96 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
97 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
98 *	First, if it's the last reference, the vnode/znode
99 *	can be freed, so the zp may point to freed memory.  Second, the last
100 *	reference will call zfs_zinactive(), which may induce a lot of work --
101 *	pushing cached pages (which acquires range locks) and syncing out
102 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
103 *	which could deadlock the system if you were already holding one.
104 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
105 *
106 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
107 *	as they can span dmu_tx_assign() calls.
108 *
109 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
110 *      dmu_tx_assign().  This is critical because we don't want to block
111 *      while holding locks.
112 *
113 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
114 *	reduces lock contention and CPU usage when we must wait (note that if
115 *	throughput is constrained by the storage, nearly every transaction
116 *	must wait).
117 *
118 *      Note, in particular, that if a lock is sometimes acquired before
119 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
120 *      to use a non-blocking assign can deadlock the system.  The scenario:
121 *
122 *	Thread A has grabbed a lock before calling dmu_tx_assign().
123 *	Thread B is in an already-assigned tx, and blocks for this lock.
124 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
125 *	forever, because the previous txg can't quiesce until B's tx commits.
126 *
127 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
128 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
129 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
130 *	to indicate that this operation has already called dmu_tx_wait().
131 *	This will ensure that we don't retry forever, waiting a short bit
132 *	each time.
133 *
134 *  (5)	If the operation succeeded, generate the intent log entry for it
135 *	before dropping locks.  This ensures that the ordering of events
136 *	in the intent log matches the order in which they actually occurred.
137 *	During ZIL replay the zfs_log_* functions will update the sequence
138 *	number to indicate the zil transaction has replayed.
139 *
140 *  (6)	At the end of each vnode op, the DMU tx must always commit,
141 *	regardless of whether there were any errors.
142 *
143 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
144 *	to ensure that synchronous semantics are provided when necessary.
145 *
146 * In general, this is how things should be ordered in each vnode op:
147 *
148 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
149 * top:
150 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
151 *	rw_enter(...);			// grab any other locks you need
152 *	tx = dmu_tx_create(...);	// get DMU tx
153 *	dmu_tx_hold_*();		// hold each object you might modify
154 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
155 *	if (error) {
156 *		rw_exit(...);		// drop locks
157 *		zfs_dirent_unlock(dl);	// unlock directory entry
158 *		VN_RELE(...);		// release held vnodes
159 *		if (error == ERESTART) {
160 *			waited = B_TRUE;
161 *			dmu_tx_wait(tx);
162 *			dmu_tx_abort(tx);
163 *			goto top;
164 *		}
165 *		dmu_tx_abort(tx);	// abort DMU tx
166 *		ZFS_EXIT(zfsvfs);	// finished in zfs
167 *		return (error);		// really out of space
168 *	}
169 *	error = do_real_work();		// do whatever this VOP does
170 *	if (error == 0)
171 *		zfs_log_*(...);		// on success, make ZIL entry
172 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
173 *	rw_exit(...);			// drop locks
174 *	zfs_dirent_unlock(dl);		// unlock directory entry
175 *	VN_RELE(...);			// release held vnodes
176 *	zil_commit(zilog, foid);	// synchronous when necessary
177 *	ZFS_EXIT(zfsvfs);		// finished in zfs
178 *	return (error);			// done, report error
179 */
180
181/* ARGSUSED */
182static int
183zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
184{
185	znode_t	*zp = VTOZ(*vpp);
186	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
187
188	ZFS_ENTER(zfsvfs);
189	ZFS_VERIFY_ZP(zp);
190
191	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
192	    ((flag & FAPPEND) == 0)) {
193		ZFS_EXIT(zfsvfs);
194		return (SET_ERROR(EPERM));
195	}
196
197	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
198	    ZTOV(zp)->v_type == VREG &&
199	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
200		if (fs_vscan(*vpp, cr, 0) != 0) {
201			ZFS_EXIT(zfsvfs);
202			return (SET_ERROR(EACCES));
203		}
204	}
205
206	/* Keep a count of the synchronous opens in the znode */
207	if (flag & (FSYNC | FDSYNC))
208		atomic_inc_32(&zp->z_sync_cnt);
209
210	ZFS_EXIT(zfsvfs);
211	return (0);
212}
213
214/* ARGSUSED */
215static int
216zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
217    caller_context_t *ct)
218{
219	znode_t	*zp = VTOZ(vp);
220	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
221
222	/*
223	 * Clean up any locks held by this process on the vp.
224	 */
225	cleanlocks(vp, ddi_get_pid(), 0);
226	cleanshares(vp, ddi_get_pid());
227
228	ZFS_ENTER(zfsvfs);
229	ZFS_VERIFY_ZP(zp);
230
231	/* Decrement the synchronous opens in the znode */
232	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
233		atomic_dec_32(&zp->z_sync_cnt);
234
235	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
236	    ZTOV(zp)->v_type == VREG &&
237	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
238		VERIFY(fs_vscan(vp, cr, 1) == 0);
239
240	ZFS_EXIT(zfsvfs);
241	return (0);
242}
243
244/*
245 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
246 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
247 */
248static int
249zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
250{
251	znode_t	*zp = VTOZ(vp);
252	uint64_t noff = (uint64_t)*off; /* new offset */
253	uint64_t file_sz;
254	int error;
255	boolean_t hole;
256
257	file_sz = zp->z_size;
258	if (noff >= file_sz)  {
259		return (SET_ERROR(ENXIO));
260	}
261
262	if (cmd == _FIO_SEEK_HOLE)
263		hole = B_TRUE;
264	else
265		hole = B_FALSE;
266
267	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
268
269	if (error == ESRCH)
270		return (SET_ERROR(ENXIO));
271
272	/*
273	 * We could find a hole that begins after the logical end-of-file,
274	 * because dmu_offset_next() only works on whole blocks.  If the
275	 * EOF falls mid-block, then indicate that the "virtual hole"
276	 * at the end of the file begins at the logical EOF, rather than
277	 * at the end of the last block.
278	 */
279	if (noff > file_sz) {
280		ASSERT(hole);
281		noff = file_sz;
282	}
283
284	if (noff < *off)
285		return (error);
286	*off = noff;
287	return (error);
288}
289
290/* ARGSUSED */
291static int
292zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
293    int *rvalp, caller_context_t *ct)
294{
295	offset_t off;
296	offset_t ndata;
297	dmu_object_info_t doi;
298	int error;
299	zfsvfs_t *zfsvfs;
300	znode_t *zp;
301
302	switch (com) {
303	case _FIOFFS:
304	{
305		return (0);
306
307		/*
308		 * The following two ioctls are used by bfu.  Faking out,
309		 * necessary to avoid bfu errors.
310		 */
311	}
312	case _FIOGDIO:
313	case _FIOSDIO:
314	{
315		return (0);
316	}
317
318	case _FIO_SEEK_DATA:
319	case _FIO_SEEK_HOLE:
320	{
321#ifdef illumos
322		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
323			return (SET_ERROR(EFAULT));
324#else
325		off = *(offset_t *)data;
326#endif
327		zp = VTOZ(vp);
328		zfsvfs = zp->z_zfsvfs;
329		ZFS_ENTER(zfsvfs);
330		ZFS_VERIFY_ZP(zp);
331
332		/* offset parameter is in/out */
333		error = zfs_holey(vp, com, &off);
334		ZFS_EXIT(zfsvfs);
335		if (error)
336			return (error);
337#ifdef illumos
338		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
339			return (SET_ERROR(EFAULT));
340#else
341		*(offset_t *)data = off;
342#endif
343		return (0);
344	}
345#ifdef illumos
346	case _FIO_COUNT_FILLED:
347	{
348		/*
349		 * _FIO_COUNT_FILLED adds a new ioctl command which
350		 * exposes the number of filled blocks in a
351		 * ZFS object.
352		 */
353		zp = VTOZ(vp);
354		zfsvfs = zp->z_zfsvfs;
355		ZFS_ENTER(zfsvfs);
356		ZFS_VERIFY_ZP(zp);
357
358		/*
359		 * Wait for all dirty blocks for this object
360		 * to get synced out to disk, and the DMU info
361		 * updated.
362		 */
363		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
364		if (error) {
365			ZFS_EXIT(zfsvfs);
366			return (error);
367		}
368
369		/*
370		 * Retrieve fill count from DMU object.
371		 */
372		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
373		if (error) {
374			ZFS_EXIT(zfsvfs);
375			return (error);
376		}
377
378		ndata = doi.doi_fill_count;
379
380		ZFS_EXIT(zfsvfs);
381		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
382			return (SET_ERROR(EFAULT));
383		return (0);
384	}
385#endif
386	}
387	return (SET_ERROR(ENOTTY));
388}
389
390static vm_page_t
391page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
392{
393	vm_object_t obj;
394	vm_page_t pp;
395	int64_t end;
396
397	/*
398	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
399	 * aligned boundaries, if the range is not aligned.  As a result a
400	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
401	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
402	 * the whole page would be considred clean despite have some dirty data.
403	 * For this reason we should shrink the range to DEV_BSIZE aligned
404	 * boundaries before calling vm_page_clear_dirty.
405	 */
406	end = rounddown2(off + nbytes, DEV_BSIZE);
407	off = roundup2(off, DEV_BSIZE);
408	nbytes = end - off;
409
410	obj = vp->v_object;
411	zfs_vmobject_assert_wlocked(obj);
412
413	for (;;) {
414		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
415		    pp->valid) {
416			if (vm_page_xbusied(pp)) {
417				/*
418				 * Reference the page before unlocking and
419				 * sleeping so that the page daemon is less
420				 * likely to reclaim it.
421				 */
422				vm_page_reference(pp);
423				vm_page_lock(pp);
424				zfs_vmobject_wunlock(obj);
425				vm_page_busy_sleep(pp, "zfsmwb");
426				zfs_vmobject_wlock(obj);
427				continue;
428			}
429			vm_page_sbusy(pp);
430		} else if (pp == NULL) {
431			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
432			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
433			    VM_ALLOC_SBUSY);
434		} else {
435			ASSERT(pp != NULL && !pp->valid);
436			pp = NULL;
437		}
438
439		if (pp != NULL) {
440			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
441			vm_object_pip_add(obj, 1);
442			pmap_remove_write(pp);
443			if (nbytes != 0)
444				vm_page_clear_dirty(pp, off, nbytes);
445		}
446		break;
447	}
448	return (pp);
449}
450
451static void
452page_unbusy(vm_page_t pp)
453{
454
455	vm_page_sunbusy(pp);
456	vm_object_pip_subtract(pp->object, 1);
457}
458
459static vm_page_t
460page_hold(vnode_t *vp, int64_t start)
461{
462	vm_object_t obj;
463	vm_page_t pp;
464
465	obj = vp->v_object;
466	zfs_vmobject_assert_wlocked(obj);
467
468	for (;;) {
469		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
470		    pp->valid) {
471			if (vm_page_xbusied(pp)) {
472				/*
473				 * Reference the page before unlocking and
474				 * sleeping so that the page daemon is less
475				 * likely to reclaim it.
476				 */
477				vm_page_reference(pp);
478				vm_page_lock(pp);
479				zfs_vmobject_wunlock(obj);
480				vm_page_busy_sleep(pp, "zfsmwb");
481				zfs_vmobject_wlock(obj);
482				continue;
483			}
484
485			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
486			vm_page_lock(pp);
487			vm_page_hold(pp);
488			vm_page_unlock(pp);
489
490		} else
491			pp = NULL;
492		break;
493	}
494	return (pp);
495}
496
497static void
498page_unhold(vm_page_t pp)
499{
500
501	vm_page_lock(pp);
502	vm_page_unhold(pp);
503	vm_page_unlock(pp);
504}
505
506/*
507 * When a file is memory mapped, we must keep the IO data synchronized
508 * between the DMU cache and the memory mapped pages.  What this means:
509 *
510 * On Write:	If we find a memory mapped page, we write to *both*
511 *		the page and the dmu buffer.
512 */
513static void
514update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
515    int segflg, dmu_tx_t *tx)
516{
517	vm_object_t obj;
518	struct sf_buf *sf;
519	caddr_t va;
520	int off;
521
522	ASSERT(segflg != UIO_NOCOPY);
523	ASSERT(vp->v_mount != NULL);
524	obj = vp->v_object;
525	ASSERT(obj != NULL);
526
527	off = start & PAGEOFFSET;
528	zfs_vmobject_wlock(obj);
529	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
530		vm_page_t pp;
531		int nbytes = imin(PAGESIZE - off, len);
532
533		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
534			zfs_vmobject_wunlock(obj);
535
536			va = zfs_map_page(pp, &sf);
537			(void) dmu_read(os, oid, start+off, nbytes,
538			    va+off, DMU_READ_PREFETCH);;
539			zfs_unmap_page(sf);
540
541			zfs_vmobject_wlock(obj);
542			page_unbusy(pp);
543		}
544		len -= nbytes;
545		off = 0;
546	}
547	vm_object_pip_wakeupn(obj, 0);
548	zfs_vmobject_wunlock(obj);
549}
550
551/*
552 * Read with UIO_NOCOPY flag means that sendfile(2) requests
553 * ZFS to populate a range of page cache pages with data.
554 *
555 * NOTE: this function could be optimized to pre-allocate
556 * all pages in advance, drain exclusive busy on all of them,
557 * map them into contiguous KVA region and populate them
558 * in one single dmu_read() call.
559 */
560static int
561mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
562{
563	znode_t *zp = VTOZ(vp);
564	objset_t *os = zp->z_zfsvfs->z_os;
565	struct sf_buf *sf;
566	vm_object_t obj;
567	vm_page_t pp;
568	int64_t start;
569	caddr_t va;
570	int len = nbytes;
571	int off;
572	int error = 0;
573
574	ASSERT(uio->uio_segflg == UIO_NOCOPY);
575	ASSERT(vp->v_mount != NULL);
576	obj = vp->v_object;
577	ASSERT(obj != NULL);
578	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
579
580	zfs_vmobject_wlock(obj);
581	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
582		int bytes = MIN(PAGESIZE, len);
583
584		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
585		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
586		if (pp->valid == 0) {
587			zfs_vmobject_wunlock(obj);
588			va = zfs_map_page(pp, &sf);
589			error = dmu_read(os, zp->z_id, start, bytes, va,
590			    DMU_READ_PREFETCH);
591			if (bytes != PAGESIZE && error == 0)
592				bzero(va + bytes, PAGESIZE - bytes);
593			zfs_unmap_page(sf);
594			zfs_vmobject_wlock(obj);
595			vm_page_sunbusy(pp);
596			vm_page_lock(pp);
597			if (error) {
598				if (pp->wire_count == 0 && pp->valid == 0 &&
599				    !vm_page_busied(pp))
600					vm_page_free(pp);
601			} else {
602				pp->valid = VM_PAGE_BITS_ALL;
603				vm_page_activate(pp);
604			}
605			vm_page_unlock(pp);
606		} else {
607			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
608			vm_page_sunbusy(pp);
609		}
610		if (error)
611			break;
612		uio->uio_resid -= bytes;
613		uio->uio_offset += bytes;
614		len -= bytes;
615	}
616	zfs_vmobject_wunlock(obj);
617	return (error);
618}
619
620/*
621 * When a file is memory mapped, we must keep the IO data synchronized
622 * between the DMU cache and the memory mapped pages.  What this means:
623 *
624 * On Read:	We "read" preferentially from memory mapped pages,
625 *		else we default from the dmu buffer.
626 *
627 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
628 *	 the file is memory mapped.
629 */
630static int
631mappedread(vnode_t *vp, int nbytes, uio_t *uio)
632{
633	znode_t *zp = VTOZ(vp);
634	vm_object_t obj;
635	int64_t start;
636	caddr_t va;
637	int len = nbytes;
638	int off;
639	int error = 0;
640
641	ASSERT(vp->v_mount != NULL);
642	obj = vp->v_object;
643	ASSERT(obj != NULL);
644
645	start = uio->uio_loffset;
646	off = start & PAGEOFFSET;
647	zfs_vmobject_wlock(obj);
648	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
649		vm_page_t pp;
650		uint64_t bytes = MIN(PAGESIZE - off, len);
651
652		if (pp = page_hold(vp, start)) {
653			struct sf_buf *sf;
654			caddr_t va;
655
656			zfs_vmobject_wunlock(obj);
657			va = zfs_map_page(pp, &sf);
658#ifdef illumos
659			error = uiomove(va + off, bytes, UIO_READ, uio);
660#else
661			error = vn_io_fault_uiomove(va + off, bytes, uio);
662#endif
663			zfs_unmap_page(sf);
664			zfs_vmobject_wlock(obj);
665			page_unhold(pp);
666		} else {
667			zfs_vmobject_wunlock(obj);
668			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
669			    uio, bytes);
670			zfs_vmobject_wlock(obj);
671		}
672		len -= bytes;
673		off = 0;
674		if (error)
675			break;
676	}
677	zfs_vmobject_wunlock(obj);
678	return (error);
679}
680
681offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
682
683/*
684 * Read bytes from specified file into supplied buffer.
685 *
686 *	IN:	vp	- vnode of file to be read from.
687 *		uio	- structure supplying read location, range info,
688 *			  and return buffer.
689 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
690 *		cr	- credentials of caller.
691 *		ct	- caller context
692 *
693 *	OUT:	uio	- updated offset and range, buffer filled.
694 *
695 *	RETURN:	0 on success, error code on failure.
696 *
697 * Side Effects:
698 *	vp - atime updated if byte count > 0
699 */
700/* ARGSUSED */
701static int
702zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
703{
704	znode_t		*zp = VTOZ(vp);
705	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
706	ssize_t		n, nbytes;
707	int		error = 0;
708	rl_t		*rl;
709	xuio_t		*xuio = NULL;
710
711	ZFS_ENTER(zfsvfs);
712	ZFS_VERIFY_ZP(zp);
713
714	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
715		ZFS_EXIT(zfsvfs);
716		return (SET_ERROR(EACCES));
717	}
718
719	/*
720	 * Validate file offset
721	 */
722	if (uio->uio_loffset < (offset_t)0) {
723		ZFS_EXIT(zfsvfs);
724		return (SET_ERROR(EINVAL));
725	}
726
727	/*
728	 * Fasttrack empty reads
729	 */
730	if (uio->uio_resid == 0) {
731		ZFS_EXIT(zfsvfs);
732		return (0);
733	}
734
735	/*
736	 * Check for mandatory locks
737	 */
738	if (MANDMODE(zp->z_mode)) {
739		if (error = chklock(vp, FREAD,
740		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
741			ZFS_EXIT(zfsvfs);
742			return (error);
743		}
744	}
745
746	/*
747	 * If we're in FRSYNC mode, sync out this znode before reading it.
748	 */
749	if (zfsvfs->z_log &&
750	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
751		zil_commit(zfsvfs->z_log, zp->z_id);
752
753	/*
754	 * Lock the range against changes.
755	 */
756	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
757
758	/*
759	 * If we are reading past end-of-file we can skip
760	 * to the end; but we might still need to set atime.
761	 */
762	if (uio->uio_loffset >= zp->z_size) {
763		error = 0;
764		goto out;
765	}
766
767	ASSERT(uio->uio_loffset < zp->z_size);
768	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
769
770#ifdef illumos
771	if ((uio->uio_extflg == UIO_XUIO) &&
772	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
773		int nblk;
774		int blksz = zp->z_blksz;
775		uint64_t offset = uio->uio_loffset;
776
777		xuio = (xuio_t *)uio;
778		if ((ISP2(blksz))) {
779			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
780			    blksz)) / blksz;
781		} else {
782			ASSERT(offset + n <= blksz);
783			nblk = 1;
784		}
785		(void) dmu_xuio_init(xuio, nblk);
786
787		if (vn_has_cached_data(vp)) {
788			/*
789			 * For simplicity, we always allocate a full buffer
790			 * even if we only expect to read a portion of a block.
791			 */
792			while (--nblk >= 0) {
793				(void) dmu_xuio_add(xuio,
794				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
795				    blksz), 0, blksz);
796			}
797		}
798	}
799#endif	/* illumos */
800
801	while (n > 0) {
802		nbytes = MIN(n, zfs_read_chunk_size -
803		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
804
805#ifdef __FreeBSD__
806		if (uio->uio_segflg == UIO_NOCOPY)
807			error = mappedread_sf(vp, nbytes, uio);
808		else
809#endif /* __FreeBSD__ */
810		if (vn_has_cached_data(vp)) {
811			error = mappedread(vp, nbytes, uio);
812		} else {
813			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
814			    uio, nbytes);
815		}
816		if (error) {
817			/* convert checksum errors into IO errors */
818			if (error == ECKSUM)
819				error = SET_ERROR(EIO);
820			break;
821		}
822
823		n -= nbytes;
824	}
825out:
826	zfs_range_unlock(rl);
827
828	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
829	ZFS_EXIT(zfsvfs);
830	return (error);
831}
832
833/*
834 * Write the bytes to a file.
835 *
836 *	IN:	vp	- vnode of file to be written to.
837 *		uio	- structure supplying write location, range info,
838 *			  and data buffer.
839 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
840 *			  set if in append mode.
841 *		cr	- credentials of caller.
842 *		ct	- caller context (NFS/CIFS fem monitor only)
843 *
844 *	OUT:	uio	- updated offset and range.
845 *
846 *	RETURN:	0 on success, error code on failure.
847 *
848 * Timestamps:
849 *	vp - ctime|mtime updated if byte count > 0
850 */
851
852/* ARGSUSED */
853static int
854zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
855{
856	znode_t		*zp = VTOZ(vp);
857	rlim64_t	limit = MAXOFFSET_T;
858	ssize_t		start_resid = uio->uio_resid;
859	ssize_t		tx_bytes;
860	uint64_t	end_size;
861	dmu_tx_t	*tx;
862	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
863	zilog_t		*zilog;
864	offset_t	woff;
865	ssize_t		n, nbytes;
866	rl_t		*rl;
867	int		max_blksz = zfsvfs->z_max_blksz;
868	int		error = 0;
869	arc_buf_t	*abuf;
870	iovec_t		*aiov = NULL;
871	xuio_t		*xuio = NULL;
872	int		i_iov = 0;
873	int		iovcnt = uio->uio_iovcnt;
874	iovec_t		*iovp = uio->uio_iov;
875	int		write_eof;
876	int		count = 0;
877	sa_bulk_attr_t	bulk[4];
878	uint64_t	mtime[2], ctime[2];
879
880	/*
881	 * Fasttrack empty write
882	 */
883	n = start_resid;
884	if (n == 0)
885		return (0);
886
887	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
888		limit = MAXOFFSET_T;
889
890	ZFS_ENTER(zfsvfs);
891	ZFS_VERIFY_ZP(zp);
892
893	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
894	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
895	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
896	    &zp->z_size, 8);
897	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
898	    &zp->z_pflags, 8);
899
900	/*
901	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
902	 * callers might not be able to detect properly that we are read-only,
903	 * so check it explicitly here.
904	 */
905	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
906		ZFS_EXIT(zfsvfs);
907		return (SET_ERROR(EROFS));
908	}
909
910	/*
911	 * If immutable or not appending then return EPERM
912	 */
913	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
914	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
915	    (uio->uio_loffset < zp->z_size))) {
916		ZFS_EXIT(zfsvfs);
917		return (SET_ERROR(EPERM));
918	}
919
920	zilog = zfsvfs->z_log;
921
922	/*
923	 * Validate file offset
924	 */
925	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
926	if (woff < 0) {
927		ZFS_EXIT(zfsvfs);
928		return (SET_ERROR(EINVAL));
929	}
930
931	/*
932	 * Check for mandatory locks before calling zfs_range_lock()
933	 * in order to prevent a deadlock with locks set via fcntl().
934	 */
935	if (MANDMODE((mode_t)zp->z_mode) &&
936	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
937		ZFS_EXIT(zfsvfs);
938		return (error);
939	}
940
941#ifdef illumos
942	/*
943	 * Pre-fault the pages to ensure slow (eg NFS) pages
944	 * don't hold up txg.
945	 * Skip this if uio contains loaned arc_buf.
946	 */
947	if ((uio->uio_extflg == UIO_XUIO) &&
948	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
949		xuio = (xuio_t *)uio;
950	else
951		uio_prefaultpages(MIN(n, max_blksz), uio);
952#endif
953
954	/*
955	 * If in append mode, set the io offset pointer to eof.
956	 */
957	if (ioflag & FAPPEND) {
958		/*
959		 * Obtain an appending range lock to guarantee file append
960		 * semantics.  We reset the write offset once we have the lock.
961		 */
962		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
963		woff = rl->r_off;
964		if (rl->r_len == UINT64_MAX) {
965			/*
966			 * We overlocked the file because this write will cause
967			 * the file block size to increase.
968			 * Note that zp_size cannot change with this lock held.
969			 */
970			woff = zp->z_size;
971		}
972		uio->uio_loffset = woff;
973	} else {
974		/*
975		 * Note that if the file block size will change as a result of
976		 * this write, then this range lock will lock the entire file
977		 * so that we can re-write the block safely.
978		 */
979		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
980	}
981
982	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
983		zfs_range_unlock(rl);
984		ZFS_EXIT(zfsvfs);
985		return (EFBIG);
986	}
987
988	if (woff >= limit) {
989		zfs_range_unlock(rl);
990		ZFS_EXIT(zfsvfs);
991		return (SET_ERROR(EFBIG));
992	}
993
994	if ((woff + n) > limit || woff > (limit - n))
995		n = limit - woff;
996
997	/* Will this write extend the file length? */
998	write_eof = (woff + n > zp->z_size);
999
1000	end_size = MAX(zp->z_size, woff + n);
1001
1002	/*
1003	 * Write the file in reasonable size chunks.  Each chunk is written
1004	 * in a separate transaction; this keeps the intent log records small
1005	 * and allows us to do more fine-grained space accounting.
1006	 */
1007	while (n > 0) {
1008		abuf = NULL;
1009		woff = uio->uio_loffset;
1010		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1011		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1012			if (abuf != NULL)
1013				dmu_return_arcbuf(abuf);
1014			error = SET_ERROR(EDQUOT);
1015			break;
1016		}
1017
1018		if (xuio && abuf == NULL) {
1019			ASSERT(i_iov < iovcnt);
1020			aiov = &iovp[i_iov];
1021			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1022			dmu_xuio_clear(xuio, i_iov);
1023			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1024			    iovec_t *, aiov, arc_buf_t *, abuf);
1025			ASSERT((aiov->iov_base == abuf->b_data) ||
1026			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1027			    aiov->iov_len == arc_buf_size(abuf)));
1028			i_iov++;
1029		} else if (abuf == NULL && n >= max_blksz &&
1030		    woff >= zp->z_size &&
1031		    P2PHASE(woff, max_blksz) == 0 &&
1032		    zp->z_blksz == max_blksz) {
1033			/*
1034			 * This write covers a full block.  "Borrow" a buffer
1035			 * from the dmu so that we can fill it before we enter
1036			 * a transaction.  This avoids the possibility of
1037			 * holding up the transaction if the data copy hangs
1038			 * up on a pagefault (e.g., from an NFS server mapping).
1039			 */
1040#ifdef illumos
1041			size_t cbytes;
1042#endif
1043
1044			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1045			    max_blksz);
1046			ASSERT(abuf != NULL);
1047			ASSERT(arc_buf_size(abuf) == max_blksz);
1048#ifdef illumos
1049			if (error = uiocopy(abuf->b_data, max_blksz,
1050			    UIO_WRITE, uio, &cbytes)) {
1051				dmu_return_arcbuf(abuf);
1052				break;
1053			}
1054			ASSERT(cbytes == max_blksz);
1055#else
1056			ssize_t resid = uio->uio_resid;
1057			error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio);
1058			if (error != 0) {
1059				uio->uio_offset -= resid - uio->uio_resid;
1060				uio->uio_resid = resid;
1061				dmu_return_arcbuf(abuf);
1062				break;
1063			}
1064#endif
1065		}
1066
1067		/*
1068		 * Start a transaction.
1069		 */
1070		tx = dmu_tx_create(zfsvfs->z_os);
1071		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1072		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1073		zfs_sa_upgrade_txholds(tx, zp);
1074		error = dmu_tx_assign(tx, TXG_WAIT);
1075		if (error) {
1076			dmu_tx_abort(tx);
1077			if (abuf != NULL)
1078				dmu_return_arcbuf(abuf);
1079			break;
1080		}
1081
1082		/*
1083		 * If zfs_range_lock() over-locked we grow the blocksize
1084		 * and then reduce the lock range.  This will only happen
1085		 * on the first iteration since zfs_range_reduce() will
1086		 * shrink down r_len to the appropriate size.
1087		 */
1088		if (rl->r_len == UINT64_MAX) {
1089			uint64_t new_blksz;
1090
1091			if (zp->z_blksz > max_blksz) {
1092				/*
1093				 * File's blocksize is already larger than the
1094				 * "recordsize" property.  Only let it grow to
1095				 * the next power of 2.
1096				 */
1097				ASSERT(!ISP2(zp->z_blksz));
1098				new_blksz = MIN(end_size,
1099				    1 << highbit64(zp->z_blksz));
1100			} else {
1101				new_blksz = MIN(end_size, max_blksz);
1102			}
1103			zfs_grow_blocksize(zp, new_blksz, tx);
1104			zfs_range_reduce(rl, woff, n);
1105		}
1106
1107		/*
1108		 * XXX - should we really limit each write to z_max_blksz?
1109		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1110		 */
1111		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1112
1113		if (woff + nbytes > zp->z_size)
1114			vnode_pager_setsize(vp, woff + nbytes);
1115
1116		if (abuf == NULL) {
1117			tx_bytes = uio->uio_resid;
1118			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1119			    uio, nbytes, tx);
1120			tx_bytes -= uio->uio_resid;
1121		} else {
1122			tx_bytes = nbytes;
1123			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1124			/*
1125			 * If this is not a full block write, but we are
1126			 * extending the file past EOF and this data starts
1127			 * block-aligned, use assign_arcbuf().  Otherwise,
1128			 * write via dmu_write().
1129			 */
1130			if (tx_bytes < max_blksz && (!write_eof ||
1131			    aiov->iov_base != abuf->b_data)) {
1132				ASSERT(xuio);
1133				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1134				    aiov->iov_len, aiov->iov_base, tx);
1135				dmu_return_arcbuf(abuf);
1136				xuio_stat_wbuf_copied();
1137			} else {
1138				ASSERT(xuio || tx_bytes == max_blksz);
1139				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1140				    woff, abuf, tx);
1141			}
1142#ifdef illumos
1143			ASSERT(tx_bytes <= uio->uio_resid);
1144			uioskip(uio, tx_bytes);
1145#endif
1146		}
1147		if (tx_bytes && vn_has_cached_data(vp)) {
1148			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1149			    zp->z_id, uio->uio_segflg, tx);
1150		}
1151
1152		/*
1153		 * If we made no progress, we're done.  If we made even
1154		 * partial progress, update the znode and ZIL accordingly.
1155		 */
1156		if (tx_bytes == 0) {
1157			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1158			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1159			dmu_tx_commit(tx);
1160			ASSERT(error != 0);
1161			break;
1162		}
1163
1164		/*
1165		 * Clear Set-UID/Set-GID bits on successful write if not
1166		 * privileged and at least one of the execute bits is set.
1167		 *
1168		 * It would be nice to do this after all writes have
1169		 * been done, but that would still expose the ISUID/ISGID
1170		 * to another app after the partial write is committed.
1171		 *
1172		 * Note: we don't call zfs_fuid_map_id() here because
1173		 * user 0 is not an ephemeral uid.
1174		 */
1175		mutex_enter(&zp->z_acl_lock);
1176		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1177		    (S_IXUSR >> 6))) != 0 &&
1178		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1179		    secpolicy_vnode_setid_retain(vp, cr,
1180		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1181			uint64_t newmode;
1182			zp->z_mode &= ~(S_ISUID | S_ISGID);
1183			newmode = zp->z_mode;
1184			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1185			    (void *)&newmode, sizeof (uint64_t), tx);
1186		}
1187		mutex_exit(&zp->z_acl_lock);
1188
1189		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1190		    B_TRUE);
1191
1192		/*
1193		 * Update the file size (zp_size) if it has changed;
1194		 * account for possible concurrent updates.
1195		 */
1196		while ((end_size = zp->z_size) < uio->uio_loffset) {
1197			(void) atomic_cas_64(&zp->z_size, end_size,
1198			    uio->uio_loffset);
1199#ifdef illumos
1200			ASSERT(error == 0);
1201#else
1202			ASSERT(error == 0 || error == EFAULT);
1203#endif
1204		}
1205		/*
1206		 * If we are replaying and eof is non zero then force
1207		 * the file size to the specified eof. Note, there's no
1208		 * concurrency during replay.
1209		 */
1210		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1211			zp->z_size = zfsvfs->z_replay_eof;
1212
1213		if (error == 0)
1214			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1215		else
1216			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1217
1218		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1219		dmu_tx_commit(tx);
1220
1221		if (error != 0)
1222			break;
1223		ASSERT(tx_bytes == nbytes);
1224		n -= nbytes;
1225
1226#ifdef illumos
1227		if (!xuio && n > 0)
1228			uio_prefaultpages(MIN(n, max_blksz), uio);
1229#endif
1230	}
1231
1232	zfs_range_unlock(rl);
1233
1234	/*
1235	 * If we're in replay mode, or we made no progress, return error.
1236	 * Otherwise, it's at least a partial write, so it's successful.
1237	 */
1238	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1239		ZFS_EXIT(zfsvfs);
1240		return (error);
1241	}
1242
1243#ifdef __FreeBSD__
1244	/*
1245	 * EFAULT means that at least one page of the source buffer was not
1246	 * available.  VFS will re-try remaining I/O upon this error.
1247	 */
1248	if (error == EFAULT) {
1249		ZFS_EXIT(zfsvfs);
1250		return (error);
1251	}
1252#endif
1253
1254	if (ioflag & (FSYNC | FDSYNC) ||
1255	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1256		zil_commit(zilog, zp->z_id);
1257
1258	ZFS_EXIT(zfsvfs);
1259	return (0);
1260}
1261
1262void
1263zfs_get_done(zgd_t *zgd, int error)
1264{
1265	znode_t *zp = zgd->zgd_private;
1266	objset_t *os = zp->z_zfsvfs->z_os;
1267
1268	if (zgd->zgd_db)
1269		dmu_buf_rele(zgd->zgd_db, zgd);
1270
1271	zfs_range_unlock(zgd->zgd_rl);
1272
1273	/*
1274	 * Release the vnode asynchronously as we currently have the
1275	 * txg stopped from syncing.
1276	 */
1277	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1278
1279	if (error == 0 && zgd->zgd_bp)
1280		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1281
1282	kmem_free(zgd, sizeof (zgd_t));
1283}
1284
1285#ifdef DEBUG
1286static int zil_fault_io = 0;
1287#endif
1288
1289/*
1290 * Get data to generate a TX_WRITE intent log record.
1291 */
1292int
1293zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1294{
1295	zfsvfs_t *zfsvfs = arg;
1296	objset_t *os = zfsvfs->z_os;
1297	znode_t *zp;
1298	uint64_t object = lr->lr_foid;
1299	uint64_t offset = lr->lr_offset;
1300	uint64_t size = lr->lr_length;
1301	blkptr_t *bp = &lr->lr_blkptr;
1302	dmu_buf_t *db;
1303	zgd_t *zgd;
1304	int error = 0;
1305
1306	ASSERT(zio != NULL);
1307	ASSERT(size != 0);
1308
1309	/*
1310	 * Nothing to do if the file has been removed
1311	 */
1312	if (zfs_zget(zfsvfs, object, &zp) != 0)
1313		return (SET_ERROR(ENOENT));
1314	if (zp->z_unlinked) {
1315		/*
1316		 * Release the vnode asynchronously as we currently have the
1317		 * txg stopped from syncing.
1318		 */
1319		VN_RELE_ASYNC(ZTOV(zp),
1320		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1321		return (SET_ERROR(ENOENT));
1322	}
1323
1324	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1325	zgd->zgd_zilog = zfsvfs->z_log;
1326	zgd->zgd_private = zp;
1327
1328	/*
1329	 * Write records come in two flavors: immediate and indirect.
1330	 * For small writes it's cheaper to store the data with the
1331	 * log record (immediate); for large writes it's cheaper to
1332	 * sync the data and get a pointer to it (indirect) so that
1333	 * we don't have to write the data twice.
1334	 */
1335	if (buf != NULL) { /* immediate write */
1336		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1337		/* test for truncation needs to be done while range locked */
1338		if (offset >= zp->z_size) {
1339			error = SET_ERROR(ENOENT);
1340		} else {
1341			error = dmu_read(os, object, offset, size, buf,
1342			    DMU_READ_NO_PREFETCH);
1343		}
1344		ASSERT(error == 0 || error == ENOENT);
1345	} else { /* indirect write */
1346		/*
1347		 * Have to lock the whole block to ensure when it's
1348		 * written out and it's checksum is being calculated
1349		 * that no one can change the data. We need to re-check
1350		 * blocksize after we get the lock in case it's changed!
1351		 */
1352		for (;;) {
1353			uint64_t blkoff;
1354			size = zp->z_blksz;
1355			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1356			offset -= blkoff;
1357			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1358			    RL_READER);
1359			if (zp->z_blksz == size)
1360				break;
1361			offset += blkoff;
1362			zfs_range_unlock(zgd->zgd_rl);
1363		}
1364		/* test for truncation needs to be done while range locked */
1365		if (lr->lr_offset >= zp->z_size)
1366			error = SET_ERROR(ENOENT);
1367#ifdef DEBUG
1368		if (zil_fault_io) {
1369			error = SET_ERROR(EIO);
1370			zil_fault_io = 0;
1371		}
1372#endif
1373		if (error == 0)
1374			error = dmu_buf_hold(os, object, offset, zgd, &db,
1375			    DMU_READ_NO_PREFETCH);
1376
1377		if (error == 0) {
1378			blkptr_t *obp = dmu_buf_get_blkptr(db);
1379			if (obp) {
1380				ASSERT(BP_IS_HOLE(bp));
1381				*bp = *obp;
1382			}
1383
1384			zgd->zgd_db = db;
1385			zgd->zgd_bp = bp;
1386
1387			ASSERT(db->db_offset == offset);
1388			ASSERT(db->db_size == size);
1389
1390			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1391			    zfs_get_done, zgd);
1392			ASSERT(error || lr->lr_length <= zp->z_blksz);
1393
1394			/*
1395			 * On success, we need to wait for the write I/O
1396			 * initiated by dmu_sync() to complete before we can
1397			 * release this dbuf.  We will finish everything up
1398			 * in the zfs_get_done() callback.
1399			 */
1400			if (error == 0)
1401				return (0);
1402
1403			if (error == EALREADY) {
1404				lr->lr_common.lrc_txtype = TX_WRITE2;
1405				error = 0;
1406			}
1407		}
1408	}
1409
1410	zfs_get_done(zgd, error);
1411
1412	return (error);
1413}
1414
1415/*ARGSUSED*/
1416static int
1417zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1418    caller_context_t *ct)
1419{
1420	znode_t *zp = VTOZ(vp);
1421	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1422	int error;
1423
1424	ZFS_ENTER(zfsvfs);
1425	ZFS_VERIFY_ZP(zp);
1426
1427	if (flag & V_ACE_MASK)
1428		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1429	else
1430		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1431
1432	ZFS_EXIT(zfsvfs);
1433	return (error);
1434}
1435
1436/*
1437 * If vnode is for a device return a specfs vnode instead.
1438 */
1439static int
1440specvp_check(vnode_t **vpp, cred_t *cr)
1441{
1442	int error = 0;
1443
1444	if (IS_DEVVP(*vpp)) {
1445		struct vnode *svp;
1446
1447		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1448		VN_RELE(*vpp);
1449		if (svp == NULL)
1450			error = SET_ERROR(ENOSYS);
1451		*vpp = svp;
1452	}
1453	return (error);
1454}
1455
1456
1457/*
1458 * Lookup an entry in a directory, or an extended attribute directory.
1459 * If it exists, return a held vnode reference for it.
1460 *
1461 *	IN:	dvp	- vnode of directory to search.
1462 *		nm	- name of entry to lookup.
1463 *		pnp	- full pathname to lookup [UNUSED].
1464 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1465 *		rdir	- root directory vnode [UNUSED].
1466 *		cr	- credentials of caller.
1467 *		ct	- caller context
1468 *		direntflags - directory lookup flags
1469 *		realpnp - returned pathname.
1470 *
1471 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1472 *
1473 *	RETURN:	0 on success, error code on failure.
1474 *
1475 * Timestamps:
1476 *	NA
1477 */
1478/* ARGSUSED */
1479static int
1480zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1481    int nameiop, cred_t *cr, kthread_t *td, int flags)
1482{
1483	znode_t *zdp = VTOZ(dvp);
1484	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1485	int	error = 0;
1486	int *direntflags = NULL;
1487	void *realpnp = NULL;
1488
1489	/* fast path */
1490	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1491
1492		if (dvp->v_type != VDIR) {
1493			return (SET_ERROR(ENOTDIR));
1494		} else if (zdp->z_sa_hdl == NULL) {
1495			return (SET_ERROR(EIO));
1496		}
1497
1498		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1499			error = zfs_fastaccesschk_execute(zdp, cr);
1500			if (!error) {
1501				*vpp = dvp;
1502				VN_HOLD(*vpp);
1503				return (0);
1504			}
1505			return (error);
1506		} else {
1507			vnode_t *tvp = dnlc_lookup(dvp, nm);
1508
1509			if (tvp) {
1510				error = zfs_fastaccesschk_execute(zdp, cr);
1511				if (error) {
1512					VN_RELE(tvp);
1513					return (error);
1514				}
1515				if (tvp == DNLC_NO_VNODE) {
1516					VN_RELE(tvp);
1517					return (SET_ERROR(ENOENT));
1518				} else {
1519					*vpp = tvp;
1520					return (specvp_check(vpp, cr));
1521				}
1522			}
1523		}
1524	}
1525
1526	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1527
1528	ZFS_ENTER(zfsvfs);
1529	ZFS_VERIFY_ZP(zdp);
1530
1531	*vpp = NULL;
1532
1533	if (flags & LOOKUP_XATTR) {
1534#ifdef TODO
1535		/*
1536		 * If the xattr property is off, refuse the lookup request.
1537		 */
1538		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1539			ZFS_EXIT(zfsvfs);
1540			return (SET_ERROR(EINVAL));
1541		}
1542#endif
1543
1544		/*
1545		 * We don't allow recursive attributes..
1546		 * Maybe someday we will.
1547		 */
1548		if (zdp->z_pflags & ZFS_XATTR) {
1549			ZFS_EXIT(zfsvfs);
1550			return (SET_ERROR(EINVAL));
1551		}
1552
1553		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1554			ZFS_EXIT(zfsvfs);
1555			return (error);
1556		}
1557
1558		/*
1559		 * Do we have permission to get into attribute directory?
1560		 */
1561
1562		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1563		    B_FALSE, cr)) {
1564			VN_RELE(*vpp);
1565			*vpp = NULL;
1566		}
1567
1568		ZFS_EXIT(zfsvfs);
1569		return (error);
1570	}
1571
1572	if (dvp->v_type != VDIR) {
1573		ZFS_EXIT(zfsvfs);
1574		return (SET_ERROR(ENOTDIR));
1575	}
1576
1577	/*
1578	 * Check accessibility of directory.
1579	 */
1580
1581	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1582		ZFS_EXIT(zfsvfs);
1583		return (error);
1584	}
1585
1586	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1587	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1588		ZFS_EXIT(zfsvfs);
1589		return (SET_ERROR(EILSEQ));
1590	}
1591
1592	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1593	if (error == 0)
1594		error = specvp_check(vpp, cr);
1595
1596	/* Translate errors and add SAVENAME when needed. */
1597	if (cnp->cn_flags & ISLASTCN) {
1598		switch (nameiop) {
1599		case CREATE:
1600		case RENAME:
1601			if (error == ENOENT) {
1602				error = EJUSTRETURN;
1603				cnp->cn_flags |= SAVENAME;
1604				break;
1605			}
1606			/* FALLTHROUGH */
1607		case DELETE:
1608			if (error == 0)
1609				cnp->cn_flags |= SAVENAME;
1610			break;
1611		}
1612	}
1613	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1614		int ltype = 0;
1615
1616		if (cnp->cn_flags & ISDOTDOT) {
1617			ltype = VOP_ISLOCKED(dvp);
1618			VOP_UNLOCK(dvp, 0);
1619		}
1620		ZFS_EXIT(zfsvfs);
1621		error = vn_lock(*vpp, cnp->cn_lkflags);
1622		if (cnp->cn_flags & ISDOTDOT)
1623			vn_lock(dvp, ltype | LK_RETRY);
1624		if (error != 0) {
1625			VN_RELE(*vpp);
1626			*vpp = NULL;
1627			return (error);
1628		}
1629	} else {
1630		ZFS_EXIT(zfsvfs);
1631	}
1632
1633#ifdef FREEBSD_NAMECACHE
1634	/*
1635	 * Insert name into cache (as non-existent) if appropriate.
1636	 */
1637	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1638		cache_enter(dvp, *vpp, cnp);
1639	/*
1640	 * Insert name into cache if appropriate.
1641	 */
1642	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1643		if (!(cnp->cn_flags & ISLASTCN) ||
1644		    (nameiop != DELETE && nameiop != RENAME)) {
1645			cache_enter(dvp, *vpp, cnp);
1646		}
1647	}
1648#endif
1649
1650	return (error);
1651}
1652
1653/*
1654 * Attempt to create a new entry in a directory.  If the entry
1655 * already exists, truncate the file if permissible, else return
1656 * an error.  Return the vp of the created or trunc'd file.
1657 *
1658 *	IN:	dvp	- vnode of directory to put new file entry in.
1659 *		name	- name of new file entry.
1660 *		vap	- attributes of new file.
1661 *		excl	- flag indicating exclusive or non-exclusive mode.
1662 *		mode	- mode to open file with.
1663 *		cr	- credentials of caller.
1664 *		flag	- large file flag [UNUSED].
1665 *		ct	- caller context
1666 *		vsecp	- ACL to be set
1667 *
1668 *	OUT:	vpp	- vnode of created or trunc'd entry.
1669 *
1670 *	RETURN:	0 on success, error code on failure.
1671 *
1672 * Timestamps:
1673 *	dvp - ctime|mtime updated if new entry created
1674 *	 vp - ctime|mtime always, atime if new
1675 */
1676
1677/* ARGSUSED */
1678static int
1679zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1680    vnode_t **vpp, cred_t *cr, kthread_t *td)
1681{
1682	znode_t		*zp, *dzp = VTOZ(dvp);
1683	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1684	zilog_t		*zilog;
1685	objset_t	*os;
1686	zfs_dirlock_t	*dl;
1687	dmu_tx_t	*tx;
1688	int		error;
1689	ksid_t		*ksid;
1690	uid_t		uid;
1691	gid_t		gid = crgetgid(cr);
1692	zfs_acl_ids_t   acl_ids;
1693	boolean_t	fuid_dirtied;
1694	boolean_t	have_acl = B_FALSE;
1695	boolean_t	waited = B_FALSE;
1696	void		*vsecp = NULL;
1697	int		flag = 0;
1698
1699	/*
1700	 * If we have an ephemeral id, ACL, or XVATTR then
1701	 * make sure file system is at proper version
1702	 */
1703
1704	ksid = crgetsid(cr, KSID_OWNER);
1705	if (ksid)
1706		uid = ksid_getid(ksid);
1707	else
1708		uid = crgetuid(cr);
1709
1710	if (zfsvfs->z_use_fuids == B_FALSE &&
1711	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1712	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1713		return (SET_ERROR(EINVAL));
1714
1715	ZFS_ENTER(zfsvfs);
1716	ZFS_VERIFY_ZP(dzp);
1717	os = zfsvfs->z_os;
1718	zilog = zfsvfs->z_log;
1719
1720	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1721	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1722		ZFS_EXIT(zfsvfs);
1723		return (SET_ERROR(EILSEQ));
1724	}
1725
1726	if (vap->va_mask & AT_XVATTR) {
1727		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1728		    crgetuid(cr), cr, vap->va_type)) != 0) {
1729			ZFS_EXIT(zfsvfs);
1730			return (error);
1731		}
1732	}
1733
1734	getnewvnode_reserve(1);
1735
1736top:
1737	*vpp = NULL;
1738
1739	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1740		vap->va_mode &= ~S_ISVTX;
1741
1742	if (*name == '\0') {
1743		/*
1744		 * Null component name refers to the directory itself.
1745		 */
1746		VN_HOLD(dvp);
1747		zp = dzp;
1748		dl = NULL;
1749		error = 0;
1750	} else {
1751		/* possible VN_HOLD(zp) */
1752		int zflg = 0;
1753
1754		if (flag & FIGNORECASE)
1755			zflg |= ZCILOOK;
1756
1757		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1758		    NULL, NULL);
1759		if (error) {
1760			if (have_acl)
1761				zfs_acl_ids_free(&acl_ids);
1762			if (strcmp(name, "..") == 0)
1763				error = SET_ERROR(EISDIR);
1764			getnewvnode_drop_reserve();
1765			ZFS_EXIT(zfsvfs);
1766			return (error);
1767		}
1768	}
1769
1770	if (zp == NULL) {
1771		uint64_t txtype;
1772
1773		/*
1774		 * Create a new file object and update the directory
1775		 * to reference it.
1776		 */
1777		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1778			if (have_acl)
1779				zfs_acl_ids_free(&acl_ids);
1780			goto out;
1781		}
1782
1783		/*
1784		 * We only support the creation of regular files in
1785		 * extended attribute directories.
1786		 */
1787
1788		if ((dzp->z_pflags & ZFS_XATTR) &&
1789		    (vap->va_type != VREG)) {
1790			if (have_acl)
1791				zfs_acl_ids_free(&acl_ids);
1792			error = SET_ERROR(EINVAL);
1793			goto out;
1794		}
1795
1796		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1797		    cr, vsecp, &acl_ids)) != 0)
1798			goto out;
1799		have_acl = B_TRUE;
1800
1801		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1802			zfs_acl_ids_free(&acl_ids);
1803			error = SET_ERROR(EDQUOT);
1804			goto out;
1805		}
1806
1807		tx = dmu_tx_create(os);
1808
1809		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1810		    ZFS_SA_BASE_ATTR_SIZE);
1811
1812		fuid_dirtied = zfsvfs->z_fuid_dirty;
1813		if (fuid_dirtied)
1814			zfs_fuid_txhold(zfsvfs, tx);
1815		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1816		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1817		if (!zfsvfs->z_use_sa &&
1818		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1819			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1820			    0, acl_ids.z_aclp->z_acl_bytes);
1821		}
1822		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1823		if (error) {
1824			zfs_dirent_unlock(dl);
1825			if (error == ERESTART) {
1826				waited = B_TRUE;
1827				dmu_tx_wait(tx);
1828				dmu_tx_abort(tx);
1829				goto top;
1830			}
1831			zfs_acl_ids_free(&acl_ids);
1832			dmu_tx_abort(tx);
1833			getnewvnode_drop_reserve();
1834			ZFS_EXIT(zfsvfs);
1835			return (error);
1836		}
1837		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1838
1839		if (fuid_dirtied)
1840			zfs_fuid_sync(zfsvfs, tx);
1841
1842		(void) zfs_link_create(dl, zp, tx, ZNEW);
1843		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1844		if (flag & FIGNORECASE)
1845			txtype |= TX_CI;
1846		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1847		    vsecp, acl_ids.z_fuidp, vap);
1848		zfs_acl_ids_free(&acl_ids);
1849		dmu_tx_commit(tx);
1850	} else {
1851		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1852
1853		if (have_acl)
1854			zfs_acl_ids_free(&acl_ids);
1855		have_acl = B_FALSE;
1856
1857		/*
1858		 * A directory entry already exists for this name.
1859		 */
1860		/*
1861		 * Can't truncate an existing file if in exclusive mode.
1862		 */
1863		if (excl == EXCL) {
1864			error = SET_ERROR(EEXIST);
1865			goto out;
1866		}
1867		/*
1868		 * Can't open a directory for writing.
1869		 */
1870		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1871			error = SET_ERROR(EISDIR);
1872			goto out;
1873		}
1874		/*
1875		 * Verify requested access to file.
1876		 */
1877		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1878			goto out;
1879		}
1880
1881		mutex_enter(&dzp->z_lock);
1882		dzp->z_seq++;
1883		mutex_exit(&dzp->z_lock);
1884
1885		/*
1886		 * Truncate regular files if requested.
1887		 */
1888		if ((ZTOV(zp)->v_type == VREG) &&
1889		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1890			/* we can't hold any locks when calling zfs_freesp() */
1891			zfs_dirent_unlock(dl);
1892			dl = NULL;
1893			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1894			if (error == 0) {
1895				vnevent_create(ZTOV(zp), ct);
1896			}
1897		}
1898	}
1899out:
1900	getnewvnode_drop_reserve();
1901	if (dl)
1902		zfs_dirent_unlock(dl);
1903
1904	if (error) {
1905		if (zp)
1906			VN_RELE(ZTOV(zp));
1907	} else {
1908		*vpp = ZTOV(zp);
1909		error = specvp_check(vpp, cr);
1910	}
1911
1912	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1913		zil_commit(zilog, 0);
1914
1915	ZFS_EXIT(zfsvfs);
1916	return (error);
1917}
1918
1919/*
1920 * Remove an entry from a directory.
1921 *
1922 *	IN:	dvp	- vnode of directory to remove entry from.
1923 *		name	- name of entry to remove.
1924 *		cr	- credentials of caller.
1925 *		ct	- caller context
1926 *		flags	- case flags
1927 *
1928 *	RETURN:	0 on success, error code on failure.
1929 *
1930 * Timestamps:
1931 *	dvp - ctime|mtime
1932 *	 vp - ctime (if nlink > 0)
1933 */
1934
1935uint64_t null_xattr = 0;
1936
1937/*ARGSUSED*/
1938static int
1939zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1940    int flags)
1941{
1942	znode_t		*zp, *dzp = VTOZ(dvp);
1943	znode_t		*xzp;
1944	vnode_t		*vp;
1945	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1946	zilog_t		*zilog;
1947	uint64_t	acl_obj, xattr_obj;
1948	uint64_t	xattr_obj_unlinked = 0;
1949	uint64_t	obj = 0;
1950	zfs_dirlock_t	*dl;
1951	dmu_tx_t	*tx;
1952	boolean_t	may_delete_now, delete_now = FALSE;
1953	boolean_t	unlinked, toobig = FALSE;
1954	uint64_t	txtype;
1955	pathname_t	*realnmp = NULL;
1956	pathname_t	realnm;
1957	int		error;
1958	int		zflg = ZEXISTS;
1959	boolean_t	waited = B_FALSE;
1960
1961	ZFS_ENTER(zfsvfs);
1962	ZFS_VERIFY_ZP(dzp);
1963	zilog = zfsvfs->z_log;
1964
1965	if (flags & FIGNORECASE) {
1966		zflg |= ZCILOOK;
1967		pn_alloc(&realnm);
1968		realnmp = &realnm;
1969	}
1970
1971top:
1972	xattr_obj = 0;
1973	xzp = NULL;
1974	/*
1975	 * Attempt to lock directory; fail if entry doesn't exist.
1976	 */
1977	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1978	    NULL, realnmp)) {
1979		if (realnmp)
1980			pn_free(realnmp);
1981		ZFS_EXIT(zfsvfs);
1982		return (error);
1983	}
1984
1985	vp = ZTOV(zp);
1986
1987	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1988		goto out;
1989	}
1990
1991	/*
1992	 * Need to use rmdir for removing directories.
1993	 */
1994	if (vp->v_type == VDIR) {
1995		error = SET_ERROR(EPERM);
1996		goto out;
1997	}
1998
1999	vnevent_remove(vp, dvp, name, ct);
2000
2001	if (realnmp)
2002		dnlc_remove(dvp, realnmp->pn_buf);
2003	else
2004		dnlc_remove(dvp, name);
2005
2006	VI_LOCK(vp);
2007	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
2008	VI_UNLOCK(vp);
2009
2010	/*
2011	 * We may delete the znode now, or we may put it in the unlinked set;
2012	 * it depends on whether we're the last link, and on whether there are
2013	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
2014	 * allow for either case.
2015	 */
2016	obj = zp->z_id;
2017	tx = dmu_tx_create(zfsvfs->z_os);
2018	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2019	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2020	zfs_sa_upgrade_txholds(tx, zp);
2021	zfs_sa_upgrade_txholds(tx, dzp);
2022	if (may_delete_now) {
2023		toobig =
2024		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
2025		/* if the file is too big, only hold_free a token amount */
2026		dmu_tx_hold_free(tx, zp->z_id, 0,
2027		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
2028	}
2029
2030	/* are there any extended attributes? */
2031	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2032	    &xattr_obj, sizeof (xattr_obj));
2033	if (error == 0 && xattr_obj) {
2034		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
2035		ASSERT0(error);
2036		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2037		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
2038	}
2039
2040	mutex_enter(&zp->z_lock);
2041	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
2042		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
2043	mutex_exit(&zp->z_lock);
2044
2045	/* charge as an update -- would be nice not to charge at all */
2046	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2047
2048	/*
2049	 * Mark this transaction as typically resulting in a net free of space
2050	 */
2051	dmu_tx_mark_netfree(tx);
2052
2053	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2054	if (error) {
2055		zfs_dirent_unlock(dl);
2056		VN_RELE(vp);
2057		if (xzp)
2058			VN_RELE(ZTOV(xzp));
2059		if (error == ERESTART) {
2060			waited = B_TRUE;
2061			dmu_tx_wait(tx);
2062			dmu_tx_abort(tx);
2063			goto top;
2064		}
2065		if (realnmp)
2066			pn_free(realnmp);
2067		dmu_tx_abort(tx);
2068		ZFS_EXIT(zfsvfs);
2069		return (error);
2070	}
2071
2072	/*
2073	 * Remove the directory entry.
2074	 */
2075	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
2076
2077	if (error) {
2078		dmu_tx_commit(tx);
2079		goto out;
2080	}
2081
2082	if (unlinked) {
2083		/*
2084		 * Hold z_lock so that we can make sure that the ACL obj
2085		 * hasn't changed.  Could have been deleted due to
2086		 * zfs_sa_upgrade().
2087		 */
2088		mutex_enter(&zp->z_lock);
2089		VI_LOCK(vp);
2090		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2091		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
2092		delete_now = may_delete_now && !toobig &&
2093		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
2094		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
2095		    acl_obj;
2096		VI_UNLOCK(vp);
2097	}
2098
2099	if (delete_now) {
2100#ifdef __FreeBSD__
2101		panic("zfs_remove: delete_now branch taken");
2102#endif
2103		if (xattr_obj_unlinked) {
2104			ASSERT3U(xzp->z_links, ==, 2);
2105			mutex_enter(&xzp->z_lock);
2106			xzp->z_unlinked = 1;
2107			xzp->z_links = 0;
2108			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
2109			    &xzp->z_links, sizeof (xzp->z_links), tx);
2110			ASSERT3U(error,  ==,  0);
2111			mutex_exit(&xzp->z_lock);
2112			zfs_unlinked_add(xzp, tx);
2113
2114			if (zp->z_is_sa)
2115				error = sa_remove(zp->z_sa_hdl,
2116				    SA_ZPL_XATTR(zfsvfs), tx);
2117			else
2118				error = sa_update(zp->z_sa_hdl,
2119				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
2120				    sizeof (uint64_t), tx);
2121			ASSERT0(error);
2122		}
2123		VI_LOCK(vp);
2124		vp->v_count--;
2125		ASSERT0(vp->v_count);
2126		VI_UNLOCK(vp);
2127		mutex_exit(&zp->z_lock);
2128		zfs_znode_delete(zp, tx);
2129	} else if (unlinked) {
2130		mutex_exit(&zp->z_lock);
2131		zfs_unlinked_add(zp, tx);
2132#ifdef __FreeBSD__
2133		vp->v_vflag |= VV_NOSYNC;
2134#endif
2135	}
2136
2137	txtype = TX_REMOVE;
2138	if (flags & FIGNORECASE)
2139		txtype |= TX_CI;
2140	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2141
2142	dmu_tx_commit(tx);
2143out:
2144	if (realnmp)
2145		pn_free(realnmp);
2146
2147	zfs_dirent_unlock(dl);
2148
2149	if (!delete_now)
2150		VN_RELE(vp);
2151	if (xzp)
2152		VN_RELE(ZTOV(xzp));
2153
2154	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2155		zil_commit(zilog, 0);
2156
2157	ZFS_EXIT(zfsvfs);
2158	return (error);
2159}
2160
2161/*
2162 * Create a new directory and insert it into dvp using the name
2163 * provided.  Return a pointer to the inserted directory.
2164 *
2165 *	IN:	dvp	- vnode of directory to add subdir to.
2166 *		dirname	- name of new directory.
2167 *		vap	- attributes of new directory.
2168 *		cr	- credentials of caller.
2169 *		ct	- caller context
2170 *		flags	- case flags
2171 *		vsecp	- ACL to be set
2172 *
2173 *	OUT:	vpp	- vnode of created directory.
2174 *
2175 *	RETURN:	0 on success, error code on failure.
2176 *
2177 * Timestamps:
2178 *	dvp - ctime|mtime updated
2179 *	 vp - ctime|mtime|atime updated
2180 */
2181/*ARGSUSED*/
2182static int
2183zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
2184    caller_context_t *ct, int flags, vsecattr_t *vsecp)
2185{
2186	znode_t		*zp, *dzp = VTOZ(dvp);
2187	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2188	zilog_t		*zilog;
2189	zfs_dirlock_t	*dl;
2190	uint64_t	txtype;
2191	dmu_tx_t	*tx;
2192	int		error;
2193	int		zf = ZNEW;
2194	ksid_t		*ksid;
2195	uid_t		uid;
2196	gid_t		gid = crgetgid(cr);
2197	zfs_acl_ids_t   acl_ids;
2198	boolean_t	fuid_dirtied;
2199	boolean_t	waited = B_FALSE;
2200
2201	ASSERT(vap->va_type == VDIR);
2202
2203	/*
2204	 * If we have an ephemeral id, ACL, or XVATTR then
2205	 * make sure file system is at proper version
2206	 */
2207
2208	ksid = crgetsid(cr, KSID_OWNER);
2209	if (ksid)
2210		uid = ksid_getid(ksid);
2211	else
2212		uid = crgetuid(cr);
2213	if (zfsvfs->z_use_fuids == B_FALSE &&
2214	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2215	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2216		return (SET_ERROR(EINVAL));
2217
2218	ZFS_ENTER(zfsvfs);
2219	ZFS_VERIFY_ZP(dzp);
2220	zilog = zfsvfs->z_log;
2221
2222	if (dzp->z_pflags & ZFS_XATTR) {
2223		ZFS_EXIT(zfsvfs);
2224		return (SET_ERROR(EINVAL));
2225	}
2226
2227	if (zfsvfs->z_utf8 && u8_validate(dirname,
2228	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2229		ZFS_EXIT(zfsvfs);
2230		return (SET_ERROR(EILSEQ));
2231	}
2232	if (flags & FIGNORECASE)
2233		zf |= ZCILOOK;
2234
2235	if (vap->va_mask & AT_XVATTR) {
2236		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2237		    crgetuid(cr), cr, vap->va_type)) != 0) {
2238			ZFS_EXIT(zfsvfs);
2239			return (error);
2240		}
2241	}
2242
2243	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2244	    vsecp, &acl_ids)) != 0) {
2245		ZFS_EXIT(zfsvfs);
2246		return (error);
2247	}
2248
2249	getnewvnode_reserve(1);
2250
2251	/*
2252	 * First make sure the new directory doesn't exist.
2253	 *
2254	 * Existence is checked first to make sure we don't return
2255	 * EACCES instead of EEXIST which can cause some applications
2256	 * to fail.
2257	 */
2258top:
2259	*vpp = NULL;
2260
2261	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2262	    NULL, NULL)) {
2263		zfs_acl_ids_free(&acl_ids);
2264		getnewvnode_drop_reserve();
2265		ZFS_EXIT(zfsvfs);
2266		return (error);
2267	}
2268
2269	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2270		zfs_acl_ids_free(&acl_ids);
2271		zfs_dirent_unlock(dl);
2272		getnewvnode_drop_reserve();
2273		ZFS_EXIT(zfsvfs);
2274		return (error);
2275	}
2276
2277	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2278		zfs_acl_ids_free(&acl_ids);
2279		zfs_dirent_unlock(dl);
2280		getnewvnode_drop_reserve();
2281		ZFS_EXIT(zfsvfs);
2282		return (SET_ERROR(EDQUOT));
2283	}
2284
2285	/*
2286	 * Add a new entry to the directory.
2287	 */
2288	tx = dmu_tx_create(zfsvfs->z_os);
2289	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2290	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2291	fuid_dirtied = zfsvfs->z_fuid_dirty;
2292	if (fuid_dirtied)
2293		zfs_fuid_txhold(zfsvfs, tx);
2294	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2295		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2296		    acl_ids.z_aclp->z_acl_bytes);
2297	}
2298
2299	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2300	    ZFS_SA_BASE_ATTR_SIZE);
2301
2302	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2303	if (error) {
2304		zfs_dirent_unlock(dl);
2305		if (error == ERESTART) {
2306			waited = B_TRUE;
2307			dmu_tx_wait(tx);
2308			dmu_tx_abort(tx);
2309			goto top;
2310		}
2311		zfs_acl_ids_free(&acl_ids);
2312		dmu_tx_abort(tx);
2313		getnewvnode_drop_reserve();
2314		ZFS_EXIT(zfsvfs);
2315		return (error);
2316	}
2317
2318	/*
2319	 * Create new node.
2320	 */
2321	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2322
2323	if (fuid_dirtied)
2324		zfs_fuid_sync(zfsvfs, tx);
2325
2326	/*
2327	 * Now put new name in parent dir.
2328	 */
2329	(void) zfs_link_create(dl, zp, tx, ZNEW);
2330
2331	*vpp = ZTOV(zp);
2332
2333	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2334	if (flags & FIGNORECASE)
2335		txtype |= TX_CI;
2336	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2337	    acl_ids.z_fuidp, vap);
2338
2339	zfs_acl_ids_free(&acl_ids);
2340
2341	dmu_tx_commit(tx);
2342
2343	getnewvnode_drop_reserve();
2344
2345	zfs_dirent_unlock(dl);
2346
2347	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2348		zil_commit(zilog, 0);
2349
2350	ZFS_EXIT(zfsvfs);
2351	return (0);
2352}
2353
2354/*
2355 * Remove a directory subdir entry.  If the current working
2356 * directory is the same as the subdir to be removed, the
2357 * remove will fail.
2358 *
2359 *	IN:	dvp	- vnode of directory to remove from.
2360 *		name	- name of directory to be removed.
2361 *		cwd	- vnode of current working directory.
2362 *		cr	- credentials of caller.
2363 *		ct	- caller context
2364 *		flags	- case flags
2365 *
2366 *	RETURN:	0 on success, error code on failure.
2367 *
2368 * Timestamps:
2369 *	dvp - ctime|mtime updated
2370 */
2371/*ARGSUSED*/
2372static int
2373zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2374    caller_context_t *ct, int flags)
2375{
2376	znode_t		*dzp = VTOZ(dvp);
2377	znode_t		*zp;
2378	vnode_t		*vp;
2379	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2380	zilog_t		*zilog;
2381	zfs_dirlock_t	*dl;
2382	dmu_tx_t	*tx;
2383	int		error;
2384	int		zflg = ZEXISTS;
2385	boolean_t	waited = B_FALSE;
2386
2387	ZFS_ENTER(zfsvfs);
2388	ZFS_VERIFY_ZP(dzp);
2389	zilog = zfsvfs->z_log;
2390
2391	if (flags & FIGNORECASE)
2392		zflg |= ZCILOOK;
2393top:
2394	zp = NULL;
2395
2396	/*
2397	 * Attempt to lock directory; fail if entry doesn't exist.
2398	 */
2399	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2400	    NULL, NULL)) {
2401		ZFS_EXIT(zfsvfs);
2402		return (error);
2403	}
2404
2405	vp = ZTOV(zp);
2406
2407	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2408		goto out;
2409	}
2410
2411	if (vp->v_type != VDIR) {
2412		error = SET_ERROR(ENOTDIR);
2413		goto out;
2414	}
2415
2416	if (vp == cwd) {
2417		error = SET_ERROR(EINVAL);
2418		goto out;
2419	}
2420
2421	vnevent_rmdir(vp, dvp, name, ct);
2422
2423	/*
2424	 * Grab a lock on the directory to make sure that noone is
2425	 * trying to add (or lookup) entries while we are removing it.
2426	 */
2427	rw_enter(&zp->z_name_lock, RW_WRITER);
2428
2429	/*
2430	 * Grab a lock on the parent pointer to make sure we play well
2431	 * with the treewalk and directory rename code.
2432	 */
2433	rw_enter(&zp->z_parent_lock, RW_WRITER);
2434
2435	tx = dmu_tx_create(zfsvfs->z_os);
2436	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2437	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2438	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2439	zfs_sa_upgrade_txholds(tx, zp);
2440	zfs_sa_upgrade_txholds(tx, dzp);
2441	dmu_tx_mark_netfree(tx);
2442	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2443	if (error) {
2444		rw_exit(&zp->z_parent_lock);
2445		rw_exit(&zp->z_name_lock);
2446		zfs_dirent_unlock(dl);
2447		VN_RELE(vp);
2448		if (error == ERESTART) {
2449			waited = B_TRUE;
2450			dmu_tx_wait(tx);
2451			dmu_tx_abort(tx);
2452			goto top;
2453		}
2454		dmu_tx_abort(tx);
2455		ZFS_EXIT(zfsvfs);
2456		return (error);
2457	}
2458
2459#ifdef FREEBSD_NAMECACHE
2460	cache_purge(dvp);
2461#endif
2462
2463	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2464
2465	if (error == 0) {
2466		uint64_t txtype = TX_RMDIR;
2467		if (flags & FIGNORECASE)
2468			txtype |= TX_CI;
2469		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2470	}
2471
2472	dmu_tx_commit(tx);
2473
2474	rw_exit(&zp->z_parent_lock);
2475	rw_exit(&zp->z_name_lock);
2476#ifdef FREEBSD_NAMECACHE
2477	cache_purge(vp);
2478#endif
2479out:
2480	zfs_dirent_unlock(dl);
2481
2482	VN_RELE(vp);
2483
2484	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2485		zil_commit(zilog, 0);
2486
2487	ZFS_EXIT(zfsvfs);
2488	return (error);
2489}
2490
2491/*
2492 * Read as many directory entries as will fit into the provided
2493 * buffer from the given directory cursor position (specified in
2494 * the uio structure).
2495 *
2496 *	IN:	vp	- vnode of directory to read.
2497 *		uio	- structure supplying read location, range info,
2498 *			  and return buffer.
2499 *		cr	- credentials of caller.
2500 *		ct	- caller context
2501 *		flags	- case flags
2502 *
2503 *	OUT:	uio	- updated offset and range, buffer filled.
2504 *		eofp	- set to true if end-of-file detected.
2505 *
2506 *	RETURN:	0 on success, error code on failure.
2507 *
2508 * Timestamps:
2509 *	vp - atime updated
2510 *
2511 * Note that the low 4 bits of the cookie returned by zap is always zero.
2512 * This allows us to use the low range for "special" directory entries:
2513 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2514 * we use the offset 2 for the '.zfs' directory.
2515 */
2516/* ARGSUSED */
2517static int
2518zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2519{
2520	znode_t		*zp = VTOZ(vp);
2521	iovec_t		*iovp;
2522	edirent_t	*eodp;
2523	dirent64_t	*odp;
2524	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2525	objset_t	*os;
2526	caddr_t		outbuf;
2527	size_t		bufsize;
2528	zap_cursor_t	zc;
2529	zap_attribute_t	zap;
2530	uint_t		bytes_wanted;
2531	uint64_t	offset; /* must be unsigned; checks for < 1 */
2532	uint64_t	parent;
2533	int		local_eof;
2534	int		outcount;
2535	int		error;
2536	uint8_t		prefetch;
2537	boolean_t	check_sysattrs;
2538	uint8_t		type;
2539	int		ncooks;
2540	u_long		*cooks = NULL;
2541	int		flags = 0;
2542
2543	ZFS_ENTER(zfsvfs);
2544	ZFS_VERIFY_ZP(zp);
2545
2546	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2547	    &parent, sizeof (parent))) != 0) {
2548		ZFS_EXIT(zfsvfs);
2549		return (error);
2550	}
2551
2552	/*
2553	 * If we are not given an eof variable,
2554	 * use a local one.
2555	 */
2556	if (eofp == NULL)
2557		eofp = &local_eof;
2558
2559	/*
2560	 * Check for valid iov_len.
2561	 */
2562	if (uio->uio_iov->iov_len <= 0) {
2563		ZFS_EXIT(zfsvfs);
2564		return (SET_ERROR(EINVAL));
2565	}
2566
2567	/*
2568	 * Quit if directory has been removed (posix)
2569	 */
2570	if ((*eofp = zp->z_unlinked) != 0) {
2571		ZFS_EXIT(zfsvfs);
2572		return (0);
2573	}
2574
2575	error = 0;
2576	os = zfsvfs->z_os;
2577	offset = uio->uio_loffset;
2578	prefetch = zp->z_zn_prefetch;
2579
2580	/*
2581	 * Initialize the iterator cursor.
2582	 */
2583	if (offset <= 3) {
2584		/*
2585		 * Start iteration from the beginning of the directory.
2586		 */
2587		zap_cursor_init(&zc, os, zp->z_id);
2588	} else {
2589		/*
2590		 * The offset is a serialized cursor.
2591		 */
2592		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2593	}
2594
2595	/*
2596	 * Get space to change directory entries into fs independent format.
2597	 */
2598	iovp = uio->uio_iov;
2599	bytes_wanted = iovp->iov_len;
2600	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2601		bufsize = bytes_wanted;
2602		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2603		odp = (struct dirent64 *)outbuf;
2604	} else {
2605		bufsize = bytes_wanted;
2606		outbuf = NULL;
2607		odp = (struct dirent64 *)iovp->iov_base;
2608	}
2609	eodp = (struct edirent *)odp;
2610
2611	if (ncookies != NULL) {
2612		/*
2613		 * Minimum entry size is dirent size and 1 byte for a file name.
2614		 */
2615		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2616		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2617		*cookies = cooks;
2618		*ncookies = ncooks;
2619	}
2620	/*
2621	 * If this VFS supports the system attribute view interface; and
2622	 * we're looking at an extended attribute directory; and we care
2623	 * about normalization conflicts on this vfs; then we must check
2624	 * for normalization conflicts with the sysattr name space.
2625	 */
2626#ifdef TODO
2627	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2628	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2629	    (flags & V_RDDIR_ENTFLAGS);
2630#else
2631	check_sysattrs = 0;
2632#endif
2633
2634	/*
2635	 * Transform to file-system independent format
2636	 */
2637	outcount = 0;
2638	while (outcount < bytes_wanted) {
2639		ino64_t objnum;
2640		ushort_t reclen;
2641		off64_t *next = NULL;
2642
2643		/*
2644		 * Special case `.', `..', and `.zfs'.
2645		 */
2646		if (offset == 0) {
2647			(void) strcpy(zap.za_name, ".");
2648			zap.za_normalization_conflict = 0;
2649			objnum = zp->z_id;
2650			type = DT_DIR;
2651		} else if (offset == 1) {
2652			(void) strcpy(zap.za_name, "..");
2653			zap.za_normalization_conflict = 0;
2654			objnum = parent;
2655			type = DT_DIR;
2656		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2657			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2658			zap.za_normalization_conflict = 0;
2659			objnum = ZFSCTL_INO_ROOT;
2660			type = DT_DIR;
2661		} else {
2662			/*
2663			 * Grab next entry.
2664			 */
2665			if (error = zap_cursor_retrieve(&zc, &zap)) {
2666				if ((*eofp = (error == ENOENT)) != 0)
2667					break;
2668				else
2669					goto update;
2670			}
2671
2672			if (zap.za_integer_length != 8 ||
2673			    zap.za_num_integers != 1) {
2674				cmn_err(CE_WARN, "zap_readdir: bad directory "
2675				    "entry, obj = %lld, offset = %lld\n",
2676				    (u_longlong_t)zp->z_id,
2677				    (u_longlong_t)offset);
2678				error = SET_ERROR(ENXIO);
2679				goto update;
2680			}
2681
2682			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2683			/*
2684			 * MacOS X can extract the object type here such as:
2685			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2686			 */
2687			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2688
2689			if (check_sysattrs && !zap.za_normalization_conflict) {
2690#ifdef TODO
2691				zap.za_normalization_conflict =
2692				    xattr_sysattr_casechk(zap.za_name);
2693#else
2694				panic("%s:%u: TODO", __func__, __LINE__);
2695#endif
2696			}
2697		}
2698
2699		if (flags & V_RDDIR_ACCFILTER) {
2700			/*
2701			 * If we have no access at all, don't include
2702			 * this entry in the returned information
2703			 */
2704			znode_t	*ezp;
2705			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2706				goto skip_entry;
2707			if (!zfs_has_access(ezp, cr)) {
2708				VN_RELE(ZTOV(ezp));
2709				goto skip_entry;
2710			}
2711			VN_RELE(ZTOV(ezp));
2712		}
2713
2714		if (flags & V_RDDIR_ENTFLAGS)
2715			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2716		else
2717			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2718
2719		/*
2720		 * Will this entry fit in the buffer?
2721		 */
2722		if (outcount + reclen > bufsize) {
2723			/*
2724			 * Did we manage to fit anything in the buffer?
2725			 */
2726			if (!outcount) {
2727				error = SET_ERROR(EINVAL);
2728				goto update;
2729			}
2730			break;
2731		}
2732		if (flags & V_RDDIR_ENTFLAGS) {
2733			/*
2734			 * Add extended flag entry:
2735			 */
2736			eodp->ed_ino = objnum;
2737			eodp->ed_reclen = reclen;
2738			/* NOTE: ed_off is the offset for the *next* entry */
2739			next = &(eodp->ed_off);
2740			eodp->ed_eflags = zap.za_normalization_conflict ?
2741			    ED_CASE_CONFLICT : 0;
2742			(void) strncpy(eodp->ed_name, zap.za_name,
2743			    EDIRENT_NAMELEN(reclen));
2744			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2745		} else {
2746			/*
2747			 * Add normal entry:
2748			 */
2749			odp->d_ino = objnum;
2750			odp->d_reclen = reclen;
2751			odp->d_namlen = strlen(zap.za_name);
2752			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2753			odp->d_type = type;
2754			odp = (dirent64_t *)((intptr_t)odp + reclen);
2755		}
2756		outcount += reclen;
2757
2758		ASSERT(outcount <= bufsize);
2759
2760		/* Prefetch znode */
2761		if (prefetch)
2762			dmu_prefetch(os, objnum, 0, 0, 0,
2763			    ZIO_PRIORITY_SYNC_READ);
2764
2765	skip_entry:
2766		/*
2767		 * Move to the next entry, fill in the previous offset.
2768		 */
2769		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2770			zap_cursor_advance(&zc);
2771			offset = zap_cursor_serialize(&zc);
2772		} else {
2773			offset += 1;
2774		}
2775
2776		if (cooks != NULL) {
2777			*cooks++ = offset;
2778			ncooks--;
2779			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2780		}
2781	}
2782	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2783
2784	/* Subtract unused cookies */
2785	if (ncookies != NULL)
2786		*ncookies -= ncooks;
2787
2788	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2789		iovp->iov_base += outcount;
2790		iovp->iov_len -= outcount;
2791		uio->uio_resid -= outcount;
2792	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2793		/*
2794		 * Reset the pointer.
2795		 */
2796		offset = uio->uio_loffset;
2797	}
2798
2799update:
2800	zap_cursor_fini(&zc);
2801	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2802		kmem_free(outbuf, bufsize);
2803
2804	if (error == ENOENT)
2805		error = 0;
2806
2807	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2808
2809	uio->uio_loffset = offset;
2810	ZFS_EXIT(zfsvfs);
2811	if (error != 0 && cookies != NULL) {
2812		free(*cookies, M_TEMP);
2813		*cookies = NULL;
2814		*ncookies = 0;
2815	}
2816	return (error);
2817}
2818
2819ulong_t zfs_fsync_sync_cnt = 4;
2820
2821static int
2822zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2823{
2824	znode_t	*zp = VTOZ(vp);
2825	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2826
2827	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2828
2829	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2830		ZFS_ENTER(zfsvfs);
2831		ZFS_VERIFY_ZP(zp);
2832		zil_commit(zfsvfs->z_log, zp->z_id);
2833		ZFS_EXIT(zfsvfs);
2834	}
2835	return (0);
2836}
2837
2838
2839/*
2840 * Get the requested file attributes and place them in the provided
2841 * vattr structure.
2842 *
2843 *	IN:	vp	- vnode of file.
2844 *		vap	- va_mask identifies requested attributes.
2845 *			  If AT_XVATTR set, then optional attrs are requested
2846 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2847 *		cr	- credentials of caller.
2848 *		ct	- caller context
2849 *
2850 *	OUT:	vap	- attribute values.
2851 *
2852 *	RETURN:	0 (always succeeds).
2853 */
2854/* ARGSUSED */
2855static int
2856zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2857    caller_context_t *ct)
2858{
2859	znode_t *zp = VTOZ(vp);
2860	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2861	int	error = 0;
2862	uint32_t blksize;
2863	u_longlong_t nblocks;
2864	uint64_t links;
2865	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2866	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2867	xoptattr_t *xoap = NULL;
2868	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2869	sa_bulk_attr_t bulk[4];
2870	int count = 0;
2871
2872	ZFS_ENTER(zfsvfs);
2873	ZFS_VERIFY_ZP(zp);
2874
2875	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2876
2877	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2878	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2879	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2880	if (vp->v_type == VBLK || vp->v_type == VCHR)
2881		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2882		    &rdev, 8);
2883
2884	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2885		ZFS_EXIT(zfsvfs);
2886		return (error);
2887	}
2888
2889	/*
2890	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2891	 * Also, if we are the owner don't bother, since owner should
2892	 * always be allowed to read basic attributes of file.
2893	 */
2894	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2895	    (vap->va_uid != crgetuid(cr))) {
2896		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2897		    skipaclchk, cr)) {
2898			ZFS_EXIT(zfsvfs);
2899			return (error);
2900		}
2901	}
2902
2903	/*
2904	 * Return all attributes.  It's cheaper to provide the answer
2905	 * than to determine whether we were asked the question.
2906	 */
2907
2908	mutex_enter(&zp->z_lock);
2909	vap->va_type = IFTOVT(zp->z_mode);
2910	vap->va_mode = zp->z_mode & ~S_IFMT;
2911#ifdef illumos
2912	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2913#else
2914	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2915#endif
2916	vap->va_nodeid = zp->z_id;
2917	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2918		links = zp->z_links + 1;
2919	else
2920		links = zp->z_links;
2921	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2922	vap->va_size = zp->z_size;
2923#ifdef illumos
2924	vap->va_rdev = vp->v_rdev;
2925#else
2926	if (vp->v_type == VBLK || vp->v_type == VCHR)
2927		vap->va_rdev = zfs_cmpldev(rdev);
2928#endif
2929	vap->va_seq = zp->z_seq;
2930	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2931	vap->va_filerev = zp->z_seq;
2932
2933	/*
2934	 * Add in any requested optional attributes and the create time.
2935	 * Also set the corresponding bits in the returned attribute bitmap.
2936	 */
2937	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2938		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2939			xoap->xoa_archive =
2940			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2941			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2942		}
2943
2944		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2945			xoap->xoa_readonly =
2946			    ((zp->z_pflags & ZFS_READONLY) != 0);
2947			XVA_SET_RTN(xvap, XAT_READONLY);
2948		}
2949
2950		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2951			xoap->xoa_system =
2952			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2953			XVA_SET_RTN(xvap, XAT_SYSTEM);
2954		}
2955
2956		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2957			xoap->xoa_hidden =
2958			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2959			XVA_SET_RTN(xvap, XAT_HIDDEN);
2960		}
2961
2962		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2963			xoap->xoa_nounlink =
2964			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2965			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2966		}
2967
2968		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2969			xoap->xoa_immutable =
2970			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2971			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2972		}
2973
2974		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2975			xoap->xoa_appendonly =
2976			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2977			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2978		}
2979
2980		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2981			xoap->xoa_nodump =
2982			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2983			XVA_SET_RTN(xvap, XAT_NODUMP);
2984		}
2985
2986		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2987			xoap->xoa_opaque =
2988			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2989			XVA_SET_RTN(xvap, XAT_OPAQUE);
2990		}
2991
2992		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2993			xoap->xoa_av_quarantined =
2994			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2995			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2996		}
2997
2998		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2999			xoap->xoa_av_modified =
3000			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
3001			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
3002		}
3003
3004		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
3005		    vp->v_type == VREG) {
3006			zfs_sa_get_scanstamp(zp, xvap);
3007		}
3008
3009		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
3010			uint64_t times[2];
3011
3012			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
3013			    times, sizeof (times));
3014			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
3015			XVA_SET_RTN(xvap, XAT_CREATETIME);
3016		}
3017
3018		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3019			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
3020			XVA_SET_RTN(xvap, XAT_REPARSE);
3021		}
3022		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
3023			xoap->xoa_generation = zp->z_gen;
3024			XVA_SET_RTN(xvap, XAT_GEN);
3025		}
3026
3027		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
3028			xoap->xoa_offline =
3029			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
3030			XVA_SET_RTN(xvap, XAT_OFFLINE);
3031		}
3032
3033		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
3034			xoap->xoa_sparse =
3035			    ((zp->z_pflags & ZFS_SPARSE) != 0);
3036			XVA_SET_RTN(xvap, XAT_SPARSE);
3037		}
3038	}
3039
3040	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
3041	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
3042	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
3043	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
3044
3045	mutex_exit(&zp->z_lock);
3046
3047	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
3048	vap->va_blksize = blksize;
3049	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
3050
3051	if (zp->z_blksz == 0) {
3052		/*
3053		 * Block size hasn't been set; suggest maximal I/O transfers.
3054		 */
3055		vap->va_blksize = zfsvfs->z_max_blksz;
3056	}
3057
3058	ZFS_EXIT(zfsvfs);
3059	return (0);
3060}
3061
3062/*
3063 * Set the file attributes to the values contained in the
3064 * vattr structure.
3065 *
3066 *	IN:	vp	- vnode of file to be modified.
3067 *		vap	- new attribute values.
3068 *			  If AT_XVATTR set, then optional attrs are being set
3069 *		flags	- ATTR_UTIME set if non-default time values provided.
3070 *			- ATTR_NOACLCHECK (CIFS context only).
3071 *		cr	- credentials of caller.
3072 *		ct	- caller context
3073 *
3074 *	RETURN:	0 on success, error code on failure.
3075 *
3076 * Timestamps:
3077 *	vp - ctime updated, mtime updated if size changed.
3078 */
3079/* ARGSUSED */
3080static int
3081zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3082    caller_context_t *ct)
3083{
3084	znode_t		*zp = VTOZ(vp);
3085	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3086	zilog_t		*zilog;
3087	dmu_tx_t	*tx;
3088	vattr_t		oldva;
3089	xvattr_t	tmpxvattr;
3090	uint_t		mask = vap->va_mask;
3091	uint_t		saved_mask = 0;
3092	uint64_t	saved_mode;
3093	int		trim_mask = 0;
3094	uint64_t	new_mode;
3095	uint64_t	new_uid, new_gid;
3096	uint64_t	xattr_obj;
3097	uint64_t	mtime[2], ctime[2];
3098	znode_t		*attrzp;
3099	int		need_policy = FALSE;
3100	int		err, err2;
3101	zfs_fuid_info_t *fuidp = NULL;
3102	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
3103	xoptattr_t	*xoap;
3104	zfs_acl_t	*aclp;
3105	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3106	boolean_t	fuid_dirtied = B_FALSE;
3107	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
3108	int		count = 0, xattr_count = 0;
3109
3110	if (mask == 0)
3111		return (0);
3112
3113	if (mask & AT_NOSET)
3114		return (SET_ERROR(EINVAL));
3115
3116	ZFS_ENTER(zfsvfs);
3117	ZFS_VERIFY_ZP(zp);
3118
3119	zilog = zfsvfs->z_log;
3120
3121	/*
3122	 * Make sure that if we have ephemeral uid/gid or xvattr specified
3123	 * that file system is at proper version level
3124	 */
3125
3126	if (zfsvfs->z_use_fuids == B_FALSE &&
3127	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3128	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3129	    (mask & AT_XVATTR))) {
3130		ZFS_EXIT(zfsvfs);
3131		return (SET_ERROR(EINVAL));
3132	}
3133
3134	if (mask & AT_SIZE && vp->v_type == VDIR) {
3135		ZFS_EXIT(zfsvfs);
3136		return (SET_ERROR(EISDIR));
3137	}
3138
3139	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3140		ZFS_EXIT(zfsvfs);
3141		return (SET_ERROR(EINVAL));
3142	}
3143
3144	/*
3145	 * If this is an xvattr_t, then get a pointer to the structure of
3146	 * optional attributes.  If this is NULL, then we have a vattr_t.
3147	 */
3148	xoap = xva_getxoptattr(xvap);
3149
3150	xva_init(&tmpxvattr);
3151
3152	/*
3153	 * Immutable files can only alter immutable bit and atime
3154	 */
3155	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3156	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3157	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3158		ZFS_EXIT(zfsvfs);
3159		return (SET_ERROR(EPERM));
3160	}
3161
3162	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
3163		ZFS_EXIT(zfsvfs);
3164		return (SET_ERROR(EPERM));
3165	}
3166
3167	/*
3168	 * Verify timestamps doesn't overflow 32 bits.
3169	 * ZFS can handle large timestamps, but 32bit syscalls can't
3170	 * handle times greater than 2039.  This check should be removed
3171	 * once large timestamps are fully supported.
3172	 */
3173	if (mask & (AT_ATIME | AT_MTIME)) {
3174		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3175		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3176			ZFS_EXIT(zfsvfs);
3177			return (SET_ERROR(EOVERFLOW));
3178		}
3179	}
3180
3181top:
3182	attrzp = NULL;
3183	aclp = NULL;
3184
3185	/* Can this be moved to before the top label? */
3186	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3187		ZFS_EXIT(zfsvfs);
3188		return (SET_ERROR(EROFS));
3189	}
3190
3191	/*
3192	 * First validate permissions
3193	 */
3194
3195	if (mask & AT_SIZE) {
3196		/*
3197		 * XXX - Note, we are not providing any open
3198		 * mode flags here (like FNDELAY), so we may
3199		 * block if there are locks present... this
3200		 * should be addressed in openat().
3201		 */
3202		/* XXX - would it be OK to generate a log record here? */
3203		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3204		if (err) {
3205			ZFS_EXIT(zfsvfs);
3206			return (err);
3207		}
3208	}
3209
3210	if (mask & (AT_ATIME|AT_MTIME) ||
3211	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3212	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3213	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3214	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3215	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3216	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3217	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3218		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3219		    skipaclchk, cr);
3220	}
3221
3222	if (mask & (AT_UID|AT_GID)) {
3223		int	idmask = (mask & (AT_UID|AT_GID));
3224		int	take_owner;
3225		int	take_group;
3226
3227		/*
3228		 * NOTE: even if a new mode is being set,
3229		 * we may clear S_ISUID/S_ISGID bits.
3230		 */
3231
3232		if (!(mask & AT_MODE))
3233			vap->va_mode = zp->z_mode;
3234
3235		/*
3236		 * Take ownership or chgrp to group we are a member of
3237		 */
3238
3239		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3240		take_group = (mask & AT_GID) &&
3241		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3242
3243		/*
3244		 * If both AT_UID and AT_GID are set then take_owner and
3245		 * take_group must both be set in order to allow taking
3246		 * ownership.
3247		 *
3248		 * Otherwise, send the check through secpolicy_vnode_setattr()
3249		 *
3250		 */
3251
3252		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3253		    ((idmask == AT_UID) && take_owner) ||
3254		    ((idmask == AT_GID) && take_group)) {
3255			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3256			    skipaclchk, cr) == 0) {
3257				/*
3258				 * Remove setuid/setgid for non-privileged users
3259				 */
3260				secpolicy_setid_clear(vap, vp, cr);
3261				trim_mask = (mask & (AT_UID|AT_GID));
3262			} else {
3263				need_policy =  TRUE;
3264			}
3265		} else {
3266			need_policy =  TRUE;
3267		}
3268	}
3269
3270	mutex_enter(&zp->z_lock);
3271	oldva.va_mode = zp->z_mode;
3272	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3273	if (mask & AT_XVATTR) {
3274		/*
3275		 * Update xvattr mask to include only those attributes
3276		 * that are actually changing.
3277		 *
3278		 * the bits will be restored prior to actually setting
3279		 * the attributes so the caller thinks they were set.
3280		 */
3281		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3282			if (xoap->xoa_appendonly !=
3283			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3284				need_policy = TRUE;
3285			} else {
3286				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3287				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3288			}
3289		}
3290
3291		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3292			if (xoap->xoa_nounlink !=
3293			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3294				need_policy = TRUE;
3295			} else {
3296				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3297				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3298			}
3299		}
3300
3301		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3302			if (xoap->xoa_immutable !=
3303			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3304				need_policy = TRUE;
3305			} else {
3306				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3307				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3308			}
3309		}
3310
3311		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3312			if (xoap->xoa_nodump !=
3313			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3314				need_policy = TRUE;
3315			} else {
3316				XVA_CLR_REQ(xvap, XAT_NODUMP);
3317				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3318			}
3319		}
3320
3321		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3322			if (xoap->xoa_av_modified !=
3323			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3324				need_policy = TRUE;
3325			} else {
3326				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3327				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3328			}
3329		}
3330
3331		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3332			if ((vp->v_type != VREG &&
3333			    xoap->xoa_av_quarantined) ||
3334			    xoap->xoa_av_quarantined !=
3335			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3336				need_policy = TRUE;
3337			} else {
3338				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3339				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3340			}
3341		}
3342
3343		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3344			mutex_exit(&zp->z_lock);
3345			ZFS_EXIT(zfsvfs);
3346			return (SET_ERROR(EPERM));
3347		}
3348
3349		if (need_policy == FALSE &&
3350		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3351		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3352			need_policy = TRUE;
3353		}
3354	}
3355
3356	mutex_exit(&zp->z_lock);
3357
3358	if (mask & AT_MODE) {
3359		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3360			err = secpolicy_setid_setsticky_clear(vp, vap,
3361			    &oldva, cr);
3362			if (err) {
3363				ZFS_EXIT(zfsvfs);
3364				return (err);
3365			}
3366			trim_mask |= AT_MODE;
3367		} else {
3368			need_policy = TRUE;
3369		}
3370	}
3371
3372	if (need_policy) {
3373		/*
3374		 * If trim_mask is set then take ownership
3375		 * has been granted or write_acl is present and user
3376		 * has the ability to modify mode.  In that case remove
3377		 * UID|GID and or MODE from mask so that
3378		 * secpolicy_vnode_setattr() doesn't revoke it.
3379		 */
3380
3381		if (trim_mask) {
3382			saved_mask = vap->va_mask;
3383			vap->va_mask &= ~trim_mask;
3384			if (trim_mask & AT_MODE) {
3385				/*
3386				 * Save the mode, as secpolicy_vnode_setattr()
3387				 * will overwrite it with ova.va_mode.
3388				 */
3389				saved_mode = vap->va_mode;
3390			}
3391		}
3392		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3393		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3394		if (err) {
3395			ZFS_EXIT(zfsvfs);
3396			return (err);
3397		}
3398
3399		if (trim_mask) {
3400			vap->va_mask |= saved_mask;
3401			if (trim_mask & AT_MODE) {
3402				/*
3403				 * Recover the mode after
3404				 * secpolicy_vnode_setattr().
3405				 */
3406				vap->va_mode = saved_mode;
3407			}
3408		}
3409	}
3410
3411	/*
3412	 * secpolicy_vnode_setattr, or take ownership may have
3413	 * changed va_mask
3414	 */
3415	mask = vap->va_mask;
3416
3417	if ((mask & (AT_UID | AT_GID))) {
3418		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3419		    &xattr_obj, sizeof (xattr_obj));
3420
3421		if (err == 0 && xattr_obj) {
3422			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3423			if (err)
3424				goto out2;
3425		}
3426		if (mask & AT_UID) {
3427			new_uid = zfs_fuid_create(zfsvfs,
3428			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3429			if (new_uid != zp->z_uid &&
3430			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3431				if (attrzp)
3432					VN_RELE(ZTOV(attrzp));
3433				err = SET_ERROR(EDQUOT);
3434				goto out2;
3435			}
3436		}
3437
3438		if (mask & AT_GID) {
3439			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3440			    cr, ZFS_GROUP, &fuidp);
3441			if (new_gid != zp->z_gid &&
3442			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3443				if (attrzp)
3444					VN_RELE(ZTOV(attrzp));
3445				err = SET_ERROR(EDQUOT);
3446				goto out2;
3447			}
3448		}
3449	}
3450	tx = dmu_tx_create(zfsvfs->z_os);
3451
3452	if (mask & AT_MODE) {
3453		uint64_t pmode = zp->z_mode;
3454		uint64_t acl_obj;
3455		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3456
3457		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3458		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3459			err = SET_ERROR(EPERM);
3460			goto out;
3461		}
3462
3463		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3464			goto out;
3465
3466		mutex_enter(&zp->z_lock);
3467		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3468			/*
3469			 * Are we upgrading ACL from old V0 format
3470			 * to V1 format?
3471			 */
3472			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3473			    zfs_znode_acl_version(zp) ==
3474			    ZFS_ACL_VERSION_INITIAL) {
3475				dmu_tx_hold_free(tx, acl_obj, 0,
3476				    DMU_OBJECT_END);
3477				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3478				    0, aclp->z_acl_bytes);
3479			} else {
3480				dmu_tx_hold_write(tx, acl_obj, 0,
3481				    aclp->z_acl_bytes);
3482			}
3483		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3484			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3485			    0, aclp->z_acl_bytes);
3486		}
3487		mutex_exit(&zp->z_lock);
3488		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3489	} else {
3490		if ((mask & AT_XVATTR) &&
3491		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3492			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3493		else
3494			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3495	}
3496
3497	if (attrzp) {
3498		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3499	}
3500
3501	fuid_dirtied = zfsvfs->z_fuid_dirty;
3502	if (fuid_dirtied)
3503		zfs_fuid_txhold(zfsvfs, tx);
3504
3505	zfs_sa_upgrade_txholds(tx, zp);
3506
3507	err = dmu_tx_assign(tx, TXG_WAIT);
3508	if (err)
3509		goto out;
3510
3511	count = 0;
3512	/*
3513	 * Set each attribute requested.
3514	 * We group settings according to the locks they need to acquire.
3515	 *
3516	 * Note: you cannot set ctime directly, although it will be
3517	 * updated as a side-effect of calling this function.
3518	 */
3519
3520
3521	if (mask & (AT_UID|AT_GID|AT_MODE))
3522		mutex_enter(&zp->z_acl_lock);
3523	mutex_enter(&zp->z_lock);
3524
3525	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3526	    &zp->z_pflags, sizeof (zp->z_pflags));
3527
3528	if (attrzp) {
3529		if (mask & (AT_UID|AT_GID|AT_MODE))
3530			mutex_enter(&attrzp->z_acl_lock);
3531		mutex_enter(&attrzp->z_lock);
3532		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3533		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3534		    sizeof (attrzp->z_pflags));
3535	}
3536
3537	if (mask & (AT_UID|AT_GID)) {
3538
3539		if (mask & AT_UID) {
3540			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3541			    &new_uid, sizeof (new_uid));
3542			zp->z_uid = new_uid;
3543			if (attrzp) {
3544				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3545				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3546				    sizeof (new_uid));
3547				attrzp->z_uid = new_uid;
3548			}
3549		}
3550
3551		if (mask & AT_GID) {
3552			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3553			    NULL, &new_gid, sizeof (new_gid));
3554			zp->z_gid = new_gid;
3555			if (attrzp) {
3556				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3557				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3558				    sizeof (new_gid));
3559				attrzp->z_gid = new_gid;
3560			}
3561		}
3562		if (!(mask & AT_MODE)) {
3563			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3564			    NULL, &new_mode, sizeof (new_mode));
3565			new_mode = zp->z_mode;
3566		}
3567		err = zfs_acl_chown_setattr(zp);
3568		ASSERT(err == 0);
3569		if (attrzp) {
3570			err = zfs_acl_chown_setattr(attrzp);
3571			ASSERT(err == 0);
3572		}
3573	}
3574
3575	if (mask & AT_MODE) {
3576		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3577		    &new_mode, sizeof (new_mode));
3578		zp->z_mode = new_mode;
3579		ASSERT3U((uintptr_t)aclp, !=, 0);
3580		err = zfs_aclset_common(zp, aclp, cr, tx);
3581		ASSERT0(err);
3582		if (zp->z_acl_cached)
3583			zfs_acl_free(zp->z_acl_cached);
3584		zp->z_acl_cached = aclp;
3585		aclp = NULL;
3586	}
3587
3588
3589	if (mask & AT_ATIME) {
3590		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3591		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3592		    &zp->z_atime, sizeof (zp->z_atime));
3593	}
3594
3595	if (mask & AT_MTIME) {
3596		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3597		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3598		    mtime, sizeof (mtime));
3599	}
3600
3601	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3602	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3603		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3604		    NULL, mtime, sizeof (mtime));
3605		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3606		    &ctime, sizeof (ctime));
3607		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3608		    B_TRUE);
3609	} else if (mask != 0) {
3610		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3611		    &ctime, sizeof (ctime));
3612		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3613		    B_TRUE);
3614		if (attrzp) {
3615			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3616			    SA_ZPL_CTIME(zfsvfs), NULL,
3617			    &ctime, sizeof (ctime));
3618			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3619			    mtime, ctime, B_TRUE);
3620		}
3621	}
3622	/*
3623	 * Do this after setting timestamps to prevent timestamp
3624	 * update from toggling bit
3625	 */
3626
3627	if (xoap && (mask & AT_XVATTR)) {
3628
3629		/*
3630		 * restore trimmed off masks
3631		 * so that return masks can be set for caller.
3632		 */
3633
3634		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3635			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3636		}
3637		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3638			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3639		}
3640		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3641			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3642		}
3643		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3644			XVA_SET_REQ(xvap, XAT_NODUMP);
3645		}
3646		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3647			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3648		}
3649		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3650			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3651		}
3652
3653		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3654			ASSERT(vp->v_type == VREG);
3655
3656		zfs_xvattr_set(zp, xvap, tx);
3657	}
3658
3659	if (fuid_dirtied)
3660		zfs_fuid_sync(zfsvfs, tx);
3661
3662	if (mask != 0)
3663		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3664
3665	mutex_exit(&zp->z_lock);
3666	if (mask & (AT_UID|AT_GID|AT_MODE))
3667		mutex_exit(&zp->z_acl_lock);
3668
3669	if (attrzp) {
3670		if (mask & (AT_UID|AT_GID|AT_MODE))
3671			mutex_exit(&attrzp->z_acl_lock);
3672		mutex_exit(&attrzp->z_lock);
3673	}
3674out:
3675	if (err == 0 && attrzp) {
3676		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3677		    xattr_count, tx);
3678		ASSERT(err2 == 0);
3679	}
3680
3681	if (attrzp)
3682		VN_RELE(ZTOV(attrzp));
3683
3684	if (aclp)
3685		zfs_acl_free(aclp);
3686
3687	if (fuidp) {
3688		zfs_fuid_info_free(fuidp);
3689		fuidp = NULL;
3690	}
3691
3692	if (err) {
3693		dmu_tx_abort(tx);
3694		if (err == ERESTART)
3695			goto top;
3696	} else {
3697		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3698		dmu_tx_commit(tx);
3699	}
3700
3701out2:
3702	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3703		zil_commit(zilog, 0);
3704
3705	ZFS_EXIT(zfsvfs);
3706	return (err);
3707}
3708
3709typedef struct zfs_zlock {
3710	krwlock_t	*zl_rwlock;	/* lock we acquired */
3711	znode_t		*zl_znode;	/* znode we held */
3712	struct zfs_zlock *zl_next;	/* next in list */
3713} zfs_zlock_t;
3714
3715/*
3716 * Drop locks and release vnodes that were held by zfs_rename_lock().
3717 */
3718static void
3719zfs_rename_unlock(zfs_zlock_t **zlpp)
3720{
3721	zfs_zlock_t *zl;
3722
3723	while ((zl = *zlpp) != NULL) {
3724		if (zl->zl_znode != NULL)
3725			VN_RELE(ZTOV(zl->zl_znode));
3726		rw_exit(zl->zl_rwlock);
3727		*zlpp = zl->zl_next;
3728		kmem_free(zl, sizeof (*zl));
3729	}
3730}
3731
3732/*
3733 * Search back through the directory tree, using the ".." entries.
3734 * Lock each directory in the chain to prevent concurrent renames.
3735 * Fail any attempt to move a directory into one of its own descendants.
3736 * XXX - z_parent_lock can overlap with map or grow locks
3737 */
3738static int
3739zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3740{
3741	zfs_zlock_t	*zl;
3742	znode_t		*zp = tdzp;
3743	uint64_t	rootid = zp->z_zfsvfs->z_root;
3744	uint64_t	oidp = zp->z_id;
3745	krwlock_t	*rwlp = &szp->z_parent_lock;
3746	krw_t		rw = RW_WRITER;
3747
3748	/*
3749	 * First pass write-locks szp and compares to zp->z_id.
3750	 * Later passes read-lock zp and compare to zp->z_parent.
3751	 */
3752	do {
3753		if (!rw_tryenter(rwlp, rw)) {
3754			/*
3755			 * Another thread is renaming in this path.
3756			 * Note that if we are a WRITER, we don't have any
3757			 * parent_locks held yet.
3758			 */
3759			if (rw == RW_READER && zp->z_id > szp->z_id) {
3760				/*
3761				 * Drop our locks and restart
3762				 */
3763				zfs_rename_unlock(&zl);
3764				*zlpp = NULL;
3765				zp = tdzp;
3766				oidp = zp->z_id;
3767				rwlp = &szp->z_parent_lock;
3768				rw = RW_WRITER;
3769				continue;
3770			} else {
3771				/*
3772				 * Wait for other thread to drop its locks
3773				 */
3774				rw_enter(rwlp, rw);
3775			}
3776		}
3777
3778		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3779		zl->zl_rwlock = rwlp;
3780		zl->zl_znode = NULL;
3781		zl->zl_next = *zlpp;
3782		*zlpp = zl;
3783
3784		if (oidp == szp->z_id)		/* We're a descendant of szp */
3785			return (SET_ERROR(EINVAL));
3786
3787		if (oidp == rootid)		/* We've hit the top */
3788			return (0);
3789
3790		if (rw == RW_READER) {		/* i.e. not the first pass */
3791			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3792			if (error)
3793				return (error);
3794			zl->zl_znode = zp;
3795		}
3796		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3797		    &oidp, sizeof (oidp));
3798		rwlp = &zp->z_parent_lock;
3799		rw = RW_READER;
3800
3801	} while (zp->z_id != sdzp->z_id);
3802
3803	return (0);
3804}
3805
3806/*
3807 * Move an entry from the provided source directory to the target
3808 * directory.  Change the entry name as indicated.
3809 *
3810 *	IN:	sdvp	- Source directory containing the "old entry".
3811 *		snm	- Old entry name.
3812 *		tdvp	- Target directory to contain the "new entry".
3813 *		tnm	- New entry name.
3814 *		cr	- credentials of caller.
3815 *		ct	- caller context
3816 *		flags	- case flags
3817 *
3818 *	RETURN:	0 on success, error code on failure.
3819 *
3820 * Timestamps:
3821 *	sdvp,tdvp - ctime|mtime updated
3822 */
3823/*ARGSUSED*/
3824static int
3825zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3826    caller_context_t *ct, int flags)
3827{
3828	znode_t		*tdzp, *sdzp, *szp, *tzp;
3829	zfsvfs_t 	*zfsvfs;
3830	zilog_t		*zilog;
3831	vnode_t		*realvp;
3832	zfs_dirlock_t	*sdl, *tdl;
3833	dmu_tx_t	*tx;
3834	zfs_zlock_t	*zl;
3835	int		cmp, serr, terr;
3836	int		error = 0;
3837	int		zflg = 0;
3838	boolean_t	waited = B_FALSE;
3839
3840	tdzp = VTOZ(tdvp);
3841	ZFS_VERIFY_ZP(tdzp);
3842	zfsvfs = tdzp->z_zfsvfs;
3843	ZFS_ENTER(zfsvfs);
3844	zilog = zfsvfs->z_log;
3845	sdzp = VTOZ(sdvp);
3846
3847	/*
3848	 * In case sdzp is not valid, let's be sure to exit from the right
3849	 * zfsvfs_t.
3850	 */
3851	if (sdzp->z_sa_hdl == NULL) {
3852		ZFS_EXIT(zfsvfs);
3853		return (SET_ERROR(EIO));
3854	}
3855
3856	/*
3857	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3858	 * ctldir appear to have the same v_vfsp.
3859	 */
3860	if (sdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3861		ZFS_EXIT(zfsvfs);
3862		return (SET_ERROR(EXDEV));
3863	}
3864
3865	if (zfsvfs->z_utf8 && u8_validate(tnm,
3866	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3867		ZFS_EXIT(zfsvfs);
3868		return (SET_ERROR(EILSEQ));
3869	}
3870
3871	if (flags & FIGNORECASE)
3872		zflg |= ZCILOOK;
3873
3874top:
3875	szp = NULL;
3876	tzp = NULL;
3877	zl = NULL;
3878
3879	/*
3880	 * This is to prevent the creation of links into attribute space
3881	 * by renaming a linked file into/outof an attribute directory.
3882	 * See the comment in zfs_link() for why this is considered bad.
3883	 */
3884	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3885		ZFS_EXIT(zfsvfs);
3886		return (SET_ERROR(EINVAL));
3887	}
3888
3889	/*
3890	 * Lock source and target directory entries.  To prevent deadlock,
3891	 * a lock ordering must be defined.  We lock the directory with
3892	 * the smallest object id first, or if it's a tie, the one with
3893	 * the lexically first name.
3894	 */
3895	if (sdzp->z_id < tdzp->z_id) {
3896		cmp = -1;
3897	} else if (sdzp->z_id > tdzp->z_id) {
3898		cmp = 1;
3899	} else {
3900		/*
3901		 * First compare the two name arguments without
3902		 * considering any case folding.
3903		 */
3904		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3905
3906		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3907		ASSERT(error == 0 || !zfsvfs->z_utf8);
3908		if (cmp == 0) {
3909			/*
3910			 * POSIX: "If the old argument and the new argument
3911			 * both refer to links to the same existing file,
3912			 * the rename() function shall return successfully
3913			 * and perform no other action."
3914			 */
3915			ZFS_EXIT(zfsvfs);
3916			return (0);
3917		}
3918		/*
3919		 * If the file system is case-folding, then we may
3920		 * have some more checking to do.  A case-folding file
3921		 * system is either supporting mixed case sensitivity
3922		 * access or is completely case-insensitive.  Note
3923		 * that the file system is always case preserving.
3924		 *
3925		 * In mixed sensitivity mode case sensitive behavior
3926		 * is the default.  FIGNORECASE must be used to
3927		 * explicitly request case insensitive behavior.
3928		 *
3929		 * If the source and target names provided differ only
3930		 * by case (e.g., a request to rename 'tim' to 'Tim'),
3931		 * we will treat this as a special case in the
3932		 * case-insensitive mode: as long as the source name
3933		 * is an exact match, we will allow this to proceed as
3934		 * a name-change request.
3935		 */
3936		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3937		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
3938		    flags & FIGNORECASE)) &&
3939		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3940		    &error) == 0) {
3941			/*
3942			 * case preserving rename request, require exact
3943			 * name matches
3944			 */
3945			zflg |= ZCIEXACT;
3946			zflg &= ~ZCILOOK;
3947		}
3948	}
3949
3950	/*
3951	 * If the source and destination directories are the same, we should
3952	 * grab the z_name_lock of that directory only once.
3953	 */
3954	if (sdzp == tdzp) {
3955		zflg |= ZHAVELOCK;
3956		rw_enter(&sdzp->z_name_lock, RW_READER);
3957	}
3958
3959	if (cmp < 0) {
3960		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3961		    ZEXISTS | zflg, NULL, NULL);
3962		terr = zfs_dirent_lock(&tdl,
3963		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3964	} else {
3965		terr = zfs_dirent_lock(&tdl,
3966		    tdzp, tnm, &tzp, zflg, NULL, NULL);
3967		serr = zfs_dirent_lock(&sdl,
3968		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3969		    NULL, NULL);
3970	}
3971
3972	if (serr) {
3973		/*
3974		 * Source entry invalid or not there.
3975		 */
3976		if (!terr) {
3977			zfs_dirent_unlock(tdl);
3978			if (tzp)
3979				VN_RELE(ZTOV(tzp));
3980		}
3981
3982		if (sdzp == tdzp)
3983			rw_exit(&sdzp->z_name_lock);
3984
3985		/*
3986		 * FreeBSD: In OpenSolaris they only check if rename source is
3987		 * ".." here, because "." is handled in their lookup. This is
3988		 * not the case for FreeBSD, so we check for "." explicitly.
3989		 */
3990		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3991			serr = SET_ERROR(EINVAL);
3992		ZFS_EXIT(zfsvfs);
3993		return (serr);
3994	}
3995	if (terr) {
3996		zfs_dirent_unlock(sdl);
3997		VN_RELE(ZTOV(szp));
3998
3999		if (sdzp == tdzp)
4000			rw_exit(&sdzp->z_name_lock);
4001
4002		if (strcmp(tnm, "..") == 0)
4003			terr = SET_ERROR(EINVAL);
4004		ZFS_EXIT(zfsvfs);
4005		return (terr);
4006	}
4007
4008	/*
4009	 * Must have write access at the source to remove the old entry
4010	 * and write access at the target to create the new entry.
4011	 * Note that if target and source are the same, this can be
4012	 * done in a single check.
4013	 */
4014
4015	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
4016		goto out;
4017
4018	if (ZTOV(szp)->v_type == VDIR) {
4019		/*
4020		 * Check to make sure rename is valid.
4021		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
4022		 */
4023		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
4024			goto out;
4025	}
4026
4027	/*
4028	 * Does target exist?
4029	 */
4030	if (tzp) {
4031		/*
4032		 * Source and target must be the same type.
4033		 */
4034		if (ZTOV(szp)->v_type == VDIR) {
4035			if (ZTOV(tzp)->v_type != VDIR) {
4036				error = SET_ERROR(ENOTDIR);
4037				goto out;
4038			}
4039		} else {
4040			if (ZTOV(tzp)->v_type == VDIR) {
4041				error = SET_ERROR(EISDIR);
4042				goto out;
4043			}
4044		}
4045		/*
4046		 * POSIX dictates that when the source and target
4047		 * entries refer to the same file object, rename
4048		 * must do nothing and exit without error.
4049		 */
4050		if (szp->z_id == tzp->z_id) {
4051			error = 0;
4052			goto out;
4053		}
4054	}
4055
4056	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
4057	if (tzp)
4058		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
4059
4060	/*
4061	 * notify the target directory if it is not the same
4062	 * as source directory.
4063	 */
4064	if (tdvp != sdvp) {
4065		vnevent_rename_dest_dir(tdvp, ct);
4066	}
4067
4068	tx = dmu_tx_create(zfsvfs->z_os);
4069	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4070	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
4071	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
4072	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
4073	if (sdzp != tdzp) {
4074		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
4075		zfs_sa_upgrade_txholds(tx, tdzp);
4076	}
4077	if (tzp) {
4078		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
4079		zfs_sa_upgrade_txholds(tx, tzp);
4080	}
4081
4082	zfs_sa_upgrade_txholds(tx, szp);
4083	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4084	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4085	if (error) {
4086		if (zl != NULL)
4087			zfs_rename_unlock(&zl);
4088		zfs_dirent_unlock(sdl);
4089		zfs_dirent_unlock(tdl);
4090
4091		if (sdzp == tdzp)
4092			rw_exit(&sdzp->z_name_lock);
4093
4094		VN_RELE(ZTOV(szp));
4095		if (tzp)
4096			VN_RELE(ZTOV(tzp));
4097		if (error == ERESTART) {
4098			waited = B_TRUE;
4099			dmu_tx_wait(tx);
4100			dmu_tx_abort(tx);
4101			goto top;
4102		}
4103		dmu_tx_abort(tx);
4104		ZFS_EXIT(zfsvfs);
4105		return (error);
4106	}
4107
4108	if (tzp)	/* Attempt to remove the existing target */
4109		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
4110
4111	if (error == 0) {
4112		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
4113		if (error == 0) {
4114			szp->z_pflags |= ZFS_AV_MODIFIED;
4115
4116			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
4117			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
4118			ASSERT0(error);
4119
4120			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
4121			if (error == 0) {
4122				zfs_log_rename(zilog, tx, TX_RENAME |
4123				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
4124				    sdl->dl_name, tdzp, tdl->dl_name, szp);
4125
4126				/*
4127				 * Update path information for the target vnode
4128				 */
4129				vn_renamepath(tdvp, ZTOV(szp), tnm,
4130				    strlen(tnm));
4131			} else {
4132				/*
4133				 * At this point, we have successfully created
4134				 * the target name, but have failed to remove
4135				 * the source name.  Since the create was done
4136				 * with the ZRENAMING flag, there are
4137				 * complications; for one, the link count is
4138				 * wrong.  The easiest way to deal with this
4139				 * is to remove the newly created target, and
4140				 * return the original error.  This must
4141				 * succeed; fortunately, it is very unlikely to
4142				 * fail, since we just created it.
4143				 */
4144				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
4145				    ZRENAMING, NULL), ==, 0);
4146			}
4147		}
4148#ifdef FREEBSD_NAMECACHE
4149		if (error == 0) {
4150			cache_purge(sdvp);
4151			cache_purge(tdvp);
4152			cache_purge(ZTOV(szp));
4153			if (tzp)
4154				cache_purge(ZTOV(tzp));
4155		}
4156#endif
4157	}
4158
4159	dmu_tx_commit(tx);
4160out:
4161	if (zl != NULL)
4162		zfs_rename_unlock(&zl);
4163
4164	zfs_dirent_unlock(sdl);
4165	zfs_dirent_unlock(tdl);
4166
4167	if (sdzp == tdzp)
4168		rw_exit(&sdzp->z_name_lock);
4169
4170
4171	VN_RELE(ZTOV(szp));
4172	if (tzp)
4173		VN_RELE(ZTOV(tzp));
4174
4175	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4176		zil_commit(zilog, 0);
4177
4178	ZFS_EXIT(zfsvfs);
4179
4180	return (error);
4181}
4182
4183/*
4184 * Insert the indicated symbolic reference entry into the directory.
4185 *
4186 *	IN:	dvp	- Directory to contain new symbolic link.
4187 *		link	- Name for new symlink entry.
4188 *		vap	- Attributes of new entry.
4189 *		cr	- credentials of caller.
4190 *		ct	- caller context
4191 *		flags	- case flags
4192 *
4193 *	RETURN:	0 on success, error code on failure.
4194 *
4195 * Timestamps:
4196 *	dvp - ctime|mtime updated
4197 */
4198/*ARGSUSED*/
4199static int
4200zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4201    cred_t *cr, kthread_t *td)
4202{
4203	znode_t		*zp, *dzp = VTOZ(dvp);
4204	zfs_dirlock_t	*dl;
4205	dmu_tx_t	*tx;
4206	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4207	zilog_t		*zilog;
4208	uint64_t	len = strlen(link);
4209	int		error;
4210	int		zflg = ZNEW;
4211	zfs_acl_ids_t	acl_ids;
4212	boolean_t	fuid_dirtied;
4213	uint64_t	txtype = TX_SYMLINK;
4214	boolean_t	waited = B_FALSE;
4215	int		flags = 0;
4216
4217	ASSERT(vap->va_type == VLNK);
4218
4219	ZFS_ENTER(zfsvfs);
4220	ZFS_VERIFY_ZP(dzp);
4221	zilog = zfsvfs->z_log;
4222
4223	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4224	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4225		ZFS_EXIT(zfsvfs);
4226		return (SET_ERROR(EILSEQ));
4227	}
4228	if (flags & FIGNORECASE)
4229		zflg |= ZCILOOK;
4230
4231	if (len > MAXPATHLEN) {
4232		ZFS_EXIT(zfsvfs);
4233		return (SET_ERROR(ENAMETOOLONG));
4234	}
4235
4236	if ((error = zfs_acl_ids_create(dzp, 0,
4237	    vap, cr, NULL, &acl_ids)) != 0) {
4238		ZFS_EXIT(zfsvfs);
4239		return (error);
4240	}
4241
4242	getnewvnode_reserve(1);
4243
4244top:
4245	/*
4246	 * Attempt to lock directory; fail if entry already exists.
4247	 */
4248	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4249	if (error) {
4250		zfs_acl_ids_free(&acl_ids);
4251		getnewvnode_drop_reserve();
4252		ZFS_EXIT(zfsvfs);
4253		return (error);
4254	}
4255
4256	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4257		zfs_acl_ids_free(&acl_ids);
4258		zfs_dirent_unlock(dl);
4259		getnewvnode_drop_reserve();
4260		ZFS_EXIT(zfsvfs);
4261		return (error);
4262	}
4263
4264	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4265		zfs_acl_ids_free(&acl_ids);
4266		zfs_dirent_unlock(dl);
4267		getnewvnode_drop_reserve();
4268		ZFS_EXIT(zfsvfs);
4269		return (SET_ERROR(EDQUOT));
4270	}
4271	tx = dmu_tx_create(zfsvfs->z_os);
4272	fuid_dirtied = zfsvfs->z_fuid_dirty;
4273	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4274	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4275	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4276	    ZFS_SA_BASE_ATTR_SIZE + len);
4277	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4278	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4279		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4280		    acl_ids.z_aclp->z_acl_bytes);
4281	}
4282	if (fuid_dirtied)
4283		zfs_fuid_txhold(zfsvfs, tx);
4284	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4285	if (error) {
4286		zfs_dirent_unlock(dl);
4287		if (error == ERESTART) {
4288			waited = B_TRUE;
4289			dmu_tx_wait(tx);
4290			dmu_tx_abort(tx);
4291			goto top;
4292		}
4293		zfs_acl_ids_free(&acl_ids);
4294		dmu_tx_abort(tx);
4295		getnewvnode_drop_reserve();
4296		ZFS_EXIT(zfsvfs);
4297		return (error);
4298	}
4299
4300	/*
4301	 * Create a new object for the symlink.
4302	 * for version 4 ZPL datsets the symlink will be an SA attribute
4303	 */
4304	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4305
4306	if (fuid_dirtied)
4307		zfs_fuid_sync(zfsvfs, tx);
4308
4309	mutex_enter(&zp->z_lock);
4310	if (zp->z_is_sa)
4311		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4312		    link, len, tx);
4313	else
4314		zfs_sa_symlink(zp, link, len, tx);
4315	mutex_exit(&zp->z_lock);
4316
4317	zp->z_size = len;
4318	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4319	    &zp->z_size, sizeof (zp->z_size), tx);
4320	/*
4321	 * Insert the new object into the directory.
4322	 */
4323	(void) zfs_link_create(dl, zp, tx, ZNEW);
4324
4325	if (flags & FIGNORECASE)
4326		txtype |= TX_CI;
4327	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4328	*vpp = ZTOV(zp);
4329
4330	zfs_acl_ids_free(&acl_ids);
4331
4332	dmu_tx_commit(tx);
4333
4334	getnewvnode_drop_reserve();
4335
4336	zfs_dirent_unlock(dl);
4337
4338	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4339		zil_commit(zilog, 0);
4340
4341	ZFS_EXIT(zfsvfs);
4342	return (error);
4343}
4344
4345/*
4346 * Return, in the buffer contained in the provided uio structure,
4347 * the symbolic path referred to by vp.
4348 *
4349 *	IN:	vp	- vnode of symbolic link.
4350 *		uio	- structure to contain the link path.
4351 *		cr	- credentials of caller.
4352 *		ct	- caller context
4353 *
4354 *	OUT:	uio	- structure containing the link path.
4355 *
4356 *	RETURN:	0 on success, error code on failure.
4357 *
4358 * Timestamps:
4359 *	vp - atime updated
4360 */
4361/* ARGSUSED */
4362static int
4363zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4364{
4365	znode_t		*zp = VTOZ(vp);
4366	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4367	int		error;
4368
4369	ZFS_ENTER(zfsvfs);
4370	ZFS_VERIFY_ZP(zp);
4371
4372	mutex_enter(&zp->z_lock);
4373	if (zp->z_is_sa)
4374		error = sa_lookup_uio(zp->z_sa_hdl,
4375		    SA_ZPL_SYMLINK(zfsvfs), uio);
4376	else
4377		error = zfs_sa_readlink(zp, uio);
4378	mutex_exit(&zp->z_lock);
4379
4380	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4381
4382	ZFS_EXIT(zfsvfs);
4383	return (error);
4384}
4385
4386/*
4387 * Insert a new entry into directory tdvp referencing svp.
4388 *
4389 *	IN:	tdvp	- Directory to contain new entry.
4390 *		svp	- vnode of new entry.
4391 *		name	- name of new entry.
4392 *		cr	- credentials of caller.
4393 *		ct	- caller context
4394 *
4395 *	RETURN:	0 on success, error code on failure.
4396 *
4397 * Timestamps:
4398 *	tdvp - ctime|mtime updated
4399 *	 svp - ctime updated
4400 */
4401/* ARGSUSED */
4402static int
4403zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4404    caller_context_t *ct, int flags)
4405{
4406	znode_t		*dzp = VTOZ(tdvp);
4407	znode_t		*tzp, *szp;
4408	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4409	zilog_t		*zilog;
4410	zfs_dirlock_t	*dl;
4411	dmu_tx_t	*tx;
4412	vnode_t		*realvp;
4413	int		error;
4414	int		zf = ZNEW;
4415	uint64_t	parent;
4416	uid_t		owner;
4417	boolean_t	waited = B_FALSE;
4418
4419	ASSERT(tdvp->v_type == VDIR);
4420
4421	ZFS_ENTER(zfsvfs);
4422	ZFS_VERIFY_ZP(dzp);
4423	zilog = zfsvfs->z_log;
4424
4425	if (VOP_REALVP(svp, &realvp, ct) == 0)
4426		svp = realvp;
4427
4428	/*
4429	 * POSIX dictates that we return EPERM here.
4430	 * Better choices include ENOTSUP or EISDIR.
4431	 */
4432	if (svp->v_type == VDIR) {
4433		ZFS_EXIT(zfsvfs);
4434		return (SET_ERROR(EPERM));
4435	}
4436
4437	szp = VTOZ(svp);
4438	ZFS_VERIFY_ZP(szp);
4439
4440	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4441		ZFS_EXIT(zfsvfs);
4442		return (SET_ERROR(EPERM));
4443	}
4444
4445	/*
4446	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
4447	 * ctldir appear to have the same v_vfsp.
4448	 */
4449	if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
4450		ZFS_EXIT(zfsvfs);
4451		return (SET_ERROR(EXDEV));
4452	}
4453
4454	/* Prevent links to .zfs/shares files */
4455
4456	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4457	    &parent, sizeof (uint64_t))) != 0) {
4458		ZFS_EXIT(zfsvfs);
4459		return (error);
4460	}
4461	if (parent == zfsvfs->z_shares_dir) {
4462		ZFS_EXIT(zfsvfs);
4463		return (SET_ERROR(EPERM));
4464	}
4465
4466	if (zfsvfs->z_utf8 && u8_validate(name,
4467	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4468		ZFS_EXIT(zfsvfs);
4469		return (SET_ERROR(EILSEQ));
4470	}
4471	if (flags & FIGNORECASE)
4472		zf |= ZCILOOK;
4473
4474	/*
4475	 * We do not support links between attributes and non-attributes
4476	 * because of the potential security risk of creating links
4477	 * into "normal" file space in order to circumvent restrictions
4478	 * imposed in attribute space.
4479	 */
4480	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4481		ZFS_EXIT(zfsvfs);
4482		return (SET_ERROR(EINVAL));
4483	}
4484
4485
4486	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4487	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4488		ZFS_EXIT(zfsvfs);
4489		return (SET_ERROR(EPERM));
4490	}
4491
4492	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4493		ZFS_EXIT(zfsvfs);
4494		return (error);
4495	}
4496
4497top:
4498	/*
4499	 * Attempt to lock directory; fail if entry already exists.
4500	 */
4501	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4502	if (error) {
4503		ZFS_EXIT(zfsvfs);
4504		return (error);
4505	}
4506
4507	tx = dmu_tx_create(zfsvfs->z_os);
4508	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4509	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4510	zfs_sa_upgrade_txholds(tx, szp);
4511	zfs_sa_upgrade_txholds(tx, dzp);
4512	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4513	if (error) {
4514		zfs_dirent_unlock(dl);
4515		if (error == ERESTART) {
4516			waited = B_TRUE;
4517			dmu_tx_wait(tx);
4518			dmu_tx_abort(tx);
4519			goto top;
4520		}
4521		dmu_tx_abort(tx);
4522		ZFS_EXIT(zfsvfs);
4523		return (error);
4524	}
4525
4526	error = zfs_link_create(dl, szp, tx, 0);
4527
4528	if (error == 0) {
4529		uint64_t txtype = TX_LINK;
4530		if (flags & FIGNORECASE)
4531			txtype |= TX_CI;
4532		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4533	}
4534
4535	dmu_tx_commit(tx);
4536
4537	zfs_dirent_unlock(dl);
4538
4539	if (error == 0) {
4540		vnevent_link(svp, ct);
4541	}
4542
4543	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4544		zil_commit(zilog, 0);
4545
4546	ZFS_EXIT(zfsvfs);
4547	return (error);
4548}
4549
4550#ifdef illumos
4551/*
4552 * zfs_null_putapage() is used when the file system has been force
4553 * unmounted. It just drops the pages.
4554 */
4555/* ARGSUSED */
4556static int
4557zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4558    size_t *lenp, int flags, cred_t *cr)
4559{
4560	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4561	return (0);
4562}
4563
4564/*
4565 * Push a page out to disk, klustering if possible.
4566 *
4567 *	IN:	vp	- file to push page to.
4568 *		pp	- page to push.
4569 *		flags	- additional flags.
4570 *		cr	- credentials of caller.
4571 *
4572 *	OUT:	offp	- start of range pushed.
4573 *		lenp	- len of range pushed.
4574 *
4575 *	RETURN:	0 on success, error code on failure.
4576 *
4577 * NOTE: callers must have locked the page to be pushed.  On
4578 * exit, the page (and all other pages in the kluster) must be
4579 * unlocked.
4580 */
4581/* ARGSUSED */
4582static int
4583zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4584    size_t *lenp, int flags, cred_t *cr)
4585{
4586	znode_t		*zp = VTOZ(vp);
4587	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4588	dmu_tx_t	*tx;
4589	u_offset_t	off, koff;
4590	size_t		len, klen;
4591	int		err;
4592
4593	off = pp->p_offset;
4594	len = PAGESIZE;
4595	/*
4596	 * If our blocksize is bigger than the page size, try to kluster
4597	 * multiple pages so that we write a full block (thus avoiding
4598	 * a read-modify-write).
4599	 */
4600	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4601		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4602		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4603		ASSERT(koff <= zp->z_size);
4604		if (koff + klen > zp->z_size)
4605			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4606		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4607	}
4608	ASSERT3U(btop(len), ==, btopr(len));
4609
4610	/*
4611	 * Can't push pages past end-of-file.
4612	 */
4613	if (off >= zp->z_size) {
4614		/* ignore all pages */
4615		err = 0;
4616		goto out;
4617	} else if (off + len > zp->z_size) {
4618		int npages = btopr(zp->z_size - off);
4619		page_t *trunc;
4620
4621		page_list_break(&pp, &trunc, npages);
4622		/* ignore pages past end of file */
4623		if (trunc)
4624			pvn_write_done(trunc, flags);
4625		len = zp->z_size - off;
4626	}
4627
4628	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4629	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4630		err = SET_ERROR(EDQUOT);
4631		goto out;
4632	}
4633	tx = dmu_tx_create(zfsvfs->z_os);
4634	dmu_tx_hold_write(tx, zp->z_id, off, len);
4635
4636	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4637	zfs_sa_upgrade_txholds(tx, zp);
4638	err = dmu_tx_assign(tx, TXG_WAIT);
4639	if (err != 0) {
4640		dmu_tx_abort(tx);
4641		goto out;
4642	}
4643
4644	if (zp->z_blksz <= PAGESIZE) {
4645		caddr_t va = zfs_map_page(pp, S_READ);
4646		ASSERT3U(len, <=, PAGESIZE);
4647		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4648		zfs_unmap_page(pp, va);
4649	} else {
4650		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4651	}
4652
4653	if (err == 0) {
4654		uint64_t mtime[2], ctime[2];
4655		sa_bulk_attr_t bulk[3];
4656		int count = 0;
4657
4658		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4659		    &mtime, 16);
4660		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4661		    &ctime, 16);
4662		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4663		    &zp->z_pflags, 8);
4664		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4665		    B_TRUE);
4666		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4667	}
4668	dmu_tx_commit(tx);
4669
4670out:
4671	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4672	if (offp)
4673		*offp = off;
4674	if (lenp)
4675		*lenp = len;
4676
4677	return (err);
4678}
4679
4680/*
4681 * Copy the portion of the file indicated from pages into the file.
4682 * The pages are stored in a page list attached to the files vnode.
4683 *
4684 *	IN:	vp	- vnode of file to push page data to.
4685 *		off	- position in file to put data.
4686 *		len	- amount of data to write.
4687 *		flags	- flags to control the operation.
4688 *		cr	- credentials of caller.
4689 *		ct	- caller context.
4690 *
4691 *	RETURN:	0 on success, error code on failure.
4692 *
4693 * Timestamps:
4694 *	vp - ctime|mtime updated
4695 */
4696/*ARGSUSED*/
4697static int
4698zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4699    caller_context_t *ct)
4700{
4701	znode_t		*zp = VTOZ(vp);
4702	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4703	page_t		*pp;
4704	size_t		io_len;
4705	u_offset_t	io_off;
4706	uint_t		blksz;
4707	rl_t		*rl;
4708	int		error = 0;
4709
4710	ZFS_ENTER(zfsvfs);
4711	ZFS_VERIFY_ZP(zp);
4712
4713	/*
4714	 * Align this request to the file block size in case we kluster.
4715	 * XXX - this can result in pretty aggresive locking, which can
4716	 * impact simultanious read/write access.  One option might be
4717	 * to break up long requests (len == 0) into block-by-block
4718	 * operations to get narrower locking.
4719	 */
4720	blksz = zp->z_blksz;
4721	if (ISP2(blksz))
4722		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4723	else
4724		io_off = 0;
4725	if (len > 0 && ISP2(blksz))
4726		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4727	else
4728		io_len = 0;
4729
4730	if (io_len == 0) {
4731		/*
4732		 * Search the entire vp list for pages >= io_off.
4733		 */
4734		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4735		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4736		goto out;
4737	}
4738	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4739
4740	if (off > zp->z_size) {
4741		/* past end of file */
4742		zfs_range_unlock(rl);
4743		ZFS_EXIT(zfsvfs);
4744		return (0);
4745	}
4746
4747	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4748
4749	for (off = io_off; io_off < off + len; io_off += io_len) {
4750		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4751			pp = page_lookup(vp, io_off,
4752			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4753		} else {
4754			pp = page_lookup_nowait(vp, io_off,
4755			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4756		}
4757
4758		if (pp != NULL && pvn_getdirty(pp, flags)) {
4759			int err;
4760
4761			/*
4762			 * Found a dirty page to push
4763			 */
4764			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4765			if (err)
4766				error = err;
4767		} else {
4768			io_len = PAGESIZE;
4769		}
4770	}
4771out:
4772	zfs_range_unlock(rl);
4773	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4774		zil_commit(zfsvfs->z_log, zp->z_id);
4775	ZFS_EXIT(zfsvfs);
4776	return (error);
4777}
4778#endif	/* illumos */
4779
4780/*ARGSUSED*/
4781void
4782zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4783{
4784	znode_t	*zp = VTOZ(vp);
4785	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4786	int error;
4787
4788	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4789	if (zp->z_sa_hdl == NULL) {
4790		/*
4791		 * The fs has been unmounted, or we did a
4792		 * suspend/resume and this file no longer exists.
4793		 */
4794		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4795		vrecycle(vp);
4796		return;
4797	}
4798
4799	mutex_enter(&zp->z_lock);
4800	if (zp->z_unlinked) {
4801		/*
4802		 * Fast path to recycle a vnode of a removed file.
4803		 */
4804		mutex_exit(&zp->z_lock);
4805		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4806		vrecycle(vp);
4807		return;
4808	}
4809	mutex_exit(&zp->z_lock);
4810
4811	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4812		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4813
4814		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4815		zfs_sa_upgrade_txholds(tx, zp);
4816		error = dmu_tx_assign(tx, TXG_WAIT);
4817		if (error) {
4818			dmu_tx_abort(tx);
4819		} else {
4820			mutex_enter(&zp->z_lock);
4821			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4822			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4823			zp->z_atime_dirty = 0;
4824			mutex_exit(&zp->z_lock);
4825			dmu_tx_commit(tx);
4826		}
4827	}
4828	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4829}
4830
4831#ifdef illumos
4832/*
4833 * Bounds-check the seek operation.
4834 *
4835 *	IN:	vp	- vnode seeking within
4836 *		ooff	- old file offset
4837 *		noffp	- pointer to new file offset
4838 *		ct	- caller context
4839 *
4840 *	RETURN:	0 on success, EINVAL if new offset invalid.
4841 */
4842/* ARGSUSED */
4843static int
4844zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4845    caller_context_t *ct)
4846{
4847	if (vp->v_type == VDIR)
4848		return (0);
4849	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4850}
4851
4852/*
4853 * Pre-filter the generic locking function to trap attempts to place
4854 * a mandatory lock on a memory mapped file.
4855 */
4856static int
4857zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4858    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4859{
4860	znode_t *zp = VTOZ(vp);
4861	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4862
4863	ZFS_ENTER(zfsvfs);
4864	ZFS_VERIFY_ZP(zp);
4865
4866	/*
4867	 * We are following the UFS semantics with respect to mapcnt
4868	 * here: If we see that the file is mapped already, then we will
4869	 * return an error, but we don't worry about races between this
4870	 * function and zfs_map().
4871	 */
4872	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4873		ZFS_EXIT(zfsvfs);
4874		return (SET_ERROR(EAGAIN));
4875	}
4876	ZFS_EXIT(zfsvfs);
4877	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4878}
4879
4880/*
4881 * If we can't find a page in the cache, we will create a new page
4882 * and fill it with file data.  For efficiency, we may try to fill
4883 * multiple pages at once (klustering) to fill up the supplied page
4884 * list.  Note that the pages to be filled are held with an exclusive
4885 * lock to prevent access by other threads while they are being filled.
4886 */
4887static int
4888zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4889    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4890{
4891	znode_t *zp = VTOZ(vp);
4892	page_t *pp, *cur_pp;
4893	objset_t *os = zp->z_zfsvfs->z_os;
4894	u_offset_t io_off, total;
4895	size_t io_len;
4896	int err;
4897
4898	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4899		/*
4900		 * We only have a single page, don't bother klustering
4901		 */
4902		io_off = off;
4903		io_len = PAGESIZE;
4904		pp = page_create_va(vp, io_off, io_len,
4905		    PG_EXCL | PG_WAIT, seg, addr);
4906	} else {
4907		/*
4908		 * Try to find enough pages to fill the page list
4909		 */
4910		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4911		    &io_len, off, plsz, 0);
4912	}
4913	if (pp == NULL) {
4914		/*
4915		 * The page already exists, nothing to do here.
4916		 */
4917		*pl = NULL;
4918		return (0);
4919	}
4920
4921	/*
4922	 * Fill the pages in the kluster.
4923	 */
4924	cur_pp = pp;
4925	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4926		caddr_t va;
4927
4928		ASSERT3U(io_off, ==, cur_pp->p_offset);
4929		va = zfs_map_page(cur_pp, S_WRITE);
4930		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4931		    DMU_READ_PREFETCH);
4932		zfs_unmap_page(cur_pp, va);
4933		if (err) {
4934			/* On error, toss the entire kluster */
4935			pvn_read_done(pp, B_ERROR);
4936			/* convert checksum errors into IO errors */
4937			if (err == ECKSUM)
4938				err = SET_ERROR(EIO);
4939			return (err);
4940		}
4941		cur_pp = cur_pp->p_next;
4942	}
4943
4944	/*
4945	 * Fill in the page list array from the kluster starting
4946	 * from the desired offset `off'.
4947	 * NOTE: the page list will always be null terminated.
4948	 */
4949	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4950	ASSERT(pl == NULL || (*pl)->p_offset == off);
4951
4952	return (0);
4953}
4954
4955/*
4956 * Return pointers to the pages for the file region [off, off + len]
4957 * in the pl array.  If plsz is greater than len, this function may
4958 * also return page pointers from after the specified region
4959 * (i.e. the region [off, off + plsz]).  These additional pages are
4960 * only returned if they are already in the cache, or were created as
4961 * part of a klustered read.
4962 *
4963 *	IN:	vp	- vnode of file to get data from.
4964 *		off	- position in file to get data from.
4965 *		len	- amount of data to retrieve.
4966 *		plsz	- length of provided page list.
4967 *		seg	- segment to obtain pages for.
4968 *		addr	- virtual address of fault.
4969 *		rw	- mode of created pages.
4970 *		cr	- credentials of caller.
4971 *		ct	- caller context.
4972 *
4973 *	OUT:	protp	- protection mode of created pages.
4974 *		pl	- list of pages created.
4975 *
4976 *	RETURN:	0 on success, error code on failure.
4977 *
4978 * Timestamps:
4979 *	vp - atime updated
4980 */
4981/* ARGSUSED */
4982static int
4983zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4984    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4985    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4986{
4987	znode_t		*zp = VTOZ(vp);
4988	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4989	page_t		**pl0 = pl;
4990	int		err = 0;
4991
4992	/* we do our own caching, faultahead is unnecessary */
4993	if (pl == NULL)
4994		return (0);
4995	else if (len > plsz)
4996		len = plsz;
4997	else
4998		len = P2ROUNDUP(len, PAGESIZE);
4999	ASSERT(plsz >= len);
5000
5001	ZFS_ENTER(zfsvfs);
5002	ZFS_VERIFY_ZP(zp);
5003
5004	if (protp)
5005		*protp = PROT_ALL;
5006
5007	/*
5008	 * Loop through the requested range [off, off + len) looking
5009	 * for pages.  If we don't find a page, we will need to create
5010	 * a new page and fill it with data from the file.
5011	 */
5012	while (len > 0) {
5013		if (*pl = page_lookup(vp, off, SE_SHARED))
5014			*(pl+1) = NULL;
5015		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
5016			goto out;
5017		while (*pl) {
5018			ASSERT3U((*pl)->p_offset, ==, off);
5019			off += PAGESIZE;
5020			addr += PAGESIZE;
5021			if (len > 0) {
5022				ASSERT3U(len, >=, PAGESIZE);
5023				len -= PAGESIZE;
5024			}
5025			ASSERT3U(plsz, >=, PAGESIZE);
5026			plsz -= PAGESIZE;
5027			pl++;
5028		}
5029	}
5030
5031	/*
5032	 * Fill out the page array with any pages already in the cache.
5033	 */
5034	while (plsz > 0 &&
5035	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
5036			off += PAGESIZE;
5037			plsz -= PAGESIZE;
5038	}
5039out:
5040	if (err) {
5041		/*
5042		 * Release any pages we have previously locked.
5043		 */
5044		while (pl > pl0)
5045			page_unlock(*--pl);
5046	} else {
5047		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5048	}
5049
5050	*pl = NULL;
5051
5052	ZFS_EXIT(zfsvfs);
5053	return (err);
5054}
5055
5056/*
5057 * Request a memory map for a section of a file.  This code interacts
5058 * with common code and the VM system as follows:
5059 *
5060 * - common code calls mmap(), which ends up in smmap_common()
5061 * - this calls VOP_MAP(), which takes you into (say) zfs
5062 * - zfs_map() calls as_map(), passing segvn_create() as the callback
5063 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
5064 * - zfs_addmap() updates z_mapcnt
5065 */
5066/*ARGSUSED*/
5067static int
5068zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
5069    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
5070    caller_context_t *ct)
5071{
5072	znode_t *zp = VTOZ(vp);
5073	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5074	segvn_crargs_t	vn_a;
5075	int		error;
5076
5077	ZFS_ENTER(zfsvfs);
5078	ZFS_VERIFY_ZP(zp);
5079
5080	if ((prot & PROT_WRITE) && (zp->z_pflags &
5081	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
5082		ZFS_EXIT(zfsvfs);
5083		return (SET_ERROR(EPERM));
5084	}
5085
5086	if ((prot & (PROT_READ | PROT_EXEC)) &&
5087	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
5088		ZFS_EXIT(zfsvfs);
5089		return (SET_ERROR(EACCES));
5090	}
5091
5092	if (vp->v_flag & VNOMAP) {
5093		ZFS_EXIT(zfsvfs);
5094		return (SET_ERROR(ENOSYS));
5095	}
5096
5097	if (off < 0 || len > MAXOFFSET_T - off) {
5098		ZFS_EXIT(zfsvfs);
5099		return (SET_ERROR(ENXIO));
5100	}
5101
5102	if (vp->v_type != VREG) {
5103		ZFS_EXIT(zfsvfs);
5104		return (SET_ERROR(ENODEV));
5105	}
5106
5107	/*
5108	 * If file is locked, disallow mapping.
5109	 */
5110	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
5111		ZFS_EXIT(zfsvfs);
5112		return (SET_ERROR(EAGAIN));
5113	}
5114
5115	as_rangelock(as);
5116	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5117	if (error != 0) {
5118		as_rangeunlock(as);
5119		ZFS_EXIT(zfsvfs);
5120		return (error);
5121	}
5122
5123	vn_a.vp = vp;
5124	vn_a.offset = (u_offset_t)off;
5125	vn_a.type = flags & MAP_TYPE;
5126	vn_a.prot = prot;
5127	vn_a.maxprot = maxprot;
5128	vn_a.cred = cr;
5129	vn_a.amp = NULL;
5130	vn_a.flags = flags & ~MAP_TYPE;
5131	vn_a.szc = 0;
5132	vn_a.lgrp_mem_policy_flags = 0;
5133
5134	error = as_map(as, *addrp, len, segvn_create, &vn_a);
5135
5136	as_rangeunlock(as);
5137	ZFS_EXIT(zfsvfs);
5138	return (error);
5139}
5140
5141/* ARGSUSED */
5142static int
5143zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5144    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
5145    caller_context_t *ct)
5146{
5147	uint64_t pages = btopr(len);
5148
5149	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
5150	return (0);
5151}
5152
5153/*
5154 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
5155 * more accurate mtime for the associated file.  Since we don't have a way of
5156 * detecting when the data was actually modified, we have to resort to
5157 * heuristics.  If an explicit msync() is done, then we mark the mtime when the
5158 * last page is pushed.  The problem occurs when the msync() call is omitted,
5159 * which by far the most common case:
5160 *
5161 *	open()
5162 *	mmap()
5163 *	<modify memory>
5164 *	munmap()
5165 *	close()
5166 *	<time lapse>
5167 *	putpage() via fsflush
5168 *
5169 * If we wait until fsflush to come along, we can have a modification time that
5170 * is some arbitrary point in the future.  In order to prevent this in the
5171 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
5172 * torn down.
5173 */
5174/* ARGSUSED */
5175static int
5176zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5177    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
5178    caller_context_t *ct)
5179{
5180	uint64_t pages = btopr(len);
5181
5182	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
5183	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
5184
5185	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
5186	    vn_has_cached_data(vp))
5187		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
5188
5189	return (0);
5190}
5191
5192/*
5193 * Free or allocate space in a file.  Currently, this function only
5194 * supports the `F_FREESP' command.  However, this command is somewhat
5195 * misnamed, as its functionality includes the ability to allocate as
5196 * well as free space.
5197 *
5198 *	IN:	vp	- vnode of file to free data in.
5199 *		cmd	- action to take (only F_FREESP supported).
5200 *		bfp	- section of file to free/alloc.
5201 *		flag	- current file open mode flags.
5202 *		offset	- current file offset.
5203 *		cr	- credentials of caller [UNUSED].
5204 *		ct	- caller context.
5205 *
5206 *	RETURN:	0 on success, error code on failure.
5207 *
5208 * Timestamps:
5209 *	vp - ctime|mtime updated
5210 */
5211/* ARGSUSED */
5212static int
5213zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
5214    offset_t offset, cred_t *cr, caller_context_t *ct)
5215{
5216	znode_t		*zp = VTOZ(vp);
5217	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5218	uint64_t	off, len;
5219	int		error;
5220
5221	ZFS_ENTER(zfsvfs);
5222	ZFS_VERIFY_ZP(zp);
5223
5224	if (cmd != F_FREESP) {
5225		ZFS_EXIT(zfsvfs);
5226		return (SET_ERROR(EINVAL));
5227	}
5228
5229	/*
5230	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
5231	 * callers might not be able to detect properly that we are read-only,
5232	 * so check it explicitly here.
5233	 */
5234	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
5235		ZFS_EXIT(zfsvfs);
5236		return (SET_ERROR(EROFS));
5237	}
5238
5239	if (error = convoff(vp, bfp, 0, offset)) {
5240		ZFS_EXIT(zfsvfs);
5241		return (error);
5242	}
5243
5244	if (bfp->l_len < 0) {
5245		ZFS_EXIT(zfsvfs);
5246		return (SET_ERROR(EINVAL));
5247	}
5248
5249	off = bfp->l_start;
5250	len = bfp->l_len; /* 0 means from off to end of file */
5251
5252	error = zfs_freesp(zp, off, len, flag, TRUE);
5253
5254	ZFS_EXIT(zfsvfs);
5255	return (error);
5256}
5257#endif	/* illumos */
5258
5259CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
5260CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
5261
5262/*ARGSUSED*/
5263static int
5264zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
5265{
5266	znode_t		*zp = VTOZ(vp);
5267	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5268	uint32_t	gen;
5269	uint64_t	gen64;
5270	uint64_t	object = zp->z_id;
5271	zfid_short_t	*zfid;
5272	int		size, i, error;
5273
5274	ZFS_ENTER(zfsvfs);
5275	ZFS_VERIFY_ZP(zp);
5276
5277	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
5278	    &gen64, sizeof (uint64_t))) != 0) {
5279		ZFS_EXIT(zfsvfs);
5280		return (error);
5281	}
5282
5283	gen = (uint32_t)gen64;
5284
5285	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
5286
5287#ifdef illumos
5288	if (fidp->fid_len < size) {
5289		fidp->fid_len = size;
5290		ZFS_EXIT(zfsvfs);
5291		return (SET_ERROR(ENOSPC));
5292	}
5293#else
5294	fidp->fid_len = size;
5295#endif
5296
5297	zfid = (zfid_short_t *)fidp;
5298
5299	zfid->zf_len = size;
5300
5301	for (i = 0; i < sizeof (zfid->zf_object); i++)
5302		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5303
5304	/* Must have a non-zero generation number to distinguish from .zfs */
5305	if (gen == 0)
5306		gen = 1;
5307	for (i = 0; i < sizeof (zfid->zf_gen); i++)
5308		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5309
5310	if (size == LONG_FID_LEN) {
5311		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
5312		zfid_long_t	*zlfid;
5313
5314		zlfid = (zfid_long_t *)fidp;
5315
5316		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
5317			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
5318
5319		/* XXX - this should be the generation number for the objset */
5320		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
5321			zlfid->zf_setgen[i] = 0;
5322	}
5323
5324	ZFS_EXIT(zfsvfs);
5325	return (0);
5326}
5327
5328static int
5329zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5330    caller_context_t *ct)
5331{
5332	znode_t		*zp, *xzp;
5333	zfsvfs_t	*zfsvfs;
5334	zfs_dirlock_t	*dl;
5335	int		error;
5336
5337	switch (cmd) {
5338	case _PC_LINK_MAX:
5339		*valp = INT_MAX;
5340		return (0);
5341
5342	case _PC_FILESIZEBITS:
5343		*valp = 64;
5344		return (0);
5345#ifdef illumos
5346	case _PC_XATTR_EXISTS:
5347		zp = VTOZ(vp);
5348		zfsvfs = zp->z_zfsvfs;
5349		ZFS_ENTER(zfsvfs);
5350		ZFS_VERIFY_ZP(zp);
5351		*valp = 0;
5352		error = zfs_dirent_lock(&dl, zp, "", &xzp,
5353		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
5354		if (error == 0) {
5355			zfs_dirent_unlock(dl);
5356			if (!zfs_dirempty(xzp))
5357				*valp = 1;
5358			VN_RELE(ZTOV(xzp));
5359		} else if (error == ENOENT) {
5360			/*
5361			 * If there aren't extended attributes, it's the
5362			 * same as having zero of them.
5363			 */
5364			error = 0;
5365		}
5366		ZFS_EXIT(zfsvfs);
5367		return (error);
5368
5369	case _PC_SATTR_ENABLED:
5370	case _PC_SATTR_EXISTS:
5371		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5372		    (vp->v_type == VREG || vp->v_type == VDIR);
5373		return (0);
5374
5375	case _PC_ACCESS_FILTERING:
5376		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5377		    vp->v_type == VDIR;
5378		return (0);
5379
5380	case _PC_ACL_ENABLED:
5381		*valp = _ACL_ACE_ENABLED;
5382		return (0);
5383#endif	/* illumos */
5384	case _PC_MIN_HOLE_SIZE:
5385		*valp = (int)SPA_MINBLOCKSIZE;
5386		return (0);
5387#ifdef illumos
5388	case _PC_TIMESTAMP_RESOLUTION:
5389		/* nanosecond timestamp resolution */
5390		*valp = 1L;
5391		return (0);
5392#endif
5393	case _PC_ACL_EXTENDED:
5394		*valp = 0;
5395		return (0);
5396
5397	case _PC_ACL_NFS4:
5398		*valp = 1;
5399		return (0);
5400
5401	case _PC_ACL_PATH_MAX:
5402		*valp = ACL_MAX_ENTRIES;
5403		return (0);
5404
5405	default:
5406		return (EOPNOTSUPP);
5407	}
5408}
5409
5410/*ARGSUSED*/
5411static int
5412zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5413    caller_context_t *ct)
5414{
5415	znode_t *zp = VTOZ(vp);
5416	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5417	int error;
5418	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5419
5420	ZFS_ENTER(zfsvfs);
5421	ZFS_VERIFY_ZP(zp);
5422	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5423	ZFS_EXIT(zfsvfs);
5424
5425	return (error);
5426}
5427
5428/*ARGSUSED*/
5429int
5430zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5431    caller_context_t *ct)
5432{
5433	znode_t *zp = VTOZ(vp);
5434	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5435	int error;
5436	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5437	zilog_t	*zilog = zfsvfs->z_log;
5438
5439	ZFS_ENTER(zfsvfs);
5440	ZFS_VERIFY_ZP(zp);
5441
5442	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5443
5444	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5445		zil_commit(zilog, 0);
5446
5447	ZFS_EXIT(zfsvfs);
5448	return (error);
5449}
5450
5451#ifdef illumos
/*
 * Lower bound on the block size used when deciding whether a read is
 * large enough to loan out an arcbuf.  This must be a power of 2.
 */
int zcr_blksz_min = 1024;	/* 1K (1 << 10) */
/*
 * Upper bound on that block size.  If set to less than the file block
 * size, an arcbuf may be loaned out for a partial block read.  This
 * must be a power of 2.
 */
int zcr_blksz_max = 131072;	/* 128K (1 << 17) */
5462
5463/*ARGSUSED*/
5464static int
5465zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5466    caller_context_t *ct)
5467{
5468	znode_t	*zp = VTOZ(vp);
5469	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5470	int max_blksz = zfsvfs->z_max_blksz;
5471	uio_t *uio = &xuio->xu_uio;
5472	ssize_t size = uio->uio_resid;
5473	offset_t offset = uio->uio_loffset;
5474	int blksz;
5475	int fullblk, i;
5476	arc_buf_t *abuf;
5477	ssize_t maxsize;
5478	int preamble, postamble;
5479
5480	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5481		return (SET_ERROR(EINVAL));
5482
5483	ZFS_ENTER(zfsvfs);
5484	ZFS_VERIFY_ZP(zp);
5485	switch (ioflag) {
5486	case UIO_WRITE:
5487		/*
5488		 * Loan out an arc_buf for write if write size is bigger than
5489		 * max_blksz, and the file's block size is also max_blksz.
5490		 */
5491		blksz = max_blksz;
5492		if (size < blksz || zp->z_blksz != blksz) {
5493			ZFS_EXIT(zfsvfs);
5494			return (SET_ERROR(EINVAL));
5495		}
5496		/*
5497		 * Caller requests buffers for write before knowing where the
5498		 * write offset might be (e.g. NFS TCP write).
5499		 */
5500		if (offset == -1) {
5501			preamble = 0;
5502		} else {
5503			preamble = P2PHASE(offset, blksz);
5504			if (preamble) {
5505				preamble = blksz - preamble;
5506				size -= preamble;
5507			}
5508		}
5509
5510		postamble = P2PHASE(size, blksz);
5511		size -= postamble;
5512
5513		fullblk = size / blksz;
5514		(void) dmu_xuio_init(xuio,
5515		    (preamble != 0) + fullblk + (postamble != 0));
5516		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5517		    int, postamble, int,
5518		    (preamble != 0) + fullblk + (postamble != 0));
5519
5520		/*
5521		 * Have to fix iov base/len for partial buffers.  They
5522		 * currently represent full arc_buf's.
5523		 */
5524		if (preamble) {
5525			/* data begins in the middle of the arc_buf */
5526			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5527			    blksz);
5528			ASSERT(abuf);
5529			(void) dmu_xuio_add(xuio, abuf,
5530			    blksz - preamble, preamble);
5531		}
5532
5533		for (i = 0; i < fullblk; i++) {
5534			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5535			    blksz);
5536			ASSERT(abuf);
5537			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5538		}
5539
5540		if (postamble) {
5541			/* data ends in the middle of the arc_buf */
5542			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5543			    blksz);
5544			ASSERT(abuf);
5545			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5546		}
5547		break;
5548	case UIO_READ:
5549		/*
5550		 * Loan out an arc_buf for read if the read size is larger than
5551		 * the current file block size.  Block alignment is not
5552		 * considered.  Partial arc_buf will be loaned out for read.
5553		 */
5554		blksz = zp->z_blksz;
5555		if (blksz < zcr_blksz_min)
5556			blksz = zcr_blksz_min;
5557		if (blksz > zcr_blksz_max)
5558			blksz = zcr_blksz_max;
5559		/* avoid potential complexity of dealing with it */
5560		if (blksz > max_blksz) {
5561			ZFS_EXIT(zfsvfs);
5562			return (SET_ERROR(EINVAL));
5563		}
5564
5565		maxsize = zp->z_size - uio->uio_loffset;
5566		if (size > maxsize)
5567			size = maxsize;
5568
5569		if (size < blksz || vn_has_cached_data(vp)) {
5570			ZFS_EXIT(zfsvfs);
5571			return (SET_ERROR(EINVAL));
5572		}
5573		break;
5574	default:
5575		ZFS_EXIT(zfsvfs);
5576		return (SET_ERROR(EINVAL));
5577	}
5578
5579	uio->uio_extflg = UIO_XUIO;
5580	XUIO_XUZC_RW(xuio) = ioflag;
5581	ZFS_EXIT(zfsvfs);
5582	return (0);
5583}
5584
5585/*ARGSUSED*/
5586static int
5587zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5588{
5589	int i;
5590	arc_buf_t *abuf;
5591	int ioflag = XUIO_XUZC_RW(xuio);
5592
5593	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5594
5595	i = dmu_xuio_cnt(xuio);
5596	while (i-- > 0) {
5597		abuf = dmu_xuio_arcbuf(xuio, i);
5598		/*
5599		 * if abuf == NULL, it must be a write buffer
5600		 * that has been returned in zfs_write().
5601		 */
5602		if (abuf)
5603			dmu_return_arcbuf(abuf);
5604		ASSERT(abuf || ioflag == UIO_WRITE);
5605	}
5606
5607	dmu_xuio_fini(xuio);
5608	return (0);
5609}
5610
5611/*
5612 * Predeclare these here so that the compiler assumes that
5613 * this is an "old style" function declaration that does
5614 * not include arguments => we won't get type mismatch errors
5615 * in the initializations that follow.
5616 */
5617static int zfs_inval();
5618static int zfs_isdir();
5619
5620static int
5621zfs_inval()
5622{
5623	return (SET_ERROR(EINVAL));
5624}
5625
5626static int
5627zfs_isdir()
5628{
5629	return (SET_ERROR(EISDIR));
5630}
5631/*
5632 * Directory vnode operations template
5633 */
5634vnodeops_t *zfs_dvnodeops;
5635const fs_operation_def_t zfs_dvnodeops_template[] = {
5636	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5637	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5638	VOPNAME_READ,		{ .error = zfs_isdir },
5639	VOPNAME_WRITE,		{ .error = zfs_isdir },
5640	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5641	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5642	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5643	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5644	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5645	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5646	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5647	VOPNAME_LINK,		{ .vop_link = zfs_link },
5648	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5649	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5650	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5651	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5652	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5653	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5654	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5655	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5656	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5657	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5658	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5659	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5660	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5661	NULL,			NULL
5662};
5663
5664/*
5665 * Regular file vnode operations template
5666 */
5667vnodeops_t *zfs_fvnodeops;
5668const fs_operation_def_t zfs_fvnodeops_template[] = {
5669	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5670	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5671	VOPNAME_READ,		{ .vop_read = zfs_read },
5672	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5673	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5674	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5675	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5676	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5677	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5678	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5679	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5680	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5681	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5682	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5683	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5684	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5685	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5686	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5687	VOPNAME_MAP,		{ .vop_map = zfs_map },
5688	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5689	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5690	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5691	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5692	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5693	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5694	VOPNAME_REQZCBUF,	{ .vop_reqzcbuf = zfs_reqzcbuf },
5695	VOPNAME_RETZCBUF,	{ .vop_retzcbuf = zfs_retzcbuf },
5696	NULL,			NULL
5697};
5698
5699/*
5700 * Symbolic link vnode operations template
5701 */
5702vnodeops_t *zfs_symvnodeops;
5703const fs_operation_def_t zfs_symvnodeops_template[] = {
5704	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5705	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5706	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5707	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5708	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5709	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5710	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5711	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5712	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5713	NULL,			NULL
5714};
5715
5716/*
5717 * special share hidden files vnode operations template
5718 */
5719vnodeops_t *zfs_sharevnodeops;
5720const fs_operation_def_t zfs_sharevnodeops_template[] = {
5721	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5722	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5723	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5724	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5725	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5726	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5727	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5728	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5729	NULL,			NULL
5730};
5731
5732/*
5733 * Extended attribute directory vnode operations template
5734 *
5735 * This template is identical to the directory vnodes
5736 * operation template except for restricted operations:
5737 *	VOP_MKDIR()
5738 *	VOP_SYMLINK()
5739 *
5740 * Note that there are other restrictions embedded in:
5741 *	zfs_create()	- restrict type to VREG
5742 *	zfs_link()	- no links into/out of attribute space
5743 *	zfs_rename()	- no moves into/out of attribute space
5744 */
5745vnodeops_t *zfs_xdvnodeops;
5746const fs_operation_def_t zfs_xdvnodeops_template[] = {
5747	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5748	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5749	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5750	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5751	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5752	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5753	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5754	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5755	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5756	VOPNAME_LINK,		{ .vop_link = zfs_link },
5757	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5758	VOPNAME_MKDIR,		{ .error = zfs_inval },
5759	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5760	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5761	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5762	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5763	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5764	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5765	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5766	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5767	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5768	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5769	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5770	NULL,			NULL
5771};
5772
5773/*
5774 * Error vnode operations template
5775 */
5776vnodeops_t *zfs_evnodeops;
5777const fs_operation_def_t zfs_evnodeops_template[] = {
5778	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5779	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5780	NULL,			NULL
5781};
5782#endif	/* illumos */
5783
5784static int
5785ioflags(int ioflags)
5786{
5787	int flags = 0;
5788
5789	if (ioflags & IO_APPEND)
5790		flags |= FAPPEND;
5791	if (ioflags & IO_NDELAY)
5792        	flags |= FNONBLOCK;
5793	if (ioflags & IO_SYNC)
5794		flags |= (FSYNC | FDSYNC | FRSYNC);
5795
5796	return (flags);
5797}
5798
5799static int
5800zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
5801{
5802	znode_t *zp = VTOZ(vp);
5803	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5804	objset_t *os = zp->z_zfsvfs->z_os;
5805	vm_page_t mfirst, mlast, mreq;
5806	vm_object_t object;
5807	caddr_t va;
5808	struct sf_buf *sf;
5809	off_t startoff, endoff;
5810	int i, error;
5811	vm_pindex_t reqstart, reqend;
5812	int pcount, lsize, reqsize, size;
5813
5814	ZFS_ENTER(zfsvfs);
5815	ZFS_VERIFY_ZP(zp);
5816
5817	pcount = OFF_TO_IDX(round_page(count));
5818	mreq = m[reqpage];
5819	object = mreq->object;
5820	error = 0;
5821
5822	KASSERT(vp->v_object == object, ("mismatching object"));
5823
5824	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
5825		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
5826		reqstart = OFF_TO_IDX(round_page(startoff));
5827		if (reqstart < m[0]->pindex)
5828			reqstart = 0;
5829		else
5830			reqstart = reqstart - m[0]->pindex;
5831		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
5832		    zp->z_blksz);
5833		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
5834		if (reqend > m[pcount - 1]->pindex)
5835			reqend = m[pcount - 1]->pindex;
5836		reqsize = reqend - m[reqstart]->pindex + 1;
5837		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
5838		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
5839	} else {
5840		reqstart = reqpage;
5841		reqsize = 1;
5842	}
5843	mfirst = m[reqstart];
5844	mlast = m[reqstart + reqsize - 1];
5845
5846	zfs_vmobject_wlock(object);
5847
5848	for (i = 0; i < reqstart; i++) {
5849		vm_page_lock(m[i]);
5850		vm_page_free(m[i]);
5851		vm_page_unlock(m[i]);
5852	}
5853	for (i = reqstart + reqsize; i < pcount; i++) {
5854		vm_page_lock(m[i]);
5855		vm_page_free(m[i]);
5856		vm_page_unlock(m[i]);
5857	}
5858
5859	if (mreq->valid && reqsize == 1) {
5860		if (mreq->valid != VM_PAGE_BITS_ALL)
5861			vm_page_zero_invalid(mreq, TRUE);
5862		zfs_vmobject_wunlock(object);
5863		ZFS_EXIT(zfsvfs);
5864		return (zfs_vm_pagerret_ok);
5865	}
5866
5867	PCPU_INC(cnt.v_vnodein);
5868	PCPU_ADD(cnt.v_vnodepgsin, reqsize);
5869
5870	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
5871		for (i = reqstart; i < reqstart + reqsize; i++) {
5872			if (i != reqpage) {
5873				vm_page_lock(m[i]);
5874				vm_page_free(m[i]);
5875				vm_page_unlock(m[i]);
5876			}
5877		}
5878		zfs_vmobject_wunlock(object);
5879		ZFS_EXIT(zfsvfs);
5880		return (zfs_vm_pagerret_bad);
5881	}
5882
5883	lsize = PAGE_SIZE;
5884	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
5885		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
5886
5887	zfs_vmobject_wunlock(object);
5888
5889	for (i = reqstart; i < reqstart + reqsize; i++) {
5890		size = PAGE_SIZE;
5891		if (i == (reqstart + reqsize - 1))
5892			size = lsize;
5893		va = zfs_map_page(m[i], &sf);
5894		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
5895		    size, va, DMU_READ_PREFETCH);
5896		if (size != PAGE_SIZE)
5897			bzero(va + size, PAGE_SIZE - size);
5898		zfs_unmap_page(sf);
5899		if (error != 0)
5900			break;
5901	}
5902
5903	zfs_vmobject_wlock(object);
5904
5905	for (i = reqstart; i < reqstart + reqsize; i++) {
5906		if (!error)
5907			m[i]->valid = VM_PAGE_BITS_ALL;
5908		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
5909		if (i != reqpage)
5910			vm_page_readahead_finish(m[i]);
5911	}
5912
5913	zfs_vmobject_wunlock(object);
5914
5915	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5916	ZFS_EXIT(zfsvfs);
5917	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
5918}
5919
5920static int
5921zfs_freebsd_getpages(ap)
5922	struct vop_getpages_args /* {
5923		struct vnode *a_vp;
5924		vm_page_t *a_m;
5925		int a_count;
5926		int a_reqpage;
5927		vm_ooffset_t a_offset;
5928	} */ *ap;
5929{
5930
5931	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
5932}
5933
5934static int
5935zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
5936    int *rtvals)
5937{
5938	znode_t		*zp = VTOZ(vp);
5939	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5940	rl_t		*rl;
5941	dmu_tx_t	*tx;
5942	struct sf_buf	*sf;
5943	vm_object_t	object;
5944	vm_page_t	m;
5945	caddr_t		va;
5946	size_t		tocopy;
5947	size_t		lo_len;
5948	vm_ooffset_t	lo_off;
5949	vm_ooffset_t	off;
5950	uint_t		blksz;
5951	int		ncount;
5952	int		pcount;
5953	int		err;
5954	int		i;
5955
5956	ZFS_ENTER(zfsvfs);
5957	ZFS_VERIFY_ZP(zp);
5958
5959	object = vp->v_object;
5960	pcount = btoc(len);
5961	ncount = pcount;
5962
5963	KASSERT(ma[0]->object == object, ("mismatching object"));
5964	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
5965
5966	for (i = 0; i < pcount; i++)
5967		rtvals[i] = zfs_vm_pagerret_error;
5968
5969	off = IDX_TO_OFF(ma[0]->pindex);
5970	blksz = zp->z_blksz;
5971	lo_off = rounddown(off, blksz);
5972	lo_len = roundup(len + (off - lo_off), blksz);
5973	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
5974
5975	zfs_vmobject_wlock(object);
5976	if (len + off > object->un_pager.vnp.vnp_size) {
5977		if (object->un_pager.vnp.vnp_size > off) {
5978			int pgoff;
5979
5980			len = object->un_pager.vnp.vnp_size - off;
5981			ncount = btoc(len);
5982			if ((pgoff = (int)len & PAGE_MASK) != 0) {
5983				/*
5984				 * If the object is locked and the following
5985				 * conditions hold, then the page's dirty
5986				 * field cannot be concurrently changed by a
5987				 * pmap operation.
5988				 */
5989				m = ma[ncount - 1];
5990				vm_page_assert_sbusied(m);
5991				KASSERT(!pmap_page_is_write_mapped(m),
5992				    ("zfs_putpages: page %p is not read-only", m));
5993				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
5994				    pgoff);
5995			}
5996		} else {
5997			len = 0;
5998			ncount = 0;
5999		}
6000		if (ncount < pcount) {
6001			for (i = ncount; i < pcount; i++) {
6002				rtvals[i] = zfs_vm_pagerret_bad;
6003			}
6004		}
6005	}
6006	zfs_vmobject_wunlock(object);
6007
6008	if (ncount == 0)
6009		goto out;
6010
6011	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
6012	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
6013		goto out;
6014	}
6015
6016top:
6017	tx = dmu_tx_create(zfsvfs->z_os);
6018	dmu_tx_hold_write(tx, zp->z_id, off, len);
6019
6020	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
6021	zfs_sa_upgrade_txholds(tx, zp);
6022	err = dmu_tx_assign(tx, TXG_NOWAIT);
6023	if (err != 0) {
6024		if (err == ERESTART) {
6025			dmu_tx_wait(tx);
6026			dmu_tx_abort(tx);
6027			goto top;
6028		}
6029		dmu_tx_abort(tx);
6030		goto out;
6031	}
6032
6033	if (zp->z_blksz < PAGE_SIZE) {
6034		i = 0;
6035		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
6036			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
6037			va = zfs_map_page(ma[i], &sf);
6038			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
6039			zfs_unmap_page(sf);
6040		}
6041	} else {
6042		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
6043	}
6044
6045	if (err == 0) {
6046		uint64_t mtime[2], ctime[2];
6047		sa_bulk_attr_t bulk[3];
6048		int count = 0;
6049
6050		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
6051		    &mtime, 16);
6052		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
6053		    &ctime, 16);
6054		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
6055		    &zp->z_pflags, 8);
6056		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
6057		    B_TRUE);
6058		(void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
6059		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
6060
6061		zfs_vmobject_wlock(object);
6062		for (i = 0; i < ncount; i++) {
6063			rtvals[i] = zfs_vm_pagerret_ok;
6064			vm_page_undirty(ma[i]);
6065		}
6066		zfs_vmobject_wunlock(object);
6067		PCPU_INC(cnt.v_vnodeout);
6068		PCPU_ADD(cnt.v_vnodepgsout, ncount);
6069	}
6070	dmu_tx_commit(tx);
6071
6072out:
6073	zfs_range_unlock(rl);
6074	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
6075	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
6076		zil_commit(zfsvfs->z_log, zp->z_id);
6077	ZFS_EXIT(zfsvfs);
6078	return (rtvals[0]);
6079}
6080
6081int
6082zfs_freebsd_putpages(ap)
6083	struct vop_putpages_args /* {
6084		struct vnode *a_vp;
6085		vm_page_t *a_m;
6086		int a_count;
6087		int a_sync;
6088		int *a_rtvals;
6089		vm_ooffset_t a_offset;
6090	} */ *ap;
6091{
6092
6093	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
6094	    ap->a_rtvals));
6095}
6096
6097static int
6098zfs_freebsd_bmap(ap)
6099	struct vop_bmap_args /* {
6100		struct vnode *a_vp;
6101		daddr_t  a_bn;
6102		struct bufobj **a_bop;
6103		daddr_t *a_bnp;
6104		int *a_runp;
6105		int *a_runb;
6106	} */ *ap;
6107{
6108
6109	if (ap->a_bop != NULL)
6110		*ap->a_bop = &ap->a_vp->v_bufobj;
6111	if (ap->a_bnp != NULL)
6112		*ap->a_bnp = ap->a_bn;
6113	if (ap->a_runp != NULL)
6114		*ap->a_runp = 0;
6115	if (ap->a_runb != NULL)
6116		*ap->a_runb = 0;
6117
6118	return (0);
6119}
6120
6121static int
6122zfs_freebsd_open(ap)
6123	struct vop_open_args /* {
6124		struct vnode *a_vp;
6125		int a_mode;
6126		struct ucred *a_cred;
6127		struct thread *a_td;
6128	} */ *ap;
6129{
6130	vnode_t	*vp = ap->a_vp;
6131	znode_t *zp = VTOZ(vp);
6132	int error;
6133
6134	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
6135	if (error == 0)
6136		vnode_create_vobject(vp, zp->z_size, ap->a_td);
6137	return (error);
6138}
6139
6140static int
6141zfs_freebsd_close(ap)
6142	struct vop_close_args /* {
6143		struct vnode *a_vp;
6144		int  a_fflag;
6145		struct ucred *a_cred;
6146		struct thread *a_td;
6147	} */ *ap;
6148{
6149
6150	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
6151}
6152
6153static int
6154zfs_freebsd_ioctl(ap)
6155	struct vop_ioctl_args /* {
6156		struct vnode *a_vp;
6157		u_long a_command;
6158		caddr_t a_data;
6159		int a_fflag;
6160		struct ucred *cred;
6161		struct thread *td;
6162	} */ *ap;
6163{
6164
6165	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
6166	    ap->a_fflag, ap->a_cred, NULL, NULL));
6167}
6168
6169static int
6170zfs_freebsd_read(ap)
6171	struct vop_read_args /* {
6172		struct vnode *a_vp;
6173		struct uio *a_uio;
6174		int a_ioflag;
6175		struct ucred *a_cred;
6176	} */ *ap;
6177{
6178
6179	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
6180	    ap->a_cred, NULL));
6181}
6182
6183static int
6184zfs_freebsd_write(ap)
6185	struct vop_write_args /* {
6186		struct vnode *a_vp;
6187		struct uio *a_uio;
6188		int a_ioflag;
6189		struct ucred *a_cred;
6190	} */ *ap;
6191{
6192
6193	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
6194	    ap->a_cred, NULL));
6195}
6196
6197static int
6198zfs_freebsd_access(ap)
6199	struct vop_access_args /* {
6200		struct vnode *a_vp;
6201		accmode_t a_accmode;
6202		struct ucred *a_cred;
6203		struct thread *a_td;
6204	} */ *ap;
6205{
6206	vnode_t *vp = ap->a_vp;
6207	znode_t *zp = VTOZ(vp);
6208	accmode_t accmode;
6209	int error = 0;
6210
6211	/*
6212	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
6213	 */
6214	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
6215	if (accmode != 0)
6216		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
6217
6218	/*
6219	 * VADMIN has to be handled by vaccess().
6220	 */
6221	if (error == 0) {
6222		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
6223		if (accmode != 0) {
6224			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
6225			    zp->z_gid, accmode, ap->a_cred, NULL);
6226		}
6227	}
6228
6229	/*
6230	 * For VEXEC, ensure that at least one execute bit is set for
6231	 * non-directories.
6232	 */
6233	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
6234	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
6235		error = EACCES;
6236	}
6237
6238	return (error);
6239}
6240
6241static int
6242zfs_freebsd_lookup(ap)
6243	struct vop_lookup_args /* {
6244		struct vnode *a_dvp;
6245		struct vnode **a_vpp;
6246		struct componentname *a_cnp;
6247	} */ *ap;
6248{
6249	struct componentname *cnp = ap->a_cnp;
6250	char nm[NAME_MAX + 1];
6251
6252	ASSERT(cnp->cn_namelen < sizeof(nm));
6253	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
6254
6255	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
6256	    cnp->cn_cred, cnp->cn_thread, 0));
6257}
6258
6259static int
6260zfs_freebsd_create(ap)
6261	struct vop_create_args /* {
6262		struct vnode *a_dvp;
6263		struct vnode **a_vpp;
6264		struct componentname *a_cnp;
6265		struct vattr *a_vap;
6266	} */ *ap;
6267{
6268	struct componentname *cnp = ap->a_cnp;
6269	vattr_t *vap = ap->a_vap;
6270	int error, mode;
6271
6272	ASSERT(cnp->cn_flags & SAVENAME);
6273
6274	vattr_init_mask(vap);
6275	mode = vap->va_mode & ALLPERMS;
6276
6277	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
6278	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
6279#ifdef FREEBSD_NAMECACHE
6280	if (error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
6281		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
6282#endif
6283	return (error);
6284}
6285
6286static int
6287zfs_freebsd_remove(ap)
6288	struct vop_remove_args /* {
6289		struct vnode *a_dvp;
6290		struct vnode *a_vp;
6291		struct componentname *a_cnp;
6292	} */ *ap;
6293{
6294
6295	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
6296
6297	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
6298	    ap->a_cnp->cn_cred, NULL, 0));
6299}
6300
6301static int
6302zfs_freebsd_mkdir(ap)
6303	struct vop_mkdir_args /* {
6304		struct vnode *a_dvp;
6305		struct vnode **a_vpp;
6306		struct componentname *a_cnp;
6307		struct vattr *a_vap;
6308	} */ *ap;
6309{
6310	vattr_t *vap = ap->a_vap;
6311
6312	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
6313
6314	vattr_init_mask(vap);
6315
6316	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
6317	    ap->a_cnp->cn_cred, NULL, 0, NULL));
6318}
6319
6320static int
6321zfs_freebsd_rmdir(ap)
6322	struct vop_rmdir_args /* {
6323		struct vnode *a_dvp;
6324		struct vnode *a_vp;
6325		struct componentname *a_cnp;
6326	} */ *ap;
6327{
6328	struct componentname *cnp = ap->a_cnp;
6329
6330	ASSERT(cnp->cn_flags & SAVENAME);
6331
6332	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
6333}
6334
6335static int
6336zfs_freebsd_readdir(ap)
6337	struct vop_readdir_args /* {
6338		struct vnode *a_vp;
6339		struct uio *a_uio;
6340		struct ucred *a_cred;
6341		int *a_eofflag;
6342		int *a_ncookies;
6343		u_long **a_cookies;
6344	} */ *ap;
6345{
6346
6347	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
6348	    ap->a_ncookies, ap->a_cookies));
6349}
6350
6351static int
6352zfs_freebsd_fsync(ap)
6353	struct vop_fsync_args /* {
6354		struct vnode *a_vp;
6355		int a_waitfor;
6356		struct thread *a_td;
6357	} */ *ap;
6358{
6359
6360	vop_stdfsync(ap);
6361	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
6362}
6363
6364static int
6365zfs_freebsd_getattr(ap)
6366	struct vop_getattr_args /* {
6367		struct vnode *a_vp;
6368		struct vattr *a_vap;
6369		struct ucred *a_cred;
6370	} */ *ap;
6371{
6372	vattr_t *vap = ap->a_vap;
6373	xvattr_t xvap;
6374	u_long fflags = 0;
6375	int error;
6376
6377	xva_init(&xvap);
6378	xvap.xva_vattr = *vap;
6379	xvap.xva_vattr.va_mask |= AT_XVATTR;
6380
6381	/* Convert chflags into ZFS-type flags. */
6382	/* XXX: what about SF_SETTABLE?. */
6383	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
6384	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
6385	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
6386	XVA_SET_REQ(&xvap, XAT_NODUMP);
6387	XVA_SET_REQ(&xvap, XAT_READONLY);
6388	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
6389	XVA_SET_REQ(&xvap, XAT_SYSTEM);
6390	XVA_SET_REQ(&xvap, XAT_HIDDEN);
6391	XVA_SET_REQ(&xvap, XAT_REPARSE);
6392	XVA_SET_REQ(&xvap, XAT_OFFLINE);
6393	XVA_SET_REQ(&xvap, XAT_SPARSE);
6394
6395	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
6396	if (error != 0)
6397		return (error);
6398
6399	/* Convert ZFS xattr into chflags. */
6400#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
6401	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
6402		fflags |= (fflag);					\
6403} while (0)
6404	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
6405	    xvap.xva_xoptattrs.xoa_immutable);
6406	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
6407	    xvap.xva_xoptattrs.xoa_appendonly);
6408	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
6409	    xvap.xva_xoptattrs.xoa_nounlink);
6410	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
6411	    xvap.xva_xoptattrs.xoa_archive);
6412	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
6413	    xvap.xva_xoptattrs.xoa_nodump);
6414	FLAG_CHECK(UF_READONLY, XAT_READONLY,
6415	    xvap.xva_xoptattrs.xoa_readonly);
6416	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
6417	    xvap.xva_xoptattrs.xoa_system);
6418	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
6419	    xvap.xva_xoptattrs.xoa_hidden);
6420	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
6421	    xvap.xva_xoptattrs.xoa_reparse);
6422	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
6423	    xvap.xva_xoptattrs.xoa_offline);
6424	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
6425	    xvap.xva_xoptattrs.xoa_sparse);
6426
6427#undef	FLAG_CHECK
6428	*vap = xvap.xva_vattr;
6429	vap->va_flags = fflags;
6430	return (0);
6431}
6432
6433static int
6434zfs_freebsd_setattr(ap)
6435	struct vop_setattr_args /* {
6436		struct vnode *a_vp;
6437		struct vattr *a_vap;
6438		struct ucred *a_cred;
6439	} */ *ap;
6440{
6441	vnode_t *vp = ap->a_vp;
6442	vattr_t *vap = ap->a_vap;
6443	cred_t *cred = ap->a_cred;
6444	xvattr_t xvap;
6445	u_long fflags;
6446	uint64_t zflags;
6447
6448	vattr_init_mask(vap);
6449	vap->va_mask &= ~AT_NOSET;
6450
6451	xva_init(&xvap);
6452	xvap.xva_vattr = *vap;
6453
6454	zflags = VTOZ(vp)->z_pflags;
6455
6456	if (vap->va_flags != VNOVAL) {
6457		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
6458		int error;
6459
6460		if (zfsvfs->z_use_fuids == B_FALSE)
6461			return (EOPNOTSUPP);
6462
6463		fflags = vap->va_flags;
6464		/*
6465		 * XXX KDM
6466		 * We need to figure out whether it makes sense to allow
6467		 * UF_REPARSE through, since we don't really have other
6468		 * facilities to handle reparse points and zfs_setattr()
6469		 * doesn't currently allow setting that attribute anyway.
6470		 */
6471		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
6472		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
6473		     UF_OFFLINE|UF_SPARSE)) != 0)
6474			return (EOPNOTSUPP);
6475		/*
6476		 * Unprivileged processes are not permitted to unset system
6477		 * flags, or modify flags if any system flags are set.
6478		 * Privileged non-jail processes may not modify system flags
6479		 * if securelevel > 0 and any existing system flags are set.
6480		 * Privileged jail processes behave like privileged non-jail
6481		 * processes if the security.jail.chflags_allowed sysctl is
6482		 * is non-zero; otherwise, they behave like unprivileged
6483		 * processes.
6484		 */
6485		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
6486		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
6487			if (zflags &
6488			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6489				error = securelevel_gt(cred, 0);
6490				if (error != 0)
6491					return (error);
6492			}
6493		} else {
6494			/*
6495			 * Callers may only modify the file flags on objects they
6496			 * have VADMIN rights for.
6497			 */
6498			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
6499				return (error);
6500			if (zflags &
6501			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6502				return (EPERM);
6503			}
6504			if (fflags &
6505			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
6506				return (EPERM);
6507			}
6508		}
6509
6510#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
6511	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
6512	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
6513		XVA_SET_REQ(&xvap, (xflag));				\
6514		(xfield) = ((fflags & (fflag)) != 0);			\
6515	}								\
6516} while (0)
6517		/* Convert chflags into ZFS-type flags. */
6518		/* XXX: what about SF_SETTABLE?. */
6519		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
6520		    xvap.xva_xoptattrs.xoa_immutable);
6521		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
6522		    xvap.xva_xoptattrs.xoa_appendonly);
6523		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
6524		    xvap.xva_xoptattrs.xoa_nounlink);
6525		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
6526		    xvap.xva_xoptattrs.xoa_archive);
6527		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
6528		    xvap.xva_xoptattrs.xoa_nodump);
6529		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
6530		    xvap.xva_xoptattrs.xoa_readonly);
6531		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
6532		    xvap.xva_xoptattrs.xoa_system);
6533		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
6534		    xvap.xva_xoptattrs.xoa_hidden);
6535		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
6536		    xvap.xva_xoptattrs.xoa_hidden);
6537		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
6538		    xvap.xva_xoptattrs.xoa_offline);
6539		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
6540		    xvap.xva_xoptattrs.xoa_sparse);
6541#undef	FLAG_CHANGE
6542	}
6543	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
6544}
6545
6546static int
6547zfs_freebsd_rename(ap)
6548	struct vop_rename_args  /* {
6549		struct vnode *a_fdvp;
6550		struct vnode *a_fvp;
6551		struct componentname *a_fcnp;
6552		struct vnode *a_tdvp;
6553		struct vnode *a_tvp;
6554		struct componentname *a_tcnp;
6555	} */ *ap;
6556{
6557	vnode_t *fdvp = ap->a_fdvp;
6558	vnode_t *fvp = ap->a_fvp;
6559	vnode_t *tdvp = ap->a_tdvp;
6560	vnode_t *tvp = ap->a_tvp;
6561	int error;
6562
6563	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
6564	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
6565
6566	/*
6567	 * Check for cross-device rename.
6568	 */
6569	if ((fdvp->v_mount != tdvp->v_mount) ||
6570	    (tvp && (fdvp->v_mount != tvp->v_mount)))
6571		error = EXDEV;
6572	else
6573		error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
6574		    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
6575	if (tdvp == tvp)
6576		VN_RELE(tdvp);
6577	else
6578		VN_URELE(tdvp);
6579	if (tvp)
6580		VN_URELE(tvp);
6581	VN_RELE(fdvp);
6582	VN_RELE(fvp);
6583
6584	return (error);
6585}
6586
6587static int
6588zfs_freebsd_symlink(ap)
6589	struct vop_symlink_args /* {
6590		struct vnode *a_dvp;
6591		struct vnode **a_vpp;
6592		struct componentname *a_cnp;
6593		struct vattr *a_vap;
6594		char *a_target;
6595	} */ *ap;
6596{
6597	struct componentname *cnp = ap->a_cnp;
6598	vattr_t *vap = ap->a_vap;
6599
6600	ASSERT(cnp->cn_flags & SAVENAME);
6601
6602	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
6603	vattr_init_mask(vap);
6604
6605	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
6606	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
6607}
6608
6609static int
6610zfs_freebsd_readlink(ap)
6611	struct vop_readlink_args /* {
6612		struct vnode *a_vp;
6613		struct uio *a_uio;
6614		struct ucred *a_cred;
6615	} */ *ap;
6616{
6617
6618	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
6619}
6620
6621static int
6622zfs_freebsd_link(ap)
6623	struct vop_link_args /* {
6624		struct vnode *a_tdvp;
6625		struct vnode *a_vp;
6626		struct componentname *a_cnp;
6627	} */ *ap;
6628{
6629	struct componentname *cnp = ap->a_cnp;
6630	vnode_t *vp = ap->a_vp;
6631	vnode_t *tdvp = ap->a_tdvp;
6632
6633	if (tdvp->v_mount != vp->v_mount)
6634		return (EXDEV);
6635
6636	ASSERT(cnp->cn_flags & SAVENAME);
6637
6638	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
6639}
6640
6641static int
6642zfs_freebsd_inactive(ap)
6643	struct vop_inactive_args /* {
6644		struct vnode *a_vp;
6645		struct thread *a_td;
6646	} */ *ap;
6647{
6648	vnode_t *vp = ap->a_vp;
6649
6650	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
6651	return (0);
6652}
6653
6654static int
6655zfs_freebsd_reclaim(ap)
6656	struct vop_reclaim_args /* {
6657		struct vnode *a_vp;
6658		struct thread *a_td;
6659	} */ *ap;
6660{
6661	vnode_t	*vp = ap->a_vp;
6662	znode_t	*zp = VTOZ(vp);
6663	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6664
6665	ASSERT(zp != NULL);
6666
6667	/* Destroy the vm object and flush associated pages. */
6668	vnode_destroy_vobject(vp);
6669
6670	/*
6671	 * z_teardown_inactive_lock protects from a race with
6672	 * zfs_znode_dmu_fini in zfsvfs_teardown during
6673	 * force unmount.
6674	 */
6675	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6676	if (zp->z_sa_hdl == NULL)
6677		zfs_znode_free(zp);
6678	else
6679		zfs_zinactive(zp);
6680	rw_exit(&zfsvfs->z_teardown_inactive_lock);
6681
6682	vp->v_data = NULL;
6683	return (0);
6684}
6685
6686static int
6687zfs_freebsd_fid(ap)
6688	struct vop_fid_args /* {
6689		struct vnode *a_vp;
6690		struct fid *a_fid;
6691	} */ *ap;
6692{
6693
6694	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
6695}
6696
6697static int
6698zfs_freebsd_pathconf(ap)
6699	struct vop_pathconf_args /* {
6700		struct vnode *a_vp;
6701		int a_name;
6702		register_t *a_retval;
6703	} */ *ap;
6704{
6705	ulong_t val;
6706	int error;
6707
6708	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
6709	if (error == 0)
6710		*ap->a_retval = val;
6711	else if (error == EOPNOTSUPP)
6712		error = vop_stdpathconf(ap);
6713	return (error);
6714}
6715
6716static int
6717zfs_freebsd_fifo_pathconf(ap)
6718	struct vop_pathconf_args /* {
6719		struct vnode *a_vp;
6720		int a_name;
6721		register_t *a_retval;
6722	} */ *ap;
6723{
6724
6725	switch (ap->a_name) {
6726	case _PC_ACL_EXTENDED:
6727	case _PC_ACL_NFS4:
6728	case _PC_ACL_PATH_MAX:
6729	case _PC_MAC_PRESENT:
6730		return (zfs_freebsd_pathconf(ap));
6731	default:
6732		return (fifo_specops.vop_pathconf(ap));
6733	}
6734}
6735
6736/*
6737 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
6738 * extended attribute name:
6739 *
6740 *	NAMESPACE	PREFIX
6741 *	system		freebsd:system:
6742 *	user		(none, can be used to access ZFS fsattr(5) attributes
6743 *			created on Solaris)
6744 */
6745static int
6746zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
6747    size_t size)
6748{
6749	const char *namespace, *prefix, *suffix;
6750
6751	/* We don't allow '/' character in attribute name. */
6752	if (strchr(name, '/') != NULL)
6753		return (EINVAL);
6754	/* We don't allow attribute names that start with "freebsd:" string. */
6755	if (strncmp(name, "freebsd:", 8) == 0)
6756		return (EINVAL);
6757
6758	bzero(attrname, size);
6759
6760	switch (attrnamespace) {
6761	case EXTATTR_NAMESPACE_USER:
6762#if 0
6763		prefix = "freebsd:";
6764		namespace = EXTATTR_NAMESPACE_USER_STRING;
6765		suffix = ":";
6766#else
6767		/*
6768		 * This is the default namespace by which we can access all
6769		 * attributes created on Solaris.
6770		 */
6771		prefix = namespace = suffix = "";
6772#endif
6773		break;
6774	case EXTATTR_NAMESPACE_SYSTEM:
6775		prefix = "freebsd:";
6776		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
6777		suffix = ":";
6778		break;
6779	case EXTATTR_NAMESPACE_EMPTY:
6780	default:
6781		return (EINVAL);
6782	}
6783	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
6784	    name) >= size) {
6785		return (ENAMETOOLONG);
6786	}
6787	return (0);
6788}
6789
6790/*
6791 * Vnode operating to retrieve a named extended attribute.
6792 */
6793static int
6794zfs_getextattr(struct vop_getextattr_args *ap)
6795/*
6796vop_getextattr {
6797	IN struct vnode *a_vp;
6798	IN int a_attrnamespace;
6799	IN const char *a_name;
6800	INOUT struct uio *a_uio;
6801	OUT size_t *a_size;
6802	IN struct ucred *a_cred;
6803	IN struct thread *a_td;
6804};
6805*/
6806{
6807	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6808	struct thread *td = ap->a_td;
6809	struct nameidata nd;
6810	char attrname[255];
6811	struct vattr va;
6812	vnode_t *xvp = NULL, *vp;
6813	int error, flags;
6814
6815	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6816	    ap->a_cred, ap->a_td, VREAD);
6817	if (error != 0)
6818		return (error);
6819
6820	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6821	    sizeof(attrname));
6822	if (error != 0)
6823		return (error);
6824
6825	ZFS_ENTER(zfsvfs);
6826
6827	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6828	    LOOKUP_XATTR);
6829	if (error != 0) {
6830		ZFS_EXIT(zfsvfs);
6831		return (error);
6832	}
6833
6834	flags = FREAD;
6835	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6836	    xvp, td);
6837	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
6838	vp = nd.ni_vp;
6839	NDFREE(&nd, NDF_ONLY_PNBUF);
6840	if (error != 0) {
6841		ZFS_EXIT(zfsvfs);
6842		if (error == ENOENT)
6843			error = ENOATTR;
6844		return (error);
6845	}
6846
6847	if (ap->a_size != NULL) {
6848		error = VOP_GETATTR(vp, &va, ap->a_cred);
6849		if (error == 0)
6850			*ap->a_size = (size_t)va.va_size;
6851	} else if (ap->a_uio != NULL)
6852		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6853
6854	VOP_UNLOCK(vp, 0);
6855	vn_close(vp, flags, ap->a_cred, td);
6856	ZFS_EXIT(zfsvfs);
6857
6858	return (error);
6859}
6860
6861/*
6862 * Vnode operation to remove a named attribute.
6863 */
6864int
6865zfs_deleteextattr(struct vop_deleteextattr_args *ap)
6866/*
6867vop_deleteextattr {
6868	IN struct vnode *a_vp;
6869	IN int a_attrnamespace;
6870	IN const char *a_name;
6871	IN struct ucred *a_cred;
6872	IN struct thread *a_td;
6873};
6874*/
6875{
6876	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6877	struct thread *td = ap->a_td;
6878	struct nameidata nd;
6879	char attrname[255];
6880	struct vattr va;
6881	vnode_t *xvp = NULL, *vp;
6882	int error, flags;
6883
6884	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6885	    ap->a_cred, ap->a_td, VWRITE);
6886	if (error != 0)
6887		return (error);
6888
6889	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6890	    sizeof(attrname));
6891	if (error != 0)
6892		return (error);
6893
6894	ZFS_ENTER(zfsvfs);
6895
6896	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6897	    LOOKUP_XATTR);
6898	if (error != 0) {
6899		ZFS_EXIT(zfsvfs);
6900		return (error);
6901	}
6902
6903	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
6904	    UIO_SYSSPACE, attrname, xvp, td);
6905	error = namei(&nd);
6906	vp = nd.ni_vp;
6907	if (error != 0) {
6908		ZFS_EXIT(zfsvfs);
6909		NDFREE(&nd, NDF_ONLY_PNBUF);
6910		if (error == ENOENT)
6911			error = ENOATTR;
6912		return (error);
6913	}
6914
6915	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
6916	NDFREE(&nd, NDF_ONLY_PNBUF);
6917
6918	vput(nd.ni_dvp);
6919	if (vp == nd.ni_dvp)
6920		vrele(vp);
6921	else
6922		vput(vp);
6923	ZFS_EXIT(zfsvfs);
6924
6925	return (error);
6926}
6927
6928/*
6929 * Vnode operation to set a named attribute.
6930 */
6931static int
6932zfs_setextattr(struct vop_setextattr_args *ap)
6933/*
6934vop_setextattr {
6935	IN struct vnode *a_vp;
6936	IN int a_attrnamespace;
6937	IN const char *a_name;
6938	INOUT struct uio *a_uio;
6939	IN struct ucred *a_cred;
6940	IN struct thread *a_td;
6941};
6942*/
6943{
6944	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6945	struct thread *td = ap->a_td;
6946	struct nameidata nd;
6947	char attrname[255];
6948	struct vattr va;
6949	vnode_t *xvp = NULL, *vp;
6950	int error, flags;
6951
6952	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6953	    ap->a_cred, ap->a_td, VWRITE);
6954	if (error != 0)
6955		return (error);
6956
6957	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6958	    sizeof(attrname));
6959	if (error != 0)
6960		return (error);
6961
6962	ZFS_ENTER(zfsvfs);
6963
6964	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6965	    LOOKUP_XATTR | CREATE_XATTR_DIR);
6966	if (error != 0) {
6967		ZFS_EXIT(zfsvfs);
6968		return (error);
6969	}
6970
6971	flags = FFLAGS(O_WRONLY | O_CREAT);
6972	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6973	    xvp, td);
6974	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
6975	vp = nd.ni_vp;
6976	NDFREE(&nd, NDF_ONLY_PNBUF);
6977	if (error != 0) {
6978		ZFS_EXIT(zfsvfs);
6979		return (error);
6980	}
6981
6982	VATTR_NULL(&va);
6983	va.va_size = 0;
6984	error = VOP_SETATTR(vp, &va, ap->a_cred);
6985	if (error == 0)
6986		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6987
6988	VOP_UNLOCK(vp, 0);
6989	vn_close(vp, flags, ap->a_cred, td);
6990	ZFS_EXIT(zfsvfs);
6991
6992	return (error);
6993}
6994
6995/*
6996 * Vnode operation to retrieve extended attributes on a vnode.
6997 */
6998static int
6999zfs_listextattr(struct vop_listextattr_args *ap)
7000/*
7001vop_listextattr {
7002	IN struct vnode *a_vp;
7003	IN int a_attrnamespace;
7004	INOUT struct uio *a_uio;
7005	OUT size_t *a_size;
7006	IN struct ucred *a_cred;
7007	IN struct thread *a_td;
7008};
7009*/
7010{
7011	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
7012	struct thread *td = ap->a_td;
7013	struct nameidata nd;
7014	char attrprefix[16];
7015	u_char dirbuf[sizeof(struct dirent)];
7016	struct dirent *dp;
7017	struct iovec aiov;
7018	struct uio auio, *uio = ap->a_uio;
7019	size_t *sizep = ap->a_size;
7020	size_t plen;
7021	vnode_t *xvp = NULL, *vp;
7022	int done, error, eof, pos;
7023
7024	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
7025	    ap->a_cred, ap->a_td, VREAD);
7026	if (error != 0)
7027		return (error);
7028
7029	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
7030	    sizeof(attrprefix));
7031	if (error != 0)
7032		return (error);
7033	plen = strlen(attrprefix);
7034
7035	ZFS_ENTER(zfsvfs);
7036
7037	if (sizep != NULL)
7038		*sizep = 0;
7039
7040	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
7041	    LOOKUP_XATTR);
7042	if (error != 0) {
7043		ZFS_EXIT(zfsvfs);
7044		/*
7045		 * ENOATTR means that the EA directory does not yet exist,
7046		 * i.e. there are no extended attributes there.
7047		 */
7048		if (error == ENOATTR)
7049			error = 0;
7050		return (error);
7051	}
7052
7053	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
7054	    UIO_SYSSPACE, ".", xvp, td);
7055	error = namei(&nd);
7056	vp = nd.ni_vp;
7057	NDFREE(&nd, NDF_ONLY_PNBUF);
7058	if (error != 0) {
7059		ZFS_EXIT(zfsvfs);
7060		return (error);
7061	}
7062
7063	auio.uio_iov = &aiov;
7064	auio.uio_iovcnt = 1;
7065	auio.uio_segflg = UIO_SYSSPACE;
7066	auio.uio_td = td;
7067	auio.uio_rw = UIO_READ;
7068	auio.uio_offset = 0;
7069
7070	do {
7071		u_char nlen;
7072
7073		aiov.iov_base = (void *)dirbuf;
7074		aiov.iov_len = sizeof(dirbuf);
7075		auio.uio_resid = sizeof(dirbuf);
7076		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
7077		done = sizeof(dirbuf) - auio.uio_resid;
7078		if (error != 0)
7079			break;
7080		for (pos = 0; pos < done;) {
7081			dp = (struct dirent *)(dirbuf + pos);
7082			pos += dp->d_reclen;
7083			/*
7084			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
7085			 * is what we get when attribute was created on Solaris.
7086			 */
7087			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
7088				continue;
7089			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
7090				continue;
7091			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
7092				continue;
7093			nlen = dp->d_namlen - plen;
7094			if (sizep != NULL)
7095				*sizep += 1 + nlen;
7096			else if (uio != NULL) {
7097				/*
7098				 * Format of extattr name entry is one byte for
7099				 * length and the rest for name.
7100				 */
7101				error = uiomove(&nlen, 1, uio->uio_rw, uio);
7102				if (error == 0) {
7103					error = uiomove(dp->d_name + plen, nlen,
7104					    uio->uio_rw, uio);
7105				}
7106				if (error != 0)
7107					break;
7108			}
7109		}
7110	} while (!eof && error == 0);
7111
7112	vput(vp);
7113	ZFS_EXIT(zfsvfs);
7114
7115	return (error);
7116}
7117
7118int
7119zfs_freebsd_getacl(ap)
7120	struct vop_getacl_args /* {
7121		struct vnode *vp;
7122		acl_type_t type;
7123		struct acl *aclp;
7124		struct ucred *cred;
7125		struct thread *td;
7126	} */ *ap;
7127{
7128	int		error;
7129	vsecattr_t      vsecattr;
7130
7131	if (ap->a_type != ACL_TYPE_NFS4)
7132		return (EINVAL);
7133
7134	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
7135	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
7136		return (error);
7137
7138	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
7139	if (vsecattr.vsa_aclentp != NULL)
7140		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
7141
7142	return (error);
7143}
7144
7145int
7146zfs_freebsd_setacl(ap)
7147	struct vop_setacl_args /* {
7148		struct vnode *vp;
7149		acl_type_t type;
7150		struct acl *aclp;
7151		struct ucred *cred;
7152		struct thread *td;
7153	} */ *ap;
7154{
7155	int		error;
7156	vsecattr_t      vsecattr;
7157	int		aclbsize;	/* size of acl list in bytes */
7158	aclent_t	*aaclp;
7159
7160	if (ap->a_type != ACL_TYPE_NFS4)
7161		return (EINVAL);
7162
7163	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
7164		return (EINVAL);
7165
7166	/*
7167	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
7168	 * splitting every entry into two and appending "canonical six"
7169	 * entries at the end.  Don't allow for setting an ACL that would
7170	 * cause chmod(2) to run out of ACL entries.
7171	 */
7172	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
7173		return (ENOSPC);
7174
7175	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
7176	if (error != 0)
7177		return (error);
7178
7179	vsecattr.vsa_mask = VSA_ACE;
7180	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
7181	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
7182	aaclp = vsecattr.vsa_aclentp;
7183	vsecattr.vsa_aclentsz = aclbsize;
7184
7185	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
7186	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
7187	kmem_free(aaclp, aclbsize);
7188
7189	return (error);
7190}
7191
/*
 * Vnode operation to check an ACL.
 *
 * vop_aclcheck {
 *	struct vnode *vp;
 *	acl_type_t type;
 *	struct acl *aclp;
 *	struct ucred *cred;
 *	struct thread *td;
 * };
 *
 * The arguments are not examined; the operation always reports
 * EOPNOTSUPP.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{

	return (EOPNOTSUPP);
}
7205
7206static int
7207zfs_vptocnp(struct vop_vptocnp_args *ap)
7208{
7209	vnode_t *covered_vp;
7210	vnode_t *vp = ap->a_vp;;
7211	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
7212	znode_t *zp = VTOZ(vp);
7213	uint64_t parent;
7214	int ltype;
7215	int error;
7216
7217	ZFS_ENTER(zfsvfs);
7218	ZFS_VERIFY_ZP(zp);
7219
7220	/*
7221	 * If we are a snapshot mounted under .zfs, run the operation
7222	 * on the covered vnode.
7223	 */
7224	if ((error = sa_lookup(zp->z_sa_hdl,
7225	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) {
7226		ZFS_EXIT(zfsvfs);
7227		return (error);
7228	}
7229
7230	if (zp->z_id != parent || zfsvfs->z_parent == zfsvfs) {
7231		ZFS_EXIT(zfsvfs);
7232		return (vop_stdvptocnp(ap));
7233	}
7234	ZFS_EXIT(zfsvfs);
7235
7236	covered_vp = vp->v_mount->mnt_vnodecovered;
7237	vhold(covered_vp);
7238	ltype = VOP_ISLOCKED(vp);
7239	VOP_UNLOCK(vp, 0);
7240	error = vget(covered_vp, LK_EXCLUSIVE, curthread);
7241	vdrop(covered_vp);
7242	if (error == 0) {
7243		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
7244		    ap->a_buf, ap->a_buflen);
7245		vput(covered_vp);
7246	}
7247	vn_lock(vp, ltype | LK_RETRY);
7248	if ((vp->v_iflag & VI_DOOMED) != 0)
7249		error = SET_ERROR(ENOENT);
7250	return (error);
7251}
7252
7253struct vop_vector zfs_vnodeops;
7254struct vop_vector zfs_fifoops;
7255struct vop_vector zfs_shareops;
7256
7257struct vop_vector zfs_vnodeops = {
7258	.vop_default =		&default_vnodeops,
7259	.vop_inactive =		zfs_freebsd_inactive,
7260	.vop_reclaim =		zfs_freebsd_reclaim,
7261	.vop_access =		zfs_freebsd_access,
7262#ifdef FREEBSD_NAMECACHE
7263	.vop_lookup =		vfs_cache_lookup,
7264	.vop_cachedlookup =	zfs_freebsd_lookup,
7265#else
7266	.vop_lookup =		zfs_freebsd_lookup,
7267#endif
7268	.vop_getattr =		zfs_freebsd_getattr,
7269	.vop_setattr =		zfs_freebsd_setattr,
7270	.vop_create =		zfs_freebsd_create,
7271	.vop_mknod =		zfs_freebsd_create,
7272	.vop_mkdir =		zfs_freebsd_mkdir,
7273	.vop_readdir =		zfs_freebsd_readdir,
7274	.vop_fsync =		zfs_freebsd_fsync,
7275	.vop_open =		zfs_freebsd_open,
7276	.vop_close =		zfs_freebsd_close,
7277	.vop_rmdir =		zfs_freebsd_rmdir,
7278	.vop_ioctl =		zfs_freebsd_ioctl,
7279	.vop_link =		zfs_freebsd_link,
7280	.vop_symlink =		zfs_freebsd_symlink,
7281	.vop_readlink =		zfs_freebsd_readlink,
7282	.vop_read =		zfs_freebsd_read,
7283	.vop_write =		zfs_freebsd_write,
7284	.vop_remove =		zfs_freebsd_remove,
7285	.vop_rename =		zfs_freebsd_rename,
7286	.vop_pathconf =		zfs_freebsd_pathconf,
7287	.vop_bmap =		zfs_freebsd_bmap,
7288	.vop_fid =		zfs_freebsd_fid,
7289	.vop_getextattr =	zfs_getextattr,
7290	.vop_deleteextattr =	zfs_deleteextattr,
7291	.vop_setextattr =	zfs_setextattr,
7292	.vop_listextattr =	zfs_listextattr,
7293	.vop_getacl =		zfs_freebsd_getacl,
7294	.vop_setacl =		zfs_freebsd_setacl,
7295	.vop_aclcheck =		zfs_freebsd_aclcheck,
7296	.vop_getpages =		zfs_freebsd_getpages,
7297	.vop_putpages =		zfs_freebsd_putpages,
7298	.vop_vptocnp =		zfs_vptocnp,
7299};
7300
7301struct vop_vector zfs_fifoops = {
7302	.vop_default =		&fifo_specops,
7303	.vop_fsync =		zfs_freebsd_fsync,
7304	.vop_access =		zfs_freebsd_access,
7305	.vop_getattr =		zfs_freebsd_getattr,
7306	.vop_inactive =		zfs_freebsd_inactive,
7307	.vop_read =		VOP_PANIC,
7308	.vop_reclaim =		zfs_freebsd_reclaim,
7309	.vop_setattr =		zfs_freebsd_setattr,
7310	.vop_write =		VOP_PANIC,
7311	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
7312	.vop_fid =		zfs_freebsd_fid,
7313	.vop_getacl =		zfs_freebsd_getacl,
7314	.vop_setacl =		zfs_freebsd_setacl,
7315	.vop_aclcheck =		zfs_freebsd_aclcheck,
7316};
7317
7318/*
7319 * special share hidden files vnode operations template
7320 */
7321struct vop_vector zfs_shareops = {
7322	.vop_default =		&default_vnodeops,
7323	.vop_access =		zfs_freebsd_access,
7324	.vop_inactive =		zfs_freebsd_inactive,
7325	.vop_reclaim =		zfs_freebsd_reclaim,
7326	.vop_fid =		zfs_freebsd_fid,
7327	.vop_pathconf =		zfs_freebsd_pathconf,
7328};
7329