vfs_syscalls.c revision 293474
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_syscalls.c 293474 2016-01-09 14:20:23Z dchagin $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_kdtrace.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/capsicum.h>
50#include <sys/disk.h>
51#include <sys/sysent.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/mutex.h>
55#include <sys/sysproto.h>
56#include <sys/namei.h>
57#include <sys/filedesc.h>
58#include <sys/kernel.h>
59#include <sys/fcntl.h>
60#include <sys/file.h>
61#include <sys/filio.h>
62#include <sys/limits.h>
63#include <sys/linker.h>
64#include <sys/rwlock.h>
65#include <sys/sdt.h>
66#include <sys/stat.h>
67#include <sys/sx.h>
68#include <sys/unistd.h>
69#include <sys/vnode.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/dirent.h>
73#include <sys/jail.h>
74#include <sys/syscallsubr.h>
75#include <sys/sysctl.h>
76#ifdef KTRACE
77#include <sys/ktrace.h>
78#endif
79
80#include <machine/stdarg.h>
81
82#include <security/audit/audit.h>
83#include <security/mac/mac_framework.h>
84
85#include <vm/vm.h>
86#include <vm/vm_object.h>
87#include <vm/vm_page.h>
88#include <vm/uma.h>
89
90#include <ufs/ufs/quota.h>
91
/* malloc type "fadvise", described as posix_fadvise(2) information. */
MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");

/*
 * Statically defined tracing: declare the "vfs" provider and two
 * two-argument probes, stat:mode and stat:reg, each taking a
 * ("char *", "int") argument pair.
 */
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");

/*
 * Prototypes for file-local helpers.
 * NOTE(review): presumably all of these are defined further down in this
 * file -- confirm.
 */
static int chroot_refuse_vdir_fds(struct filedesc *fdp);
static int kern_chflags(struct thread *td, const char *path,
    enum uio_seg pathseg, u_long flags);
static int kern_chflagsat(struct thread *td, int fd, const char *path,
    enum uio_seg pathseg, u_long flags, int atflag);
static int setfflags(struct thread *td, struct vnode *, u_long);
static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
static int getutimens(const struct timespec *, enum uio_seg,
    struct timespec *, int *);
static int setutimes(struct thread *td, struct vnode *,
    const struct timespec *, int, int);
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
    struct thread *td);

/*
 * The module initialization routine for POSIX asynchronous I/O will
 * set this to the version of AIO that it implements.  (Zero means
 * that it is not implemented.)  This value is used here by pathconf()
 * and in kern_descrip.c by fpathconf().
 */
int async_io_version;
119
120/*
121 * Sync each mounted filesystem.
122 */
123#ifndef _SYS_SYSPROTO_H_
124struct sync_args {
125	int     dummy;
126};
127#endif
128/* ARGSUSED */
129int
130sys_sync(td, uap)
131	struct thread *td;
132	struct sync_args *uap;
133{
134	struct mount *mp, *nmp;
135	int save;
136
137	mtx_lock(&mountlist_mtx);
138	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
139		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
140			nmp = TAILQ_NEXT(mp, mnt_list);
141			continue;
142		}
143		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
144		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
145			save = curthread_pflags_set(TDP_SYNCIO);
146			vfs_msync(mp, MNT_NOWAIT);
147			VFS_SYNC(mp, MNT_NOWAIT);
148			curthread_pflags_restore(save);
149			vn_finished_write(mp);
150		}
151		mtx_lock(&mountlist_mtx);
152		nmp = TAILQ_NEXT(mp, mnt_list);
153		vfs_unbusy(mp);
154	}
155	mtx_unlock(&mountlist_mtx);
156	return (0);
157}
158
159/*
160 * Change filesystem quotas.
161 */
162#ifndef _SYS_SYSPROTO_H_
163struct quotactl_args {
164	char *path;
165	int cmd;
166	int uid;
167	caddr_t arg;
168};
169#endif
170int
171sys_quotactl(td, uap)
172	struct thread *td;
173	register struct quotactl_args /* {
174		char *path;
175		int cmd;
176		int uid;
177		caddr_t arg;
178	} */ *uap;
179{
180	struct mount *mp;
181	struct nameidata nd;
182	int error;
183
184	AUDIT_ARG_CMD(uap->cmd);
185	AUDIT_ARG_UID(uap->uid);
186	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
187		return (EPERM);
188	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
189	    uap->path, td);
190	if ((error = namei(&nd)) != 0)
191		return (error);
192	NDFREE(&nd, NDF_ONLY_PNBUF);
193	mp = nd.ni_vp->v_mount;
194	vfs_ref(mp);
195	vput(nd.ni_vp);
196	error = vfs_busy(mp, 0);
197	vfs_rel(mp);
198	if (error != 0)
199		return (error);
200	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
201
202	/*
203	 * Since quota on operation typically needs to open quota
204	 * file, the Q_QUOTAON handler needs to unbusy the mount point
205	 * before calling into namei.  Otherwise, unmount might be
206	 * started between two vfs_busy() invocations (first is our,
207	 * second is from mount point cross-walk code in lookup()),
208	 * causing deadlock.
209	 *
210	 * Require that Q_QUOTAON handles the vfs_busy() reference on
211	 * its own, always returning with ubusied mount point.
212	 */
213	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
214		vfs_unbusy(mp);
215	return (error);
216}
217
218/*
219 * Used by statfs conversion routines to scale the block size up if
220 * necessary so that all of the block counts are <= 'max_size'.  Note
221 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
222 * value of 'n'.
223 */
224void
225statfs_scale_blocks(struct statfs *sf, long max_size)
226{
227	uint64_t count;
228	int shift;
229
230	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
231
232	/*
233	 * Attempt to scale the block counts to give a more accurate
234	 * overview to userland of the ratio of free space to used
235	 * space.  To do this, find the largest block count and compute
236	 * a divisor that lets it fit into a signed integer <= max_size.
237	 */
238	if (sf->f_bavail < 0)
239		count = -sf->f_bavail;
240	else
241		count = sf->f_bavail;
242	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
243	if (count <= max_size)
244		return;
245
246	count >>= flsl(max_size);
247	shift = 0;
248	while (count > 0) {
249		shift++;
250		count >>=1;
251	}
252
253	sf->f_bsize <<= shift;
254	sf->f_blocks >>= shift;
255	sf->f_bfree >>= shift;
256	sf->f_bavail >>= shift;
257}
258
259/*
260 * Get filesystem statistics.
261 */
262#ifndef _SYS_SYSPROTO_H_
263struct statfs_args {
264	char *path;
265	struct statfs *buf;
266};
267#endif
268int
269sys_statfs(td, uap)
270	struct thread *td;
271	register struct statfs_args /* {
272		char *path;
273		struct statfs *buf;
274	} */ *uap;
275{
276	struct statfs sf;
277	int error;
278
279	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
280	if (error == 0)
281		error = copyout(&sf, uap->buf, sizeof(sf));
282	return (error);
283}
284
285int
286kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
287    struct statfs *buf)
288{
289	struct mount *mp;
290	struct statfs *sp, sb;
291	struct nameidata nd;
292	int error;
293
294	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
295	    pathseg, path, td);
296	error = namei(&nd);
297	if (error != 0)
298		return (error);
299	mp = nd.ni_vp->v_mount;
300	vfs_ref(mp);
301	NDFREE(&nd, NDF_ONLY_PNBUF);
302	vput(nd.ni_vp);
303	error = vfs_busy(mp, 0);
304	vfs_rel(mp);
305	if (error != 0)
306		return (error);
307#ifdef MAC
308	error = mac_mount_check_stat(td->td_ucred, mp);
309	if (error != 0)
310		goto out;
311#endif
312	/*
313	 * Set these in case the underlying filesystem fails to do so.
314	 */
315	sp = &mp->mnt_stat;
316	sp->f_version = STATFS_VERSION;
317	sp->f_namemax = NAME_MAX;
318	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
319	error = VFS_STATFS(mp, sp);
320	if (error != 0)
321		goto out;
322	if (priv_check(td, PRIV_VFS_GENERATION)) {
323		bcopy(sp, &sb, sizeof(sb));
324		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
325		prison_enforce_statfs(td->td_ucred, mp, &sb);
326		sp = &sb;
327	}
328	*buf = *sp;
329out:
330	vfs_unbusy(mp);
331	return (error);
332}
333
334/*
335 * Get filesystem statistics.
336 */
337#ifndef _SYS_SYSPROTO_H_
338struct fstatfs_args {
339	int fd;
340	struct statfs *buf;
341};
342#endif
343int
344sys_fstatfs(td, uap)
345	struct thread *td;
346	register struct fstatfs_args /* {
347		int fd;
348		struct statfs *buf;
349	} */ *uap;
350{
351	struct statfs sf;
352	int error;
353
354	error = kern_fstatfs(td, uap->fd, &sf);
355	if (error == 0)
356		error = copyout(&sf, uap->buf, sizeof(sf));
357	return (error);
358}
359
360int
361kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
362{
363	struct file *fp;
364	struct mount *mp;
365	struct statfs *sp, sb;
366	struct vnode *vp;
367	cap_rights_t rights;
368	int error;
369
370	AUDIT_ARG_FD(fd);
371	error = getvnode(td->td_proc->p_fd, fd,
372	    cap_rights_init(&rights, CAP_FSTATFS), &fp);
373	if (error != 0)
374		return (error);
375	vp = fp->f_vnode;
376	vn_lock(vp, LK_SHARED | LK_RETRY);
377#ifdef AUDIT
378	AUDIT_ARG_VNODE1(vp);
379#endif
380	mp = vp->v_mount;
381	if (mp)
382		vfs_ref(mp);
383	VOP_UNLOCK(vp, 0);
384	fdrop(fp, td);
385	if (mp == NULL) {
386		error = EBADF;
387		goto out;
388	}
389	error = vfs_busy(mp, 0);
390	vfs_rel(mp);
391	if (error != 0)
392		return (error);
393#ifdef MAC
394	error = mac_mount_check_stat(td->td_ucred, mp);
395	if (error != 0)
396		goto out;
397#endif
398	/*
399	 * Set these in case the underlying filesystem fails to do so.
400	 */
401	sp = &mp->mnt_stat;
402	sp->f_version = STATFS_VERSION;
403	sp->f_namemax = NAME_MAX;
404	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
405	error = VFS_STATFS(mp, sp);
406	if (error != 0)
407		goto out;
408	if (priv_check(td, PRIV_VFS_GENERATION)) {
409		bcopy(sp, &sb, sizeof(sb));
410		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
411		prison_enforce_statfs(td->td_ucred, mp, &sb);
412		sp = &sb;
413	}
414	*buf = *sp;
415out:
416	if (mp)
417		vfs_unbusy(mp);
418	return (error);
419}
420
421/*
422 * Get statistics on all filesystems.
423 */
424#ifndef _SYS_SYSPROTO_H_
425struct getfsstat_args {
426	struct statfs *buf;
427	long bufsize;
428	int flags;
429};
430#endif
431int
432sys_getfsstat(td, uap)
433	struct thread *td;
434	register struct getfsstat_args /* {
435		struct statfs *buf;
436		long bufsize;
437		int flags;
438	} */ *uap;
439{
440
441	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
442	    uap->flags));
443}
444
/*
 * Collect statfs records for the mounted filesystems that 'td' is allowed
 * to see.
 *
 * bufsize == 0:
 *	Nothing is copied; only the number of eligible mounts is reported.
 * bufseg == UIO_USERSPACE:
 *	'*buf' is a user-space array and records are written with copyout().
 * bufseg == UIO_SYSSPACE:
 *	A kernel array is allocated from M_TEMP and returned in '*buf'.
 *
 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
 *	The caller is responsible for freeing memory which will be allocated
 *	in '*buf'.
 *
 * The record count is returned in td->td_retval[0]: capped at the number
 * of records that fit when a buffer was supplied, otherwise the number of
 * eligible mounts.  Returns 0, or the copyout() error.
 */
int
kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
    enum uio_seg bufseg, int flags)
{
	struct mount *mp, *nmp;
	struct statfs *sfsp, *sp, sb;
	size_t count, maxcount;
	int error;

	/* Number of whole statfs records that fit in the caller's buffer. */
	maxcount = bufsize / sizeof(struct statfs);
	if (bufsize == 0)
		sfsp = NULL;
	else if (bufseg == UIO_USERSPACE)
		sfsp = *buf;
	else /* if (bufseg == UIO_SYSSPACE) */ {
		/*
		 * Size the kernel buffer by the current number of mounts,
		 * but never larger than what the caller asked for.
		 */
		count = 0;
		mtx_lock(&mountlist_mtx);
		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
			count++;
		}
		mtx_unlock(&mountlist_mtx);
		if (maxcount > count)
			maxcount = count;
		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
		    M_WAITOK);
	}
	count = 0;
	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		/* Skip mounts the caller's prison may not see. */
		if (prison_canseemount(td->td_ucred, mp) != 0) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
#ifdef MAC
		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
#endif
		/* Skip mounts that cannot be busied without sleeping. */
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* Copy a record only while there is room left for one. */
		if (sfsp && count < maxcount) {
			sp = &mp->mnt_stat;
			/*
			 * Set these in case the underlying filesystem
			 * fails to do so.
			 */
			sp->f_version = STATFS_VERSION;
			sp->f_namemax = NAME_MAX;
			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
			/*
			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
			 * overrides MNT_WAIT.
			 *
			 * A VFS_STATFS() failure is not reported: the mount
			 * is left out of the result and not counted.
			 */
			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
			    (flags & MNT_WAIT)) &&
			    (error = VFS_STATFS(mp, sp))) {
				mtx_lock(&mountlist_mtx);
				nmp = TAILQ_NEXT(mp, mnt_list);
				vfs_unbusy(mp);
				continue;
			}
			/*
			 * Callers failing the PRIV_VFS_GENERATION check get
			 * a private copy with the fsid zeroed and
			 * prison_enforce_statfs() applied.
			 */
			if (priv_check(td, PRIV_VFS_GENERATION)) {
				bcopy(sp, &sb, sizeof(sb));
				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
				prison_enforce_statfs(td->td_ucred, mp, &sb);
				sp = &sb;
			}
			if (bufseg == UIO_SYSSPACE)
				bcopy(sp, sfsp, sizeof(*sp));
			else /* if (bufseg == UIO_USERSPACE) */ {
				error = copyout(sp, sfsp, sizeof(*sp));
				if (error != 0) {
					/* The list lock is not held here. */
					vfs_unbusy(mp);
					return (error);
				}
			}
			sfsp++;
		}
		count++;
		/*
		 * Re-take the list lock before reading the successor, and
		 * only then drop the busy reference on this mount.
		 */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
	/* Report how many records were stored, or how many exist. */
	if (sfsp && count > maxcount)
		td->td_retval[0] = maxcount;
	else
		td->td_retval[0] = count;
	return (0);
}
544
545#ifdef COMPAT_FREEBSD4
546/*
547 * Get old format filesystem statistics.
548 */
549static void cvtstatfs(struct statfs *, struct ostatfs *);
550
551#ifndef _SYS_SYSPROTO_H_
552struct freebsd4_statfs_args {
553	char *path;
554	struct ostatfs *buf;
555};
556#endif
557int
558freebsd4_statfs(td, uap)
559	struct thread *td;
560	struct freebsd4_statfs_args /* {
561		char *path;
562		struct ostatfs *buf;
563	} */ *uap;
564{
565	struct ostatfs osb;
566	struct statfs sf;
567	int error;
568
569	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
570	if (error != 0)
571		return (error);
572	cvtstatfs(&sf, &osb);
573	return (copyout(&osb, uap->buf, sizeof(osb)));
574}
575
576/*
577 * Get filesystem statistics.
578 */
579#ifndef _SYS_SYSPROTO_H_
580struct freebsd4_fstatfs_args {
581	int fd;
582	struct ostatfs *buf;
583};
584#endif
585int
586freebsd4_fstatfs(td, uap)
587	struct thread *td;
588	struct freebsd4_fstatfs_args /* {
589		int fd;
590		struct ostatfs *buf;
591	} */ *uap;
592{
593	struct ostatfs osb;
594	struct statfs sf;
595	int error;
596
597	error = kern_fstatfs(td, uap->fd, &sf);
598	if (error != 0)
599		return (error);
600	cvtstatfs(&sf, &osb);
601	return (copyout(&osb, uap->buf, sizeof(osb)));
602}
603
604/*
605 * Get statistics on all filesystems.
606 */
607#ifndef _SYS_SYSPROTO_H_
608struct freebsd4_getfsstat_args {
609	struct ostatfs *buf;
610	long bufsize;
611	int flags;
612};
613#endif
614int
615freebsd4_getfsstat(td, uap)
616	struct thread *td;
617	register struct freebsd4_getfsstat_args /* {
618		struct ostatfs *buf;
619		long bufsize;
620		int flags;
621	} */ *uap;
622{
623	struct statfs *buf, *sp;
624	struct ostatfs osb;
625	size_t count, size;
626	int error;
627
628	count = uap->bufsize / sizeof(struct ostatfs);
629	size = count * sizeof(struct statfs);
630	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
631	if (size > 0) {
632		count = td->td_retval[0];
633		sp = buf;
634		while (count > 0 && error == 0) {
635			cvtstatfs(sp, &osb);
636			error = copyout(&osb, uap->buf, sizeof(osb));
637			sp++;
638			uap->buf++;
639			count--;
640		}
641		free(buf, M_TEMP);
642	}
643	return (error);
644}
645
646/*
647 * Implement fstatfs() for (NFS) file handles.
648 */
649#ifndef _SYS_SYSPROTO_H_
650struct freebsd4_fhstatfs_args {
651	struct fhandle *u_fhp;
652	struct ostatfs *buf;
653};
654#endif
655int
656freebsd4_fhstatfs(td, uap)
657	struct thread *td;
658	struct freebsd4_fhstatfs_args /* {
659		struct fhandle *u_fhp;
660		struct ostatfs *buf;
661	} */ *uap;
662{
663	struct ostatfs osb;
664	struct statfs sf;
665	fhandle_t fh;
666	int error;
667
668	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
669	if (error != 0)
670		return (error);
671	error = kern_fhstatfs(td, fh, &sf);
672	if (error != 0)
673		return (error);
674	cvtstatfs(&sf, &osb);
675	return (copyout(&osb, uap->buf, sizeof(osb)));
676}
677
678/*
679 * Convert a new format statfs structure to an old format statfs structure.
680 */
681static void
682cvtstatfs(nsp, osp)
683	struct statfs *nsp;
684	struct ostatfs *osp;
685{
686
687	statfs_scale_blocks(nsp, LONG_MAX);
688	bzero(osp, sizeof(*osp));
689	osp->f_bsize = nsp->f_bsize;
690	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
691	osp->f_blocks = nsp->f_blocks;
692	osp->f_bfree = nsp->f_bfree;
693	osp->f_bavail = nsp->f_bavail;
694	osp->f_files = MIN(nsp->f_files, LONG_MAX);
695	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
696	osp->f_owner = nsp->f_owner;
697	osp->f_type = nsp->f_type;
698	osp->f_flags = nsp->f_flags;
699	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
700	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
701	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
702	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
703	strlcpy(osp->f_fstypename, nsp->f_fstypename,
704	    MIN(MFSNAMELEN, OMFSNAMELEN));
705	strlcpy(osp->f_mntonname, nsp->f_mntonname,
706	    MIN(MNAMELEN, OMNAMELEN));
707	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
708	    MIN(MNAMELEN, OMNAMELEN));
709	osp->f_fsid = nsp->f_fsid;
710}
711#endif /* COMPAT_FREEBSD4 */
712
713/*
714 * Change current working directory to a given file descriptor.
715 */
716#ifndef _SYS_SYSPROTO_H_
717struct fchdir_args {
718	int	fd;
719};
720#endif
721int
722sys_fchdir(td, uap)
723	struct thread *td;
724	struct fchdir_args /* {
725		int fd;
726	} */ *uap;
727{
728	register struct filedesc *fdp = td->td_proc->p_fd;
729	struct vnode *vp, *tdp, *vpold;
730	struct mount *mp;
731	struct file *fp;
732	cap_rights_t rights;
733	int error;
734
735	AUDIT_ARG_FD(uap->fd);
736	error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
737	    &fp);
738	if (error != 0)
739		return (error);
740	vp = fp->f_vnode;
741	VREF(vp);
742	fdrop(fp, td);
743	vn_lock(vp, LK_SHARED | LK_RETRY);
744	AUDIT_ARG_VNODE1(vp);
745	error = change_dir(vp, td);
746	while (!error && (mp = vp->v_mountedhere) != NULL) {
747		if (vfs_busy(mp, 0))
748			continue;
749		error = VFS_ROOT(mp, LK_SHARED, &tdp);
750		vfs_unbusy(mp);
751		if (error != 0)
752			break;
753		vput(vp);
754		vp = tdp;
755	}
756	if (error != 0) {
757		vput(vp);
758		return (error);
759	}
760	VOP_UNLOCK(vp, 0);
761	FILEDESC_XLOCK(fdp);
762	vpold = fdp->fd_cdir;
763	fdp->fd_cdir = vp;
764	FILEDESC_XUNLOCK(fdp);
765	vrele(vpold);
766	return (0);
767}
768
769/*
770 * Change current working directory (``.'').
771 */
772#ifndef _SYS_SYSPROTO_H_
773struct chdir_args {
774	char	*path;
775};
776#endif
777int
778sys_chdir(td, uap)
779	struct thread *td;
780	struct chdir_args /* {
781		char *path;
782	} */ *uap;
783{
784
785	return (kern_chdir(td, uap->path, UIO_USERSPACE));
786}
787
788int
789kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
790{
791	register struct filedesc *fdp = td->td_proc->p_fd;
792	struct nameidata nd;
793	struct vnode *vp;
794	int error;
795
796	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
797	    pathseg, path, td);
798	if ((error = namei(&nd)) != 0)
799		return (error);
800	if ((error = change_dir(nd.ni_vp, td)) != 0) {
801		vput(nd.ni_vp);
802		NDFREE(&nd, NDF_ONLY_PNBUF);
803		return (error);
804	}
805	VOP_UNLOCK(nd.ni_vp, 0);
806	NDFREE(&nd, NDF_ONLY_PNBUF);
807	FILEDESC_XLOCK(fdp);
808	vp = fdp->fd_cdir;
809	fdp->fd_cdir = nd.ni_vp;
810	FILEDESC_XUNLOCK(fdp);
811	vrele(vp);
812	return (0);
813}
814
815/*
816 * Helper function for raised chroot(2) security function:  Refuse if
817 * any filedescriptors are open directories.
818 */
819static int
820chroot_refuse_vdir_fds(fdp)
821	struct filedesc *fdp;
822{
823	struct vnode *vp;
824	struct file *fp;
825	int fd;
826
827	FILEDESC_LOCK_ASSERT(fdp);
828
829	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
830		fp = fget_locked(fdp, fd);
831		if (fp == NULL)
832			continue;
833		if (fp->f_type == DTYPE_VNODE) {
834			vp = fp->f_vnode;
835			if (vp->v_type == VDIR)
836				return (EPERM);
837		}
838	}
839	return (0);
840}
841
/*
 * This sysctl determines if we will allow a process to chroot(2) if it
 * has a directory open:
 *	0: disallowed for all processes.
 *	1: allowed for processes that were not already chroot(2)'ed.
 *	2: allowed for all processes.
 *
 * Exported read-write as kern.chroot_allow_open_directories; the default
 * is 1.
 */

static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
855
856/*
857 * Change notion of root (``/'') directory.
858 */
859#ifndef _SYS_SYSPROTO_H_
860struct chroot_args {
861	char	*path;
862};
863#endif
864int
865sys_chroot(td, uap)
866	struct thread *td;
867	struct chroot_args /* {
868		char *path;
869	} */ *uap;
870{
871	struct nameidata nd;
872	int error;
873
874	error = priv_check(td, PRIV_VFS_CHROOT);
875	if (error != 0)
876		return (error);
877	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
878	    UIO_USERSPACE, uap->path, td);
879	error = namei(&nd);
880	if (error != 0)
881		goto error;
882	error = change_dir(nd.ni_vp, td);
883	if (error != 0)
884		goto e_vunlock;
885#ifdef MAC
886	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
887	if (error != 0)
888		goto e_vunlock;
889#endif
890	VOP_UNLOCK(nd.ni_vp, 0);
891	error = change_root(nd.ni_vp, td);
892	vrele(nd.ni_vp);
893	NDFREE(&nd, NDF_ONLY_PNBUF);
894	return (error);
895e_vunlock:
896	vput(nd.ni_vp);
897error:
898	NDFREE(&nd, NDF_ONLY_PNBUF);
899	return (error);
900}
901
902/*
903 * Common routine for chroot and chdir.  Callers must provide a locked vnode
904 * instance.
905 */
906int
907change_dir(vp, td)
908	struct vnode *vp;
909	struct thread *td;
910{
911#ifdef MAC
912	int error;
913#endif
914
915	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
916	if (vp->v_type != VDIR)
917		return (ENOTDIR);
918#ifdef MAC
919	error = mac_vnode_check_chdir(td->td_ucred, vp);
920	if (error != 0)
921		return (error);
922#endif
923	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
924}
925
926/*
927 * Common routine for kern_chroot() and jail_attach().  The caller is
928 * responsible for invoking priv_check() and mac_vnode_check_chroot() to
929 * authorize this operation.
930 */
931int
932change_root(vp, td)
933	struct vnode *vp;
934	struct thread *td;
935{
936	struct filedesc *fdp;
937	struct vnode *oldvp;
938	int error;
939
940	fdp = td->td_proc->p_fd;
941	FILEDESC_XLOCK(fdp);
942	if (chroot_allow_open_directories == 0 ||
943	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
944		error = chroot_refuse_vdir_fds(fdp);
945		if (error != 0) {
946			FILEDESC_XUNLOCK(fdp);
947			return (error);
948		}
949	}
950	oldvp = fdp->fd_rdir;
951	fdp->fd_rdir = vp;
952	VREF(fdp->fd_rdir);
953	if (!fdp->fd_jdir) {
954		fdp->fd_jdir = vp;
955		VREF(fdp->fd_jdir);
956	}
957	FILEDESC_XUNLOCK(fdp);
958	vrele(oldvp);
959	return (0);
960}
961
962static __inline void
963flags_to_rights(int flags, cap_rights_t *rightsp)
964{
965
966	if (flags & O_EXEC) {
967		cap_rights_set(rightsp, CAP_FEXECVE);
968	} else {
969		switch ((flags & O_ACCMODE)) {
970		case O_RDONLY:
971			cap_rights_set(rightsp, CAP_READ);
972			break;
973		case O_RDWR:
974			cap_rights_set(rightsp, CAP_READ);
975			/* FALLTHROUGH */
976		case O_WRONLY:
977			cap_rights_set(rightsp, CAP_WRITE);
978			if (!(flags & (O_APPEND | O_TRUNC)))
979				cap_rights_set(rightsp, CAP_SEEK);
980			break;
981		}
982	}
983
984	if (flags & O_CREAT)
985		cap_rights_set(rightsp, CAP_CREATE);
986
987	if (flags & O_TRUNC)
988		cap_rights_set(rightsp, CAP_FTRUNCATE);
989
990	if (flags & (O_SYNC | O_FSYNC))
991		cap_rights_set(rightsp, CAP_FSYNC);
992
993	if (flags & (O_EXLOCK | O_SHLOCK))
994		cap_rights_set(rightsp, CAP_FLOCK);
995}
996
997/*
998 * Check permissions, allocate an open file structure, and call the device
999 * open routine if any.
1000 */
1001#ifndef _SYS_SYSPROTO_H_
1002struct open_args {
1003	char	*path;
1004	int	flags;
1005	int	mode;
1006};
1007#endif
1008int
1009sys_open(td, uap)
1010	struct thread *td;
1011	register struct open_args /* {
1012		char *path;
1013		int flags;
1014		int mode;
1015	} */ *uap;
1016{
1017
1018	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1019}
1020
1021#ifndef _SYS_SYSPROTO_H_
1022struct openat_args {
1023	int	fd;
1024	char	*path;
1025	int	flag;
1026	int	mode;
1027};
1028#endif
1029int
1030sys_openat(struct thread *td, struct openat_args *uap)
1031{
1032
1033	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1034	    uap->mode));
1035}
1036
1037int
1038kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1039    int mode)
1040{
1041
1042	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1043}
1044
/*
 * Open 'path' (interpreted according to 'pathseg') relative to directory
 * descriptor 'fd', using open(2) 'flags' and creation 'mode'.
 *
 * On success the new descriptor index is stored in td->td_retval[0] and
 * 0 is returned.  Otherwise an errno value is returned and no descriptor
 * is installed.  EINVAL is returned for a conflicting access mode.
 */
int
kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    int flags, int mode)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	struct vnode *vp;
	struct nameidata nd;
	cap_rights_t rights;
	int cmode, error, indx;

	/* -1 means that no descriptor has been installed yet. */
	indx = -1;

	AUDIT_ARG_FFLAGS(flags);
	AUDIT_ARG_MODE(mode);
	/* XXX: audit dirfd */
	/* Rights needed on 'fd': CAP_LOOKUP plus those implied by 'flags'. */
	cap_rights_init(&rights, CAP_LOOKUP);
	flags_to_rights(flags, &rights);
	/*
	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
	 * may be specified.
	 *
	 * Note that 'flags' is passed through FFLAGS() only when O_EXEC is
	 * not set.
	 */
	if (flags & O_EXEC) {
		if (flags & O_ACCMODE)
			return (EINVAL);
	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
		return (EINVAL);
	} else {
		flags = FFLAGS(flags);
	}

	/*
	 * Allocate the file descriptor, but don't install a descriptor yet.
	 */
	error = falloc_noinstall(td, &fp);
	if (error != 0)
		return (error);
	/*
	 * An extra reference on `fp' has been held for us by
	 * falloc_noinstall().
	 */
	/* Set the flags early so the finit in devfs can pick them up. */
	fp->f_flag = flags & FMASK;
	/* Apply the process file creation mask and strip S_ISTXT. */
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
	    &rights, td);
	td->td_dupfd = -1;		/* XXX check for fdopen */
	error = vn_open(&nd, &flags, cmode, fp);
	if (error != 0) {
		/*
		 * If the vn_open replaced the method vector, something
		 * wondrous happened deep below and we just pass it up
		 * pretending we know what we do.
		 */
		if (error == ENXIO && fp->f_ops != &badfileops)
			goto success;

		/*
		 * Handle special fdopen() case. bleh.
		 *
		 * Don't do this for relative (capability) lookups; we don't
		 * understand exactly what would happen, and we don't think
		 * that it ever should.
		 *
		 * NOTE(review): td_dupfd >= 0 presumably means that the
		 * open routine recorded a descriptor to duplicate --
		 * confirm against the fdopen implementation.
		 */
		if (nd.ni_strictrelative == 0 &&
		    (error == ENODEV || error == ENXIO) &&
		    td->td_dupfd >= 0) {
			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
			    &indx);
			if (error == 0)
				goto success;
		}

		goto bad;
	}
	td->td_dupfd = 0;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp = nd.ni_vp;

	/*
	 * Store the vnode, for any f_type. Typically, the vnode use
	 * count is decremented by direct call to vn_closefile() for
	 * files that switched type in the cdevsw fdopen() method.
	 */
	fp->f_vnode = vp;
	/*
	 * If the file wasn't claimed by devfs bind it to the normal
	 * vnode operations here.
	 */
	if (fp->f_ops == &badfileops) {
		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
		fp->f_seqcount = 1;
		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
		    DTYPE_VNODE, vp, &vnops);
	}

	VOP_UNLOCK(vp, 0);
	/* Truncation is done through the file, after the vnode is unlocked. */
	if (flags & O_TRUNC) {
		error = fo_truncate(fp, 0, td->td_ucred, td);
		if (error != 0)
			goto bad;
	}
success:
	/*
	 * If we haven't already installed the FD (for dupfdopen), do so now.
	 */
	if (indx == -1) {
		struct filecaps *fcaps;

#ifdef CAPABILITIES
		if (nd.ni_strictrelative == 1)
			fcaps = &nd.ni_filecaps;
		else
#endif
			fcaps = NULL;
		error = finstall(td, fp, &indx, flags, fcaps);
		/* On success finstall() consumes fcaps. */
		if (error != 0) {
			filecaps_free(&nd.ni_filecaps);
			goto bad;
		}
	} else {
		filecaps_free(&nd.ni_filecaps);
	}

	/*
	 * Release our private reference, leaving the one associated with
	 * the descriptor table intact.
	 */
	fdrop(fp, td);
	td->td_retval[0] = indx;
	return (0);
bad:
	/* No descriptor may have been installed on any failure path. */
	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
	fdrop(fp, td);
	return (error);
}
1183
1184#ifdef COMPAT_43
1185/*
1186 * Create a file.
1187 */
1188#ifndef _SYS_SYSPROTO_H_
1189struct ocreat_args {
1190	char	*path;
1191	int	mode;
1192};
1193#endif
1194int
1195ocreat(td, uap)
1196	struct thread *td;
1197	register struct ocreat_args /* {
1198		char *path;
1199		int mode;
1200	} */ *uap;
1201{
1202
1203	return (kern_open(td, uap->path, UIO_USERSPACE,
1204	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1205}
1206#endif /* COMPAT_43 */
1207
1208/*
1209 * Create a special file.
1210 */
1211#ifndef _SYS_SYSPROTO_H_
1212struct mknod_args {
1213	char	*path;
1214	int	mode;
1215	int	dev;
1216};
1217#endif
1218int
1219sys_mknod(td, uap)
1220	struct thread *td;
1221	register struct mknod_args /* {
1222		char *path;
1223		int mode;
1224		int dev;
1225	} */ *uap;
1226{
1227
1228	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1229}
1230
1231#ifndef _SYS_SYSPROTO_H_
1232struct mknodat_args {
1233	int	fd;
1234	char	*path;
1235	mode_t	mode;
1236	dev_t	dev;
1237};
1238#endif
1239int
1240sys_mknodat(struct thread *td, struct mknodat_args *uap)
1241{
1242
1243	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1244	    uap->dev));
1245}
1246
1247int
1248kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1249    int dev)
1250{
1251
1252	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1253}
1254
1255int
1256kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1257    int mode, int dev)
1258{
1259	struct vnode *vp;
1260	struct mount *mp;
1261	struct vattr vattr;
1262	struct nameidata nd;
1263	cap_rights_t rights;
1264	int error, whiteout = 0;
1265
1266	AUDIT_ARG_MODE(mode);
1267	AUDIT_ARG_DEV(dev);
1268	switch (mode & S_IFMT) {
1269	case S_IFCHR:
1270	case S_IFBLK:
1271		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1272		break;
1273	case S_IFMT:
1274		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1275		break;
1276	case S_IFWHT:
1277		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1278		break;
1279	case S_IFIFO:
1280		if (dev == 0)
1281			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1282		/* FALLTHROUGH */
1283	default:
1284		error = EINVAL;
1285		break;
1286	}
1287	if (error != 0)
1288		return (error);
1289restart:
1290	bwillwrite();
1291	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1292	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
1293	    td);
1294	if ((error = namei(&nd)) != 0)
1295		return (error);
1296	vp = nd.ni_vp;
1297	if (vp != NULL) {
1298		NDFREE(&nd, NDF_ONLY_PNBUF);
1299		if (vp == nd.ni_dvp)
1300			vrele(nd.ni_dvp);
1301		else
1302			vput(nd.ni_dvp);
1303		vrele(vp);
1304		return (EEXIST);
1305	} else {
1306		VATTR_NULL(&vattr);
1307		vattr.va_mode = (mode & ALLPERMS) &
1308		    ~td->td_proc->p_fd->fd_cmask;
1309		vattr.va_rdev = dev;
1310		whiteout = 0;
1311
1312		switch (mode & S_IFMT) {
1313		case S_IFMT:	/* used by badsect to flag bad sectors */
1314			vattr.va_type = VBAD;
1315			break;
1316		case S_IFCHR:
1317			vattr.va_type = VCHR;
1318			break;
1319		case S_IFBLK:
1320			vattr.va_type = VBLK;
1321			break;
1322		case S_IFWHT:
1323			whiteout = 1;
1324			break;
1325		default:
1326			panic("kern_mknod: invalid mode");
1327		}
1328	}
1329	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1330		NDFREE(&nd, NDF_ONLY_PNBUF);
1331		vput(nd.ni_dvp);
1332		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1333			return (error);
1334		goto restart;
1335	}
1336#ifdef MAC
1337	if (error == 0 && !whiteout)
1338		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1339		    &nd.ni_cnd, &vattr);
1340#endif
1341	if (error == 0) {
1342		if (whiteout)
1343			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1344		else {
1345			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1346						&nd.ni_cnd, &vattr);
1347			if (error == 0)
1348				vput(nd.ni_vp);
1349		}
1350	}
1351	NDFREE(&nd, NDF_ONLY_PNBUF);
1352	vput(nd.ni_dvp);
1353	vn_finished_write(mp);
1354	return (error);
1355}
1356
1357/*
1358 * Create a named pipe.
1359 */
1360#ifndef _SYS_SYSPROTO_H_
1361struct mkfifo_args {
1362	char	*path;
1363	int	mode;
1364};
1365#endif
1366int
1367sys_mkfifo(td, uap)
1368	struct thread *td;
1369	register struct mkfifo_args /* {
1370		char *path;
1371		int mode;
1372	} */ *uap;
1373{
1374
1375	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1376}
1377
1378#ifndef _SYS_SYSPROTO_H_
1379struct mkfifoat_args {
1380	int	fd;
1381	char	*path;
1382	mode_t	mode;
1383};
1384#endif
1385int
1386sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1387{
1388
1389	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1390	    uap->mode));
1391}
1392
1393int
1394kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1395{
1396
1397	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1398}
1399
1400int
1401kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1402    int mode)
1403{
1404	struct mount *mp;
1405	struct vattr vattr;
1406	struct nameidata nd;
1407	cap_rights_t rights;
1408	int error;
1409
1410	AUDIT_ARG_MODE(mode);
1411restart:
1412	bwillwrite();
1413	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1414	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
1415	    td);
1416	if ((error = namei(&nd)) != 0)
1417		return (error);
1418	if (nd.ni_vp != NULL) {
1419		NDFREE(&nd, NDF_ONLY_PNBUF);
1420		if (nd.ni_vp == nd.ni_dvp)
1421			vrele(nd.ni_dvp);
1422		else
1423			vput(nd.ni_dvp);
1424		vrele(nd.ni_vp);
1425		return (EEXIST);
1426	}
1427	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1428		NDFREE(&nd, NDF_ONLY_PNBUF);
1429		vput(nd.ni_dvp);
1430		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1431			return (error);
1432		goto restart;
1433	}
1434	VATTR_NULL(&vattr);
1435	vattr.va_type = VFIFO;
1436	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1437#ifdef MAC
1438	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1439	    &vattr);
1440	if (error != 0)
1441		goto out;
1442#endif
1443	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1444	if (error == 0)
1445		vput(nd.ni_vp);
1446#ifdef MAC
1447out:
1448#endif
1449	vput(nd.ni_dvp);
1450	vn_finished_write(mp);
1451	NDFREE(&nd, NDF_ONLY_PNBUF);
1452	return (error);
1453}
1454
1455/*
1456 * Make a hard file link.
1457 */
1458#ifndef _SYS_SYSPROTO_H_
1459struct link_args {
1460	char	*path;
1461	char	*link;
1462};
1463#endif
1464int
1465sys_link(td, uap)
1466	struct thread *td;
1467	register struct link_args /* {
1468		char *path;
1469		char *link;
1470	} */ *uap;
1471{
1472
1473	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1474}
1475
1476#ifndef _SYS_SYSPROTO_H_
1477struct linkat_args {
1478	int	fd1;
1479	char	*path1;
1480	int	fd2;
1481	char	*path2;
1482	int	flag;
1483};
1484#endif
1485int
1486sys_linkat(struct thread *td, struct linkat_args *uap)
1487{
1488	int flag;
1489
1490	flag = uap->flag;
1491	if (flag & ~AT_SYMLINK_FOLLOW)
1492		return (EINVAL);
1493
1494	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1495	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1496}
1497
1498int hardlink_check_uid = 0;
1499SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1500    &hardlink_check_uid, 0,
1501    "Unprivileged processes cannot create hard links to files owned by other "
1502    "users");
1503static int hardlink_check_gid = 0;
1504SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1505    &hardlink_check_gid, 0,
1506    "Unprivileged processes cannot create hard links to files owned by other "
1507    "groups");
1508
1509static int
1510can_hardlink(struct vnode *vp, struct ucred *cred)
1511{
1512	struct vattr va;
1513	int error;
1514
1515	if (!hardlink_check_uid && !hardlink_check_gid)
1516		return (0);
1517
1518	error = VOP_GETATTR(vp, &va, cred);
1519	if (error != 0)
1520		return (error);
1521
1522	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1523		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1524		if (error != 0)
1525			return (error);
1526	}
1527
1528	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1529		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1530		if (error != 0)
1531			return (error);
1532	}
1533
1534	return (0);
1535}
1536
1537int
1538kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1539{
1540
1541	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1542}
1543
1544int
1545kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1546    enum uio_seg segflg, int follow)
1547{
1548	struct vnode *vp;
1549	struct mount *mp;
1550	struct nameidata nd;
1551	cap_rights_t rights;
1552	int error;
1553
1554again:
1555	bwillwrite();
1556	NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
1557
1558	if ((error = namei(&nd)) != 0)
1559		return (error);
1560	NDFREE(&nd, NDF_ONLY_PNBUF);
1561	vp = nd.ni_vp;
1562	if (vp->v_type == VDIR) {
1563		vrele(vp);
1564		return (EPERM);		/* POSIX */
1565	}
1566	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 |
1567	    NOCACHE, segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT),
1568	    td);
1569	if ((error = namei(&nd)) == 0) {
1570		if (nd.ni_vp != NULL) {
1571			NDFREE(&nd, NDF_ONLY_PNBUF);
1572			if (nd.ni_dvp == nd.ni_vp)
1573				vrele(nd.ni_dvp);
1574			else
1575				vput(nd.ni_dvp);
1576			vrele(nd.ni_vp);
1577			vrele(vp);
1578			return (EEXIST);
1579		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
1580			/*
1581			 * Cross-device link.  No need to recheck
1582			 * vp->v_type, since it cannot change, except
1583			 * to VBAD.
1584			 */
1585			NDFREE(&nd, NDF_ONLY_PNBUF);
1586			vput(nd.ni_dvp);
1587			vrele(vp);
1588			return (EXDEV);
1589		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
1590			error = can_hardlink(vp, td->td_ucred);
1591#ifdef MAC
1592			if (error == 0)
1593				error = mac_vnode_check_link(td->td_ucred,
1594				    nd.ni_dvp, vp, &nd.ni_cnd);
1595#endif
1596			if (error != 0) {
1597				vput(vp);
1598				vput(nd.ni_dvp);
1599				NDFREE(&nd, NDF_ONLY_PNBUF);
1600				return (error);
1601			}
1602			error = vn_start_write(vp, &mp, V_NOWAIT);
1603			if (error != 0) {
1604				vput(vp);
1605				vput(nd.ni_dvp);
1606				NDFREE(&nd, NDF_ONLY_PNBUF);
1607				error = vn_start_write(NULL, &mp,
1608				    V_XSLEEP | PCATCH);
1609				if (error != 0)
1610					return (error);
1611				goto again;
1612			}
1613			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1614			VOP_UNLOCK(vp, 0);
1615			vput(nd.ni_dvp);
1616			vn_finished_write(mp);
1617			NDFREE(&nd, NDF_ONLY_PNBUF);
1618		} else {
1619			vput(nd.ni_dvp);
1620			NDFREE(&nd, NDF_ONLY_PNBUF);
1621			vrele(vp);
1622			goto again;
1623		}
1624	}
1625	vrele(vp);
1626	return (error);
1627}
1628
1629/*
1630 * Make a symbolic link.
1631 */
1632#ifndef _SYS_SYSPROTO_H_
1633struct symlink_args {
1634	char	*path;
1635	char	*link;
1636};
1637#endif
1638int
1639sys_symlink(td, uap)
1640	struct thread *td;
1641	register struct symlink_args /* {
1642		char *path;
1643		char *link;
1644	} */ *uap;
1645{
1646
1647	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1648}
1649
1650#ifndef _SYS_SYSPROTO_H_
1651struct symlinkat_args {
1652	char	*path;
1653	int	fd;
1654	char	*path2;
1655};
1656#endif
1657int
1658sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1659{
1660
1661	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1662	    UIO_USERSPACE));
1663}
1664
1665int
1666kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1667{
1668
1669	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1670}
1671
1672int
1673kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1674    enum uio_seg segflg)
1675{
1676	struct mount *mp;
1677	struct vattr vattr;
1678	char *syspath;
1679	struct nameidata nd;
1680	int error;
1681	cap_rights_t rights;
1682
1683	if (segflg == UIO_SYSSPACE) {
1684		syspath = path1;
1685	} else {
1686		syspath = uma_zalloc(namei_zone, M_WAITOK);
1687		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1688			goto out;
1689	}
1690	AUDIT_ARG_TEXT(syspath);
1691restart:
1692	bwillwrite();
1693	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1694	    NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
1695	    td);
1696	if ((error = namei(&nd)) != 0)
1697		goto out;
1698	if (nd.ni_vp) {
1699		NDFREE(&nd, NDF_ONLY_PNBUF);
1700		if (nd.ni_vp == nd.ni_dvp)
1701			vrele(nd.ni_dvp);
1702		else
1703			vput(nd.ni_dvp);
1704		vrele(nd.ni_vp);
1705		error = EEXIST;
1706		goto out;
1707	}
1708	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1709		NDFREE(&nd, NDF_ONLY_PNBUF);
1710		vput(nd.ni_dvp);
1711		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1712			goto out;
1713		goto restart;
1714	}
1715	VATTR_NULL(&vattr);
1716	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1717#ifdef MAC
1718	vattr.va_type = VLNK;
1719	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1720	    &vattr);
1721	if (error != 0)
1722		goto out2;
1723#endif
1724	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1725	if (error == 0)
1726		vput(nd.ni_vp);
1727#ifdef MAC
1728out2:
1729#endif
1730	NDFREE(&nd, NDF_ONLY_PNBUF);
1731	vput(nd.ni_dvp);
1732	vn_finished_write(mp);
1733out:
1734	if (segflg != UIO_SYSSPACE)
1735		uma_zfree(namei_zone, syspath);
1736	return (error);
1737}
1738
1739/*
1740 * Delete a whiteout from the filesystem.
1741 */
1742int
1743sys_undelete(td, uap)
1744	struct thread *td;
1745	register struct undelete_args /* {
1746		char *path;
1747	} */ *uap;
1748{
1749	struct mount *mp;
1750	struct nameidata nd;
1751	int error;
1752
1753restart:
1754	bwillwrite();
1755	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1756	    UIO_USERSPACE, uap->path, td);
1757	error = namei(&nd);
1758	if (error != 0)
1759		return (error);
1760
1761	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1762		NDFREE(&nd, NDF_ONLY_PNBUF);
1763		if (nd.ni_vp == nd.ni_dvp)
1764			vrele(nd.ni_dvp);
1765		else
1766			vput(nd.ni_dvp);
1767		if (nd.ni_vp)
1768			vrele(nd.ni_vp);
1769		return (EEXIST);
1770	}
1771	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1772		NDFREE(&nd, NDF_ONLY_PNBUF);
1773		vput(nd.ni_dvp);
1774		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1775			return (error);
1776		goto restart;
1777	}
1778	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1779	NDFREE(&nd, NDF_ONLY_PNBUF);
1780	vput(nd.ni_dvp);
1781	vn_finished_write(mp);
1782	return (error);
1783}
1784
1785/*
1786 * Delete a name from the filesystem.
1787 */
1788#ifndef _SYS_SYSPROTO_H_
1789struct unlink_args {
1790	char	*path;
1791};
1792#endif
1793int
1794sys_unlink(td, uap)
1795	struct thread *td;
1796	struct unlink_args /* {
1797		char *path;
1798	} */ *uap;
1799{
1800
1801	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1802}
1803
1804#ifndef _SYS_SYSPROTO_H_
1805struct unlinkat_args {
1806	int	fd;
1807	char	*path;
1808	int	flag;
1809};
1810#endif
1811int
1812sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1813{
1814	int flag = uap->flag;
1815	int fd = uap->fd;
1816	char *path = uap->path;
1817
1818	if (flag & ~AT_REMOVEDIR)
1819		return (EINVAL);
1820
1821	if (flag & AT_REMOVEDIR)
1822		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1823	else
1824		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1825}
1826
1827int
1828kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1829{
1830
1831	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1832}
1833
1834int
1835kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1836    ino_t oldinum)
1837{
1838	struct mount *mp;
1839	struct vnode *vp;
1840	struct nameidata nd;
1841	struct stat sb;
1842	cap_rights_t rights;
1843	int error;
1844
1845restart:
1846	bwillwrite();
1847	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1848	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1849	if ((error = namei(&nd)) != 0)
1850		return (error == EINVAL ? EPERM : error);
1851	vp = nd.ni_vp;
1852	if (vp->v_type == VDIR && oldinum == 0) {
1853		error = EPERM;		/* POSIX */
1854	} else if (oldinum != 0 &&
1855		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1856		  sb.st_ino != oldinum) {
1857			error = EIDRM;	/* Identifier removed */
1858	} else {
1859		/*
1860		 * The root of a mounted filesystem cannot be deleted.
1861		 *
1862		 * XXX: can this only be a VDIR case?
1863		 */
1864		if (vp->v_vflag & VV_ROOT)
1865			error = EBUSY;
1866	}
1867	if (error == 0) {
1868		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1869			NDFREE(&nd, NDF_ONLY_PNBUF);
1870			vput(nd.ni_dvp);
1871			if (vp == nd.ni_dvp)
1872				vrele(vp);
1873			else
1874				vput(vp);
1875			if ((error = vn_start_write(NULL, &mp,
1876			    V_XSLEEP | PCATCH)) != 0)
1877				return (error);
1878			goto restart;
1879		}
1880#ifdef MAC
1881		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1882		    &nd.ni_cnd);
1883		if (error != 0)
1884			goto out;
1885#endif
1886		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1887		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1888#ifdef MAC
1889out:
1890#endif
1891		vn_finished_write(mp);
1892	}
1893	NDFREE(&nd, NDF_ONLY_PNBUF);
1894	vput(nd.ni_dvp);
1895	if (vp == nd.ni_dvp)
1896		vrele(vp);
1897	else
1898		vput(vp);
1899	return (error);
1900}
1901
1902/*
1903 * Reposition read/write file offset.
1904 */
1905#ifndef _SYS_SYSPROTO_H_
1906struct lseek_args {
1907	int	fd;
1908	int	pad;
1909	off_t	offset;
1910	int	whence;
1911};
1912#endif
1913int
1914sys_lseek(td, uap)
1915	struct thread *td;
1916	register struct lseek_args /* {
1917		int fd;
1918		int pad;
1919		off_t offset;
1920		int whence;
1921	} */ *uap;
1922{
1923	struct file *fp;
1924	cap_rights_t rights;
1925	int error;
1926
1927	AUDIT_ARG_FD(uap->fd);
1928	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1929	if (error != 0)
1930		return (error);
1931	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1932	    fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
1933	fdrop(fp, td);
1934	return (error);
1935}
1936
1937#if defined(COMPAT_43)
1938/*
1939 * Reposition read/write file offset.
1940 */
1941#ifndef _SYS_SYSPROTO_H_
1942struct olseek_args {
1943	int	fd;
1944	long	offset;
1945	int	whence;
1946};
1947#endif
1948int
1949olseek(td, uap)
1950	struct thread *td;
1951	register struct olseek_args /* {
1952		int fd;
1953		long offset;
1954		int whence;
1955	} */ *uap;
1956{
1957	struct lseek_args /* {
1958		int fd;
1959		int pad;
1960		off_t offset;
1961		int whence;
1962	} */ nuap;
1963
1964	nuap.fd = uap->fd;
1965	nuap.offset = uap->offset;
1966	nuap.whence = uap->whence;
1967	return (sys_lseek(td, &nuap));
1968}
1969#endif /* COMPAT_43 */
1970
1971/* Version with the 'pad' argument */
1972int
1973freebsd6_lseek(td, uap)
1974	struct thread *td;
1975	register struct freebsd6_lseek_args *uap;
1976{
1977	struct lseek_args ouap;
1978
1979	ouap.fd = uap->fd;
1980	ouap.offset = uap->offset;
1981	ouap.whence = uap->whence;
1982	return (sys_lseek(td, &ouap));
1983}
1984
1985/*
1986 * Check access permissions using passed credentials.
1987 */
1988static int
1989vn_access(vp, user_flags, cred, td)
1990	struct vnode	*vp;
1991	int		user_flags;
1992	struct ucred	*cred;
1993	struct thread	*td;
1994{
1995	accmode_t accmode;
1996	int error;
1997
1998	/* Flags == 0 means only check for existence. */
1999	error = 0;
2000	if (user_flags) {
2001		accmode = 0;
2002		if (user_flags & R_OK)
2003			accmode |= VREAD;
2004		if (user_flags & W_OK)
2005			accmode |= VWRITE;
2006		if (user_flags & X_OK)
2007			accmode |= VEXEC;
2008#ifdef MAC
2009		error = mac_vnode_check_access(cred, vp, accmode);
2010		if (error != 0)
2011			return (error);
2012#endif
2013		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2014			error = VOP_ACCESS(vp, accmode, cred, td);
2015	}
2016	return (error);
2017}
2018
2019/*
2020 * Check access permissions using "real" credentials.
2021 */
2022#ifndef _SYS_SYSPROTO_H_
2023struct access_args {
2024	char	*path;
2025	int	amode;
2026};
2027#endif
2028int
2029sys_access(td, uap)
2030	struct thread *td;
2031	register struct access_args /* {
2032		char *path;
2033		int amode;
2034	} */ *uap;
2035{
2036
2037	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2038}
2039
2040#ifndef _SYS_SYSPROTO_H_
2041struct faccessat_args {
2042	int	dirfd;
2043	char	*path;
2044	int	amode;
2045	int	flag;
2046}
2047#endif
2048int
2049sys_faccessat(struct thread *td, struct faccessat_args *uap)
2050{
2051
2052	if (uap->flag & ~AT_EACCESS)
2053		return (EINVAL);
2054	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2055	    uap->amode));
2056}
2057
2058int
2059kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2060{
2061
2062	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2063}
2064
2065int
2066kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2067    int flag, int amode)
2068{
2069	struct ucred *cred, *tmpcred;
2070	struct vnode *vp;
2071	struct nameidata nd;
2072	cap_rights_t rights;
2073	int error;
2074
2075	/*
2076	 * Create and modify a temporary credential instead of one that
2077	 * is potentially shared.
2078	 */
2079	if (!(flag & AT_EACCESS)) {
2080		cred = td->td_ucred;
2081		tmpcred = crdup(cred);
2082		tmpcred->cr_uid = cred->cr_ruid;
2083		tmpcred->cr_groups[0] = cred->cr_rgid;
2084		td->td_ucred = tmpcred;
2085	} else
2086		cred = tmpcred = td->td_ucred;
2087	AUDIT_ARG_VALUE(amode);
2088	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
2089	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
2090	    td);
2091	if ((error = namei(&nd)) != 0)
2092		goto out1;
2093	vp = nd.ni_vp;
2094
2095	error = vn_access(vp, amode, tmpcred, td);
2096	NDFREE(&nd, NDF_ONLY_PNBUF);
2097	vput(vp);
2098out1:
2099	if (!(flag & AT_EACCESS)) {
2100		td->td_ucred = cred;
2101		crfree(tmpcred);
2102	}
2103	return (error);
2104}
2105
2106/*
2107 * Check access permissions using "effective" credentials.
2108 */
2109#ifndef _SYS_SYSPROTO_H_
2110struct eaccess_args {
2111	char	*path;
2112	int	amode;
2113};
2114#endif
2115int
2116sys_eaccess(td, uap)
2117	struct thread *td;
2118	register struct eaccess_args /* {
2119		char *path;
2120		int amode;
2121	} */ *uap;
2122{
2123
2124	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2125}
2126
2127int
2128kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2129{
2130
2131	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2132}
2133
2134#if defined(COMPAT_43)
2135/*
2136 * Get file status; this version follows links.
2137 */
2138#ifndef _SYS_SYSPROTO_H_
2139struct ostat_args {
2140	char	*path;
2141	struct ostat *ub;
2142};
2143#endif
2144int
2145ostat(td, uap)
2146	struct thread *td;
2147	register struct ostat_args /* {
2148		char *path;
2149		struct ostat *ub;
2150	} */ *uap;
2151{
2152	struct stat sb;
2153	struct ostat osb;
2154	int error;
2155
2156	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2157	if (error != 0)
2158		return (error);
2159	cvtstat(&sb, &osb);
2160	return (copyout(&osb, uap->ub, sizeof (osb)));
2161}
2162
2163/*
2164 * Get file status; this version does not follow links.
2165 */
2166#ifndef _SYS_SYSPROTO_H_
2167struct olstat_args {
2168	char	*path;
2169	struct ostat *ub;
2170};
2171#endif
2172int
2173olstat(td, uap)
2174	struct thread *td;
2175	register struct olstat_args /* {
2176		char *path;
2177		struct ostat *ub;
2178	} */ *uap;
2179{
2180	struct stat sb;
2181	struct ostat osb;
2182	int error;
2183
2184	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2185	if (error != 0)
2186		return (error);
2187	cvtstat(&sb, &osb);
2188	return (copyout(&osb, uap->ub, sizeof (osb)));
2189}
2190
2191/*
2192 * Convert from an old to a new stat structure.
2193 */
2194void
2195cvtstat(st, ost)
2196	struct stat *st;
2197	struct ostat *ost;
2198{
2199
2200	ost->st_dev = st->st_dev;
2201	ost->st_ino = st->st_ino;
2202	ost->st_mode = st->st_mode;
2203	ost->st_nlink = st->st_nlink;
2204	ost->st_uid = st->st_uid;
2205	ost->st_gid = st->st_gid;
2206	ost->st_rdev = st->st_rdev;
2207	if (st->st_size < (quad_t)1 << 32)
2208		ost->st_size = st->st_size;
2209	else
2210		ost->st_size = -2;
2211	ost->st_atim = st->st_atim;
2212	ost->st_mtim = st->st_mtim;
2213	ost->st_ctim = st->st_ctim;
2214	ost->st_blksize = st->st_blksize;
2215	ost->st_blocks = st->st_blocks;
2216	ost->st_flags = st->st_flags;
2217	ost->st_gen = st->st_gen;
2218}
2219#endif /* COMPAT_43 */
2220
2221/*
2222 * Get file status; this version follows links.
2223 */
2224#ifndef _SYS_SYSPROTO_H_
2225struct stat_args {
2226	char	*path;
2227	struct stat *ub;
2228};
2229#endif
2230int
2231sys_stat(td, uap)
2232	struct thread *td;
2233	register struct stat_args /* {
2234		char *path;
2235		struct stat *ub;
2236	} */ *uap;
2237{
2238	struct stat sb;
2239	int error;
2240
2241	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2242	if (error == 0)
2243		error = copyout(&sb, uap->ub, sizeof (sb));
2244	return (error);
2245}
2246
2247#ifndef _SYS_SYSPROTO_H_
2248struct fstatat_args {
2249	int	fd;
2250	char	*path;
2251	struct stat	*buf;
2252	int	flag;
2253}
2254#endif
2255int
2256sys_fstatat(struct thread *td, struct fstatat_args *uap)
2257{
2258	struct stat sb;
2259	int error;
2260
2261	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2262	    UIO_USERSPACE, &sb);
2263	if (error == 0)
2264		error = copyout(&sb, uap->buf, sizeof (sb));
2265	return (error);
2266}
2267
2268int
2269kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2270{
2271
2272	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2273}
2274
2275int
2276kern_statat(struct thread *td, int flag, int fd, char *path,
2277    enum uio_seg pathseg, struct stat *sbp)
2278{
2279
2280	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2281}
2282
2283int
2284kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2285    enum uio_seg pathseg, struct stat *sbp,
2286    void (*hook)(struct vnode *vp, struct stat *sbp))
2287{
2288	struct nameidata nd;
2289	struct stat sb;
2290	cap_rights_t rights;
2291	int error;
2292
2293	if (flag & ~AT_SYMLINK_NOFOLLOW)
2294		return (EINVAL);
2295
2296	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2297	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2298	    cap_rights_init(&rights, CAP_FSTAT), td);
2299
2300	if ((error = namei(&nd)) != 0)
2301		return (error);
2302	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2303	if (error == 0) {
2304		SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
2305		if (S_ISREG(sb.st_mode))
2306			SDT_PROBE2(vfs, , stat, reg, path, pathseg);
2307		if (__predict_false(hook != NULL))
2308			hook(nd.ni_vp, &sb);
2309	}
2310	NDFREE(&nd, NDF_ONLY_PNBUF);
2311	vput(nd.ni_vp);
2312	if (error != 0)
2313		return (error);
2314	*sbp = sb;
2315#ifdef KTRACE
2316	if (KTRPOINT(td, KTR_STRUCT))
2317		ktrstat(&sb);
2318#endif
2319	return (0);
2320}
2321
2322/*
2323 * Get file status; this version does not follow links.
2324 */
2325#ifndef _SYS_SYSPROTO_H_
2326struct lstat_args {
2327	char	*path;
2328	struct stat *ub;
2329};
2330#endif
2331int
2332sys_lstat(td, uap)
2333	struct thread *td;
2334	register struct lstat_args /* {
2335		char *path;
2336		struct stat *ub;
2337	} */ *uap;
2338{
2339	struct stat sb;
2340	int error;
2341
2342	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2343	if (error == 0)
2344		error = copyout(&sb, uap->ub, sizeof (sb));
2345	return (error);
2346}
2347
2348int
2349kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2350{
2351
2352	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2353	    sbp));
2354}
2355
2356/*
2357 * Implementation of the NetBSD [l]stat() functions.
2358 */
2359void
2360cvtnstat(sb, nsb)
2361	struct stat *sb;
2362	struct nstat *nsb;
2363{
2364
2365	bzero(nsb, sizeof *nsb);
2366	nsb->st_dev = sb->st_dev;
2367	nsb->st_ino = sb->st_ino;
2368	nsb->st_mode = sb->st_mode;
2369	nsb->st_nlink = sb->st_nlink;
2370	nsb->st_uid = sb->st_uid;
2371	nsb->st_gid = sb->st_gid;
2372	nsb->st_rdev = sb->st_rdev;
2373	nsb->st_atim = sb->st_atim;
2374	nsb->st_mtim = sb->st_mtim;
2375	nsb->st_ctim = sb->st_ctim;
2376	nsb->st_size = sb->st_size;
2377	nsb->st_blocks = sb->st_blocks;
2378	nsb->st_blksize = sb->st_blksize;
2379	nsb->st_flags = sb->st_flags;
2380	nsb->st_gen = sb->st_gen;
2381	nsb->st_birthtim = sb->st_birthtim;
2382}
2383
2384#ifndef _SYS_SYSPROTO_H_
2385struct nstat_args {
2386	char	*path;
2387	struct nstat *ub;
2388};
2389#endif
2390int
2391sys_nstat(td, uap)
2392	struct thread *td;
2393	register struct nstat_args /* {
2394		char *path;
2395		struct nstat *ub;
2396	} */ *uap;
2397{
2398	struct stat sb;
2399	struct nstat nsb;
2400	int error;
2401
2402	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2403	if (error != 0)
2404		return (error);
2405	cvtnstat(&sb, &nsb);
2406	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2407}
2408
2409/*
2410 * NetBSD lstat.  Get file status; this version does not follow links.
2411 */
2412#ifndef _SYS_SYSPROTO_H_
2413struct lstat_args {
2414	char	*path;
2415	struct stat *ub;
2416};
2417#endif
2418int
2419sys_nlstat(td, uap)
2420	struct thread *td;
2421	register struct nlstat_args /* {
2422		char *path;
2423		struct nstat *ub;
2424	} */ *uap;
2425{
2426	struct stat sb;
2427	struct nstat nsb;
2428	int error;
2429
2430	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2431	if (error != 0)
2432		return (error);
2433	cvtnstat(&sb, &nsb);
2434	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2435}
2436
2437/*
2438 * Get configurable pathname variables.
2439 */
2440#ifndef _SYS_SYSPROTO_H_
2441struct pathconf_args {
2442	char	*path;
2443	int	name;
2444};
2445#endif
2446int
2447sys_pathconf(td, uap)
2448	struct thread *td;
2449	register struct pathconf_args /* {
2450		char *path;
2451		int name;
2452	} */ *uap;
2453{
2454
2455	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2456}
2457
2458#ifndef _SYS_SYSPROTO_H_
2459struct lpathconf_args {
2460	char	*path;
2461	int	name;
2462};
2463#endif
2464int
2465sys_lpathconf(td, uap)
2466	struct thread *td;
2467	register struct lpathconf_args /* {
2468		char *path;
2469		int name;
2470	} */ *uap;
2471{
2472
2473	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2474	    NOFOLLOW));
2475}
2476
2477int
2478kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2479    u_long flags)
2480{
2481	struct nameidata nd;
2482	int error;
2483
2484	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2485	    pathseg, path, td);
2486	if ((error = namei(&nd)) != 0)
2487		return (error);
2488	NDFREE(&nd, NDF_ONLY_PNBUF);
2489
2490	/* If asynchronous I/O is available, it works for all files. */
2491	if (name == _PC_ASYNC_IO)
2492		td->td_retval[0] = async_io_version;
2493	else
2494		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2495	vput(nd.ni_vp);
2496	return (error);
2497}
2498
2499/*
2500 * Return target name of a symbolic link.
2501 */
2502#ifndef _SYS_SYSPROTO_H_
2503struct readlink_args {
2504	char	*path;
2505	char	*buf;
2506	size_t	count;
2507};
2508#endif
2509int
2510sys_readlink(td, uap)
2511	struct thread *td;
2512	register struct readlink_args /* {
2513		char *path;
2514		char *buf;
2515		size_t count;
2516	} */ *uap;
2517{
2518
2519	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2520	    UIO_USERSPACE, uap->count));
2521}
2522#ifndef _SYS_SYSPROTO_H_
2523struct readlinkat_args {
2524	int	fd;
2525	char	*path;
2526	char	*buf;
2527	size_t	bufsize;
2528};
2529#endif
2530int
2531sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2532{
2533
2534	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2535	    uap->buf, UIO_USERSPACE, uap->bufsize));
2536}
2537
2538int
2539kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2540    enum uio_seg bufseg, size_t count)
2541{
2542
2543	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2544	    count));
2545}
2546
2547int
2548kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2549    char *buf, enum uio_seg bufseg, size_t count)
2550{
2551	struct vnode *vp;
2552	struct iovec aiov;
2553	struct uio auio;
2554	struct nameidata nd;
2555	int error;
2556
2557	if (count > IOSIZE_MAX)
2558		return (EINVAL);
2559
2560	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2561	    pathseg, path, fd, td);
2562
2563	if ((error = namei(&nd)) != 0)
2564		return (error);
2565	NDFREE(&nd, NDF_ONLY_PNBUF);
2566	vp = nd.ni_vp;
2567#ifdef MAC
2568	error = mac_vnode_check_readlink(td->td_ucred, vp);
2569	if (error != 0) {
2570		vput(vp);
2571		return (error);
2572	}
2573#endif
2574	if (vp->v_type != VLNK)
2575		error = EINVAL;
2576	else {
2577		aiov.iov_base = buf;
2578		aiov.iov_len = count;
2579		auio.uio_iov = &aiov;
2580		auio.uio_iovcnt = 1;
2581		auio.uio_offset = 0;
2582		auio.uio_rw = UIO_READ;
2583		auio.uio_segflg = bufseg;
2584		auio.uio_td = td;
2585		auio.uio_resid = count;
2586		error = VOP_READLINK(vp, &auio, td->td_ucred);
2587		td->td_retval[0] = count - auio.uio_resid;
2588	}
2589	vput(vp);
2590	return (error);
2591}
2592
2593/*
2594 * Common implementation code for chflags() and fchflags().
2595 */
2596static int
2597setfflags(td, vp, flags)
2598	struct thread *td;
2599	struct vnode *vp;
2600	u_long flags;
2601{
2602	struct mount *mp;
2603	struct vattr vattr;
2604	int error;
2605
2606	/* We can't support the value matching VNOVAL. */
2607	if (flags == VNOVAL)
2608		return (EOPNOTSUPP);
2609
2610	/*
2611	 * Prevent non-root users from setting flags on devices.  When
2612	 * a device is reused, users can retain ownership of the device
2613	 * if they are allowed to set flags and programs assume that
2614	 * chown can't fail when done as root.
2615	 */
2616	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2617		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2618		if (error != 0)
2619			return (error);
2620	}
2621
2622	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2623		return (error);
2624	VATTR_NULL(&vattr);
2625	vattr.va_flags = flags;
2626	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2627#ifdef MAC
2628	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2629	if (error == 0)
2630#endif
2631		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2632	VOP_UNLOCK(vp, 0);
2633	vn_finished_write(mp);
2634	return (error);
2635}
2636
2637/*
2638 * Change flags of a file given a path name.
2639 */
2640#ifndef _SYS_SYSPROTO_H_
2641struct chflags_args {
2642	const char *path;
2643	u_long	flags;
2644};
2645#endif
2646int
2647sys_chflags(td, uap)
2648	struct thread *td;
2649	register struct chflags_args /* {
2650		const char *path;
2651		u_long flags;
2652	} */ *uap;
2653{
2654
2655	return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
2656}
2657
2658#ifndef _SYS_SYSPROTO_H_
2659struct chflagsat_args {
2660	int	fd;
2661	const char *path;
2662	u_long	flags;
2663	int	atflag;
2664}
2665#endif
2666int
2667sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2668{
2669	int fd = uap->fd;
2670	const char *path = uap->path;
2671	u_long flags = uap->flags;
2672	int atflag = uap->atflag;
2673
2674	if (atflag & ~AT_SYMLINK_NOFOLLOW)
2675		return (EINVAL);
2676
2677	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2678}
2679
2680static int
2681kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
2682    u_long flags)
2683{
2684
2685	return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
2686}
2687
2688/*
2689 * Same as chflags() but doesn't follow symlinks.
2690 */
2691int
2692sys_lchflags(td, uap)
2693	struct thread *td;
2694	register struct lchflags_args /* {
2695		const char *path;
2696		u_long flags;
2697	} */ *uap;
2698{
2699
2700	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2701	    uap->flags, AT_SYMLINK_NOFOLLOW));
2702}
2703
2704static int
2705kern_chflagsat(struct thread *td, int fd, const char *path,
2706    enum uio_seg pathseg, u_long flags, int atflag)
2707{
2708	struct nameidata nd;
2709	cap_rights_t rights;
2710	int error, follow;
2711
2712	AUDIT_ARG_FFLAGS(flags);
2713	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2714	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2715	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
2716	if ((error = namei(&nd)) != 0)
2717		return (error);
2718	NDFREE(&nd, NDF_ONLY_PNBUF);
2719	error = setfflags(td, nd.ni_vp, flags);
2720	vrele(nd.ni_vp);
2721	return (error);
2722}
2723
2724/*
2725 * Change flags of a file given a file descriptor.
2726 */
2727#ifndef _SYS_SYSPROTO_H_
2728struct fchflags_args {
2729	int	fd;
2730	u_long	flags;
2731};
2732#endif
2733int
2734sys_fchflags(td, uap)
2735	struct thread *td;
2736	register struct fchflags_args /* {
2737		int fd;
2738		u_long flags;
2739	} */ *uap;
2740{
2741	struct file *fp;
2742	cap_rights_t rights;
2743	int error;
2744
2745	AUDIT_ARG_FD(uap->fd);
2746	AUDIT_ARG_FFLAGS(uap->flags);
2747	error = getvnode(td->td_proc->p_fd, uap->fd,
2748	    cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
2749	if (error != 0)
2750		return (error);
2751#ifdef AUDIT
2752	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2753	AUDIT_ARG_VNODE1(fp->f_vnode);
2754	VOP_UNLOCK(fp->f_vnode, 0);
2755#endif
2756	error = setfflags(td, fp->f_vnode, uap->flags);
2757	fdrop(fp, td);
2758	return (error);
2759}
2760
2761/*
2762 * Common implementation code for chmod(), lchmod() and fchmod().
2763 */
2764int
2765setfmode(td, cred, vp, mode)
2766	struct thread *td;
2767	struct ucred *cred;
2768	struct vnode *vp;
2769	int mode;
2770{
2771	struct mount *mp;
2772	struct vattr vattr;
2773	int error;
2774
2775	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2776		return (error);
2777	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2778	VATTR_NULL(&vattr);
2779	vattr.va_mode = mode & ALLPERMS;
2780#ifdef MAC
2781	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2782	if (error == 0)
2783#endif
2784		error = VOP_SETATTR(vp, &vattr, cred);
2785	VOP_UNLOCK(vp, 0);
2786	vn_finished_write(mp);
2787	return (error);
2788}
2789
2790/*
2791 * Change mode of a file given path name.
2792 */
2793#ifndef _SYS_SYSPROTO_H_
2794struct chmod_args {
2795	char	*path;
2796	int	mode;
2797};
2798#endif
2799int
2800sys_chmod(td, uap)
2801	struct thread *td;
2802	register struct chmod_args /* {
2803		char *path;
2804		int mode;
2805	} */ *uap;
2806{
2807
2808	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2809}
2810
2811#ifndef _SYS_SYSPROTO_H_
2812struct fchmodat_args {
2813	int	dirfd;
2814	char	*path;
2815	mode_t	mode;
2816	int	flag;
2817}
2818#endif
2819int
2820sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2821{
2822	int flag = uap->flag;
2823	int fd = uap->fd;
2824	char *path = uap->path;
2825	mode_t mode = uap->mode;
2826
2827	if (flag & ~AT_SYMLINK_NOFOLLOW)
2828		return (EINVAL);
2829
2830	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2831}
2832
2833int
2834kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2835{
2836
2837	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2838}
2839
2840/*
2841 * Change mode of a file given path name (don't follow links.)
2842 */
2843#ifndef _SYS_SYSPROTO_H_
2844struct lchmod_args {
2845	char	*path;
2846	int	mode;
2847};
2848#endif
2849int
2850sys_lchmod(td, uap)
2851	struct thread *td;
2852	register struct lchmod_args /* {
2853		char *path;
2854		int mode;
2855	} */ *uap;
2856{
2857
2858	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2859	    uap->mode, AT_SYMLINK_NOFOLLOW));
2860}
2861
2862int
2863kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2864    mode_t mode, int flag)
2865{
2866	struct nameidata nd;
2867	cap_rights_t rights;
2868	int error, follow;
2869
2870	AUDIT_ARG_MODE(mode);
2871	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2872	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2873	    cap_rights_init(&rights, CAP_FCHMOD), td);
2874	if ((error = namei(&nd)) != 0)
2875		return (error);
2876	NDFREE(&nd, NDF_ONLY_PNBUF);
2877	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2878	vrele(nd.ni_vp);
2879	return (error);
2880}
2881
2882/*
2883 * Change mode of a file given a file descriptor.
2884 */
2885#ifndef _SYS_SYSPROTO_H_
2886struct fchmod_args {
2887	int	fd;
2888	int	mode;
2889};
2890#endif
2891int
2892sys_fchmod(struct thread *td, struct fchmod_args *uap)
2893{
2894	struct file *fp;
2895	cap_rights_t rights;
2896	int error;
2897
2898	AUDIT_ARG_FD(uap->fd);
2899	AUDIT_ARG_MODE(uap->mode);
2900
2901	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2902	if (error != 0)
2903		return (error);
2904	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2905	fdrop(fp, td);
2906	return (error);
2907}
2908
2909/*
2910 * Common implementation for chown(), lchown(), and fchown()
2911 */
2912int
2913setfown(td, cred, vp, uid, gid)
2914	struct thread *td;
2915	struct ucred *cred;
2916	struct vnode *vp;
2917	uid_t uid;
2918	gid_t gid;
2919{
2920	struct mount *mp;
2921	struct vattr vattr;
2922	int error;
2923
2924	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2925		return (error);
2926	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2927	VATTR_NULL(&vattr);
2928	vattr.va_uid = uid;
2929	vattr.va_gid = gid;
2930#ifdef MAC
2931	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2932	    vattr.va_gid);
2933	if (error == 0)
2934#endif
2935		error = VOP_SETATTR(vp, &vattr, cred);
2936	VOP_UNLOCK(vp, 0);
2937	vn_finished_write(mp);
2938	return (error);
2939}
2940
2941/*
2942 * Set ownership given a path name.
2943 */
2944#ifndef _SYS_SYSPROTO_H_
2945struct chown_args {
2946	char	*path;
2947	int	uid;
2948	int	gid;
2949};
2950#endif
2951int
2952sys_chown(td, uap)
2953	struct thread *td;
2954	register struct chown_args /* {
2955		char *path;
2956		int uid;
2957		int gid;
2958	} */ *uap;
2959{
2960
2961	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
2962}
2963
2964#ifndef _SYS_SYSPROTO_H_
2965struct fchownat_args {
2966	int fd;
2967	const char * path;
2968	uid_t uid;
2969	gid_t gid;
2970	int flag;
2971};
2972#endif
2973int
2974sys_fchownat(struct thread *td, struct fchownat_args *uap)
2975{
2976	int flag;
2977
2978	flag = uap->flag;
2979	if (flag & ~AT_SYMLINK_NOFOLLOW)
2980		return (EINVAL);
2981
2982	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2983	    uap->gid, uap->flag));
2984}
2985
2986int
2987kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
2988    int gid)
2989{
2990
2991	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
2992}
2993
2994int
2995kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2996    int uid, int gid, int flag)
2997{
2998	struct nameidata nd;
2999	cap_rights_t rights;
3000	int error, follow;
3001
3002	AUDIT_ARG_OWNER(uid, gid);
3003	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3004	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
3005	    cap_rights_init(&rights, CAP_FCHOWN), td);
3006
3007	if ((error = namei(&nd)) != 0)
3008		return (error);
3009	NDFREE(&nd, NDF_ONLY_PNBUF);
3010	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3011	vrele(nd.ni_vp);
3012	return (error);
3013}
3014
3015/*
3016 * Set ownership given a path name, do not cross symlinks.
3017 */
3018#ifndef _SYS_SYSPROTO_H_
3019struct lchown_args {
3020	char	*path;
3021	int	uid;
3022	int	gid;
3023};
3024#endif
3025int
3026sys_lchown(td, uap)
3027	struct thread *td;
3028	register struct lchown_args /* {
3029		char *path;
3030		int uid;
3031		int gid;
3032	} */ *uap;
3033{
3034
3035	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3036}
3037
3038int
3039kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3040    int gid)
3041{
3042
3043	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3044	    AT_SYMLINK_NOFOLLOW));
3045}
3046
3047/*
3048 * Set ownership given a file descriptor.
3049 */
3050#ifndef _SYS_SYSPROTO_H_
3051struct fchown_args {
3052	int	fd;
3053	int	uid;
3054	int	gid;
3055};
3056#endif
3057int
3058sys_fchown(td, uap)
3059	struct thread *td;
3060	register struct fchown_args /* {
3061		int fd;
3062		int uid;
3063		int gid;
3064	} */ *uap;
3065{
3066	struct file *fp;
3067	cap_rights_t rights;
3068	int error;
3069
3070	AUDIT_ARG_FD(uap->fd);
3071	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3072	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
3073	if (error != 0)
3074		return (error);
3075	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3076	fdrop(fp, td);
3077	return (error);
3078}
3079
3080/*
3081 * Common implementation code for utimes(), lutimes(), and futimes().
3082 */
3083static int
3084getutimes(usrtvp, tvpseg, tsp)
3085	const struct timeval *usrtvp;
3086	enum uio_seg tvpseg;
3087	struct timespec *tsp;
3088{
3089	struct timeval tv[2];
3090	const struct timeval *tvp;
3091	int error;
3092
3093	if (usrtvp == NULL) {
3094		vfs_timestamp(&tsp[0]);
3095		tsp[1] = tsp[0];
3096	} else {
3097		if (tvpseg == UIO_SYSSPACE) {
3098			tvp = usrtvp;
3099		} else {
3100			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3101				return (error);
3102			tvp = tv;
3103		}
3104
3105		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3106		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3107			return (EINVAL);
3108		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3109		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3110	}
3111	return (0);
3112}
3113
3114/*
3115 * Common implementation code for futimens(), utimensat().
3116 */
3117#define	UTIMENS_NULL	0x1
3118#define	UTIMENS_EXIT	0x2
3119static int
3120getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
3121    struct timespec *tsp, int *retflags)
3122{
3123	struct timespec tsnow;
3124	int error;
3125
3126	vfs_timestamp(&tsnow);
3127	*retflags = 0;
3128	if (usrtsp == NULL) {
3129		tsp[0] = tsnow;
3130		tsp[1] = tsnow;
3131		*retflags |= UTIMENS_NULL;
3132		return (0);
3133	}
3134	if (tspseg == UIO_SYSSPACE) {
3135		tsp[0] = usrtsp[0];
3136		tsp[1] = usrtsp[1];
3137	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
3138		return (error);
3139	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
3140		*retflags |= UTIMENS_EXIT;
3141	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
3142		*retflags |= UTIMENS_NULL;
3143	if (tsp[0].tv_nsec == UTIME_OMIT)
3144		tsp[0].tv_sec = VNOVAL;
3145	else if (tsp[0].tv_nsec == UTIME_NOW)
3146		tsp[0] = tsnow;
3147	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
3148		return (EINVAL);
3149	if (tsp[1].tv_nsec == UTIME_OMIT)
3150		tsp[1].tv_sec = VNOVAL;
3151	else if (tsp[1].tv_nsec == UTIME_NOW)
3152		tsp[1] = tsnow;
3153	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
3154		return (EINVAL);
3155
3156	return (0);
3157}
3158
3159/*
3160 * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
3161 * and utimensat().
3162 */
3163static int
3164setutimes(td, vp, ts, numtimes, nullflag)
3165	struct thread *td;
3166	struct vnode *vp;
3167	const struct timespec *ts;
3168	int numtimes;
3169	int nullflag;
3170{
3171	struct mount *mp;
3172	struct vattr vattr;
3173	int error, setbirthtime;
3174
3175	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3176		return (error);
3177	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3178	setbirthtime = 0;
3179	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3180	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3181		setbirthtime = 1;
3182	VATTR_NULL(&vattr);
3183	vattr.va_atime = ts[0];
3184	vattr.va_mtime = ts[1];
3185	if (setbirthtime)
3186		vattr.va_birthtime = ts[1];
3187	if (numtimes > 2)
3188		vattr.va_birthtime = ts[2];
3189	if (nullflag)
3190		vattr.va_vaflags |= VA_UTIMES_NULL;
3191#ifdef MAC
3192	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3193	    vattr.va_mtime);
3194#endif
3195	if (error == 0)
3196		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3197	VOP_UNLOCK(vp, 0);
3198	vn_finished_write(mp);
3199	return (error);
3200}
3201
3202/*
3203 * Set the access and modification times of a file.
3204 */
3205#ifndef _SYS_SYSPROTO_H_
3206struct utimes_args {
3207	char	*path;
3208	struct	timeval *tptr;
3209};
3210#endif
3211int
3212sys_utimes(td, uap)
3213	struct thread *td;
3214	register struct utimes_args /* {
3215		char *path;
3216		struct timeval *tptr;
3217	} */ *uap;
3218{
3219
3220	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3221	    UIO_USERSPACE));
3222}
3223
3224#ifndef _SYS_SYSPROTO_H_
3225struct futimesat_args {
3226	int fd;
3227	const char * path;
3228	const struct timeval * times;
3229};
3230#endif
3231int
3232sys_futimesat(struct thread *td, struct futimesat_args *uap)
3233{
3234
3235	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3236	    uap->times, UIO_USERSPACE));
3237}
3238
3239int
3240kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3241    struct timeval *tptr, enum uio_seg tptrseg)
3242{
3243
3244	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3245}
3246
3247int
3248kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3249    struct timeval *tptr, enum uio_seg tptrseg)
3250{
3251	struct nameidata nd;
3252	struct timespec ts[2];
3253	cap_rights_t rights;
3254	int error;
3255
3256	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3257		return (error);
3258	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3259	    cap_rights_init(&rights, CAP_FUTIMES), td);
3260
3261	if ((error = namei(&nd)) != 0)
3262		return (error);
3263	NDFREE(&nd, NDF_ONLY_PNBUF);
3264	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3265	vrele(nd.ni_vp);
3266	return (error);
3267}
3268
3269/*
3270 * Set the access and modification times of a file.
3271 */
3272#ifndef _SYS_SYSPROTO_H_
3273struct lutimes_args {
3274	char	*path;
3275	struct	timeval *tptr;
3276};
3277#endif
3278int
3279sys_lutimes(td, uap)
3280	struct thread *td;
3281	register struct lutimes_args /* {
3282		char *path;
3283		struct timeval *tptr;
3284	} */ *uap;
3285{
3286
3287	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3288	    UIO_USERSPACE));
3289}
3290
3291int
3292kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3293    struct timeval *tptr, enum uio_seg tptrseg)
3294{
3295	struct timespec ts[2];
3296	struct nameidata nd;
3297	int error;
3298
3299	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3300		return (error);
3301	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3302	if ((error = namei(&nd)) != 0)
3303		return (error);
3304	NDFREE(&nd, NDF_ONLY_PNBUF);
3305	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3306	vrele(nd.ni_vp);
3307	return (error);
3308}
3309
3310/*
3311 * Set the access and modification times of a file.
3312 */
3313#ifndef _SYS_SYSPROTO_H_
3314struct futimes_args {
3315	int	fd;
3316	struct	timeval *tptr;
3317};
3318#endif
3319int
3320sys_futimes(td, uap)
3321	struct thread *td;
3322	register struct futimes_args /* {
3323		int  fd;
3324		struct timeval *tptr;
3325	} */ *uap;
3326{
3327
3328	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3329}
3330
3331int
3332kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3333    enum uio_seg tptrseg)
3334{
3335	struct timespec ts[2];
3336	struct file *fp;
3337	cap_rights_t rights;
3338	int error;
3339
3340	AUDIT_ARG_FD(fd);
3341	error = getutimes(tptr, tptrseg, ts);
3342	if (error != 0)
3343		return (error);
3344	error = getvnode(td->td_proc->p_fd, fd,
3345	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
3346	if (error != 0)
3347		return (error);
3348#ifdef AUDIT
3349	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3350	AUDIT_ARG_VNODE1(fp->f_vnode);
3351	VOP_UNLOCK(fp->f_vnode, 0);
3352#endif
3353	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3354	fdrop(fp, td);
3355	return (error);
3356}
3357
3358int
3359sys_futimens(struct thread *td, struct futimens_args *uap)
3360{
3361
3362	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
3363}
3364
3365int
3366kern_futimens(struct thread *td, int fd, struct timespec *tptr,
3367    enum uio_seg tptrseg)
3368{
3369	struct timespec ts[2];
3370	struct file *fp;
3371	cap_rights_t rights;
3372	int error, flags;
3373
3374	AUDIT_ARG_FD(fd);
3375	error = getutimens(tptr, tptrseg, ts, &flags);
3376	if (error != 0)
3377		return (error);
3378	if (flags & UTIMENS_EXIT)
3379		return (0);
3380	error = getvnode(td->td_proc->p_fd, fd,
3381	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
3382	if (error != 0)
3383		return (error);
3384#ifdef AUDIT
3385	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3386	AUDIT_ARG_VNODE1(fp->f_vnode);
3387	VOP_UNLOCK(fp->f_vnode, 0);
3388#endif
3389	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
3390	fdrop(fp, td);
3391	return (error);
3392}
3393
3394int
3395sys_utimensat(struct thread *td, struct utimensat_args *uap)
3396{
3397
3398	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
3399	    uap->times, UIO_USERSPACE, uap->flag));
3400}
3401
3402int
3403kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3404    struct timespec *tptr, enum uio_seg tptrseg, int flag)
3405{
3406	struct nameidata nd;
3407	struct timespec ts[2];
3408	int error, flags;
3409
3410	if (flag & ~AT_SYMLINK_NOFOLLOW)
3411		return (EINVAL);
3412
3413	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
3414		return (error);
3415	NDINIT_AT(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
3416	    FOLLOW) | AUDITVNODE1, pathseg, path, fd, td);
3417	if ((error = namei(&nd)) != 0)
3418		return (error);
3419	/*
3420	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
3421	 * POSIX states:
3422	 * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
3423	 * "Search permission is denied by a component of the path prefix."
3424	 */
3425	NDFREE(&nd, NDF_ONLY_PNBUF);
3426	if ((flags & UTIMENS_EXIT) == 0)
3427		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
3428	vrele(nd.ni_vp);
3429	return (error);
3430}
3431
3432/*
3433 * Truncate a file given its path name.
3434 */
3435#ifndef _SYS_SYSPROTO_H_
3436struct truncate_args {
3437	char	*path;
3438	int	pad;
3439	off_t	length;
3440};
3441#endif
3442int
3443sys_truncate(td, uap)
3444	struct thread *td;
3445	register struct truncate_args /* {
3446		char *path;
3447		int pad;
3448		off_t length;
3449	} */ *uap;
3450{
3451
3452	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3453}
3454
3455int
3456kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3457{
3458	struct mount *mp;
3459	struct vnode *vp;
3460	void *rl_cookie;
3461	struct vattr vattr;
3462	struct nameidata nd;
3463	int error;
3464
3465	if (length < 0)
3466		return(EINVAL);
3467	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3468	if ((error = namei(&nd)) != 0)
3469		return (error);
3470	vp = nd.ni_vp;
3471	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3472	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3473		vn_rangelock_unlock(vp, rl_cookie);
3474		vrele(vp);
3475		return (error);
3476	}
3477	NDFREE(&nd, NDF_ONLY_PNBUF);
3478	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3479	if (vp->v_type == VDIR)
3480		error = EISDIR;
3481#ifdef MAC
3482	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3483	}
3484#endif
3485	else if ((error = vn_writechk(vp)) == 0 &&
3486	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3487		VATTR_NULL(&vattr);
3488		vattr.va_size = length;
3489		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3490	}
3491	VOP_UNLOCK(vp, 0);
3492	vn_finished_write(mp);
3493	vn_rangelock_unlock(vp, rl_cookie);
3494	vrele(vp);
3495	return (error);
3496}
3497
#if defined(COMPAT_43)
/*
 * COMPAT_43 truncate entry point: takes a 'long' length, repackages the
 * arguments as a struct truncate_args and calls sys_truncate().
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args args;
	int error;

	/* Only path and length are filled in; pad is left untouched. */
	args.path = uap->path;
	args.length = uap->length;
	error = sys_truncate(td, &args);
	return (error);
}
#endif /* COMPAT_43 */
3527
3528/* Versions with the pad argument */
3529int
3530freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3531{
3532	struct truncate_args ouap;
3533
3534	ouap.path = uap->path;
3535	ouap.length = uap->length;
3536	return (sys_truncate(td, &ouap));
3537}
3538
3539int
3540freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3541{
3542	struct ftruncate_args ouap;
3543
3544	ouap.fd = uap->fd;
3545	ouap.length = uap->length;
3546	return (sys_ftruncate(td, &ouap));
3547}
3548
3549/*
3550 * Sync an open file.
3551 */
3552#ifndef _SYS_SYSPROTO_H_
3553struct fsync_args {
3554	int	fd;
3555};
3556#endif
3557int
3558sys_fsync(td, uap)
3559	struct thread *td;
3560	struct fsync_args /* {
3561		int fd;
3562	} */ *uap;
3563{
3564	struct vnode *vp;
3565	struct mount *mp;
3566	struct file *fp;
3567	cap_rights_t rights;
3568	int error, lock_flags;
3569
3570	AUDIT_ARG_FD(uap->fd);
3571	error = getvnode(td->td_proc->p_fd, uap->fd,
3572	    cap_rights_init(&rights, CAP_FSYNC), &fp);
3573	if (error != 0)
3574		return (error);
3575	vp = fp->f_vnode;
3576	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3577	if (error != 0)
3578		goto drop;
3579	if (MNT_SHARED_WRITES(mp) ||
3580	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3581		lock_flags = LK_SHARED;
3582	} else {
3583		lock_flags = LK_EXCLUSIVE;
3584	}
3585	vn_lock(vp, lock_flags | LK_RETRY);
3586	AUDIT_ARG_VNODE1(vp);
3587	if (vp->v_object != NULL) {
3588		VM_OBJECT_WLOCK(vp->v_object);
3589		vm_object_page_clean(vp->v_object, 0, 0, 0);
3590		VM_OBJECT_WUNLOCK(vp->v_object);
3591	}
3592	error = VOP_FSYNC(vp, MNT_WAIT, td);
3593
3594	VOP_UNLOCK(vp, 0);
3595	vn_finished_write(mp);
3596drop:
3597	fdrop(fp, td);
3598	return (error);
3599}
3600
3601/*
3602 * Rename files.  Source and destination must either both be directories, or
3603 * both not be directories.  If target is a directory, it must be empty.
3604 */
3605#ifndef _SYS_SYSPROTO_H_
3606struct rename_args {
3607	char	*from;
3608	char	*to;
3609};
3610#endif
3611int
3612sys_rename(td, uap)
3613	struct thread *td;
3614	register struct rename_args /* {
3615		char *from;
3616		char *to;
3617	} */ *uap;
3618{
3619
3620	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3621}
3622
3623#ifndef _SYS_SYSPROTO_H_
3624struct renameat_args {
3625	int	oldfd;
3626	char	*old;
3627	int	newfd;
3628	char	*new;
3629};
3630#endif
3631int
3632sys_renameat(struct thread *td, struct renameat_args *uap)
3633{
3634
3635	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3636	    UIO_USERSPACE));
3637}
3638
3639int
3640kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3641{
3642
3643	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3644}
3645
3646int
3647kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3648    enum uio_seg pathseg)
3649{
3650	struct mount *mp = NULL;
3651	struct vnode *tvp, *fvp, *tdvp;
3652	struct nameidata fromnd, tond;
3653	cap_rights_t rights;
3654	int error;
3655
3656again:
3657	bwillwrite();
3658#ifdef MAC
3659	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3660	    AUDITVNODE1, pathseg, old, oldfd,
3661	    cap_rights_init(&rights, CAP_RENAMEAT), td);
3662#else
3663	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3664	    pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
3665#endif
3666
3667	if ((error = namei(&fromnd)) != 0)
3668		return (error);
3669#ifdef MAC
3670	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3671	    fromnd.ni_vp, &fromnd.ni_cnd);
3672	VOP_UNLOCK(fromnd.ni_dvp, 0);
3673	if (fromnd.ni_dvp != fromnd.ni_vp)
3674		VOP_UNLOCK(fromnd.ni_vp, 0);
3675#endif
3676	fvp = fromnd.ni_vp;
3677	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3678	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3679	    cap_rights_init(&rights, CAP_LINKAT), td);
3680	if (fromnd.ni_vp->v_type == VDIR)
3681		tond.ni_cnd.cn_flags |= WILLBEDIR;
3682	if ((error = namei(&tond)) != 0) {
3683		/* Translate error code for rename("dir1", "dir2/."). */
3684		if (error == EISDIR && fvp->v_type == VDIR)
3685			error = EINVAL;
3686		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3687		vrele(fromnd.ni_dvp);
3688		vrele(fvp);
3689		goto out1;
3690	}
3691	tdvp = tond.ni_dvp;
3692	tvp = tond.ni_vp;
3693	error = vn_start_write(fvp, &mp, V_NOWAIT);
3694	if (error != 0) {
3695		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3696		NDFREE(&tond, NDF_ONLY_PNBUF);
3697		if (tvp != NULL)
3698			vput(tvp);
3699		if (tdvp == tvp)
3700			vrele(tdvp);
3701		else
3702			vput(tdvp);
3703		vrele(fromnd.ni_dvp);
3704		vrele(fvp);
3705		vrele(tond.ni_startdir);
3706		if (fromnd.ni_startdir != NULL)
3707			vrele(fromnd.ni_startdir);
3708		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
3709		if (error != 0)
3710			return (error);
3711		goto again;
3712	}
3713	if (tvp != NULL) {
3714		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3715			error = ENOTDIR;
3716			goto out;
3717		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3718			error = EISDIR;
3719			goto out;
3720		}
3721#ifdef CAPABILITIES
3722		if (newfd != AT_FDCWD) {
3723			/*
3724			 * If the target already exists we require CAP_UNLINKAT
3725			 * from 'newfd'.
3726			 */
3727			error = cap_check(&tond.ni_filecaps.fc_rights,
3728			    cap_rights_init(&rights, CAP_UNLINKAT));
3729			if (error != 0)
3730				goto out;
3731		}
3732#endif
3733	}
3734	if (fvp == tdvp) {
3735		error = EINVAL;
3736		goto out;
3737	}
3738	/*
3739	 * If the source is the same as the destination (that is, if they
3740	 * are links to the same vnode), then there is nothing to do.
3741	 */
3742	if (fvp == tvp)
3743		error = -1;
3744#ifdef MAC
3745	else
3746		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3747		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3748#endif
3749out:
3750	if (error == 0) {
3751		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3752		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3753		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3754		NDFREE(&tond, NDF_ONLY_PNBUF);
3755	} else {
3756		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3757		NDFREE(&tond, NDF_ONLY_PNBUF);
3758		if (tvp != NULL)
3759			vput(tvp);
3760		if (tdvp == tvp)
3761			vrele(tdvp);
3762		else
3763			vput(tdvp);
3764		vrele(fromnd.ni_dvp);
3765		vrele(fvp);
3766	}
3767	vrele(tond.ni_startdir);
3768	vn_finished_write(mp);
3769out1:
3770	if (fromnd.ni_startdir)
3771		vrele(fromnd.ni_startdir);
3772	if (error == -1)
3773		return (0);
3774	return (error);
3775}
3776
3777/*
3778 * Make a directory file.
3779 */
3780#ifndef _SYS_SYSPROTO_H_
3781struct mkdir_args {
3782	char	*path;
3783	int	mode;
3784};
3785#endif
3786int
3787sys_mkdir(td, uap)
3788	struct thread *td;
3789	register struct mkdir_args /* {
3790		char *path;
3791		int mode;
3792	} */ *uap;
3793{
3794
3795	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3796}
3797
3798#ifndef _SYS_SYSPROTO_H_
3799struct mkdirat_args {
3800	int	fd;
3801	char	*path;
3802	mode_t	mode;
3803};
3804#endif
3805int
3806sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3807{
3808
3809	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3810}
3811
3812int
3813kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3814{
3815
3816	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3817}
3818
3819int
3820kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3821    int mode)
3822{
3823	struct mount *mp;
3824	struct vnode *vp;
3825	struct vattr vattr;
3826	struct nameidata nd;
3827	cap_rights_t rights;
3828	int error;
3829
3830	AUDIT_ARG_MODE(mode);
3831restart:
3832	bwillwrite();
3833	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
3834	    NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
3835	    td);
3836	nd.ni_cnd.cn_flags |= WILLBEDIR;
3837	if ((error = namei(&nd)) != 0)
3838		return (error);
3839	vp = nd.ni_vp;
3840	if (vp != NULL) {
3841		NDFREE(&nd, NDF_ONLY_PNBUF);
3842		/*
3843		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
3844		 * the strange behaviour of leaving the vnode unlocked
3845		 * if the target is the same vnode as the parent.
3846		 */
3847		if (vp == nd.ni_dvp)
3848			vrele(nd.ni_dvp);
3849		else
3850			vput(nd.ni_dvp);
3851		vrele(vp);
3852		return (EEXIST);
3853	}
3854	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3855		NDFREE(&nd, NDF_ONLY_PNBUF);
3856		vput(nd.ni_dvp);
3857		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3858			return (error);
3859		goto restart;
3860	}
3861	VATTR_NULL(&vattr);
3862	vattr.va_type = VDIR;
3863	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3864#ifdef MAC
3865	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3866	    &vattr);
3867	if (error != 0)
3868		goto out;
3869#endif
3870	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3871#ifdef MAC
3872out:
3873#endif
3874	NDFREE(&nd, NDF_ONLY_PNBUF);
3875	vput(nd.ni_dvp);
3876	if (error == 0)
3877		vput(nd.ni_vp);
3878	vn_finished_write(mp);
3879	return (error);
3880}
3881
3882/*
3883 * Remove a directory file.
3884 */
3885#ifndef _SYS_SYSPROTO_H_
3886struct rmdir_args {
3887	char	*path;
3888};
3889#endif
3890int
3891sys_rmdir(td, uap)
3892	struct thread *td;
3893	struct rmdir_args /* {
3894		char *path;
3895	} */ *uap;
3896{
3897
3898	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3899}
3900
3901int
3902kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3903{
3904
3905	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3906}
3907
3908int
3909kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3910{
3911	struct mount *mp;
3912	struct vnode *vp;
3913	struct nameidata nd;
3914	cap_rights_t rights;
3915	int error;
3916
3917restart:
3918	bwillwrite();
3919	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3920	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
3921	if ((error = namei(&nd)) != 0)
3922		return (error);
3923	vp = nd.ni_vp;
3924	if (vp->v_type != VDIR) {
3925		error = ENOTDIR;
3926		goto out;
3927	}
3928	/*
3929	 * No rmdir "." please.
3930	 */
3931	if (nd.ni_dvp == vp) {
3932		error = EINVAL;
3933		goto out;
3934	}
3935	/*
3936	 * The root of a mounted filesystem cannot be deleted.
3937	 */
3938	if (vp->v_vflag & VV_ROOT) {
3939		error = EBUSY;
3940		goto out;
3941	}
3942#ifdef MAC
3943	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3944	    &nd.ni_cnd);
3945	if (error != 0)
3946		goto out;
3947#endif
3948	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3949		NDFREE(&nd, NDF_ONLY_PNBUF);
3950		vput(vp);
3951		if (nd.ni_dvp == vp)
3952			vrele(nd.ni_dvp);
3953		else
3954			vput(nd.ni_dvp);
3955		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3956			return (error);
3957		goto restart;
3958	}
3959	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
3960	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3961	vn_finished_write(mp);
3962out:
3963	NDFREE(&nd, NDF_ONLY_PNBUF);
3964	vput(vp);
3965	if (nd.ni_dvp == vp)
3966		vrele(nd.ni_dvp);
3967	else
3968		vput(nd.ni_dvp);
3969	return (error);
3970}
3971
3972#ifdef COMPAT_43
3973/*
3974 * Read a block of directory entries in a filesystem independent format.
3975 */
3976#ifndef _SYS_SYSPROTO_H_
3977struct ogetdirentries_args {
3978	int	fd;
3979	char	*buf;
3980	u_int	count;
3981	long	*basep;
3982};
3983#endif
3984int
3985ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3986{
3987	long loff;
3988	int error;
3989
3990	error = kern_ogetdirentries(td, uap, &loff);
3991	if (error == 0)
3992		error = copyout(&loff, uap->basep, sizeof(long));
3993	return (error);
3994}
3995
3996int
3997kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3998    long *ploff)
3999{
4000	struct vnode *vp;
4001	struct file *fp;
4002	struct uio auio, kuio;
4003	struct iovec aiov, kiov;
4004	struct dirent *dp, *edp;
4005	cap_rights_t rights;
4006	caddr_t dirbuf;
4007	int error, eofflag, readcnt;
4008	long loff;
4009	off_t foffset;
4010
4011	/* XXX arbitrary sanity limit on `count'. */
4012	if (uap->count > 64 * 1024)
4013		return (EINVAL);
4014	error = getvnode(td->td_proc->p_fd, uap->fd,
4015	    cap_rights_init(&rights, CAP_READ), &fp);
4016	if (error != 0)
4017		return (error);
4018	if ((fp->f_flag & FREAD) == 0) {
4019		fdrop(fp, td);
4020		return (EBADF);
4021	}
4022	vp = fp->f_vnode;
4023	foffset = foffset_lock(fp, 0);
4024unionread:
4025	if (vp->v_type != VDIR) {
4026		foffset_unlock(fp, foffset, 0);
4027		fdrop(fp, td);
4028		return (EINVAL);
4029	}
4030	aiov.iov_base = uap->buf;
4031	aiov.iov_len = uap->count;
4032	auio.uio_iov = &aiov;
4033	auio.uio_iovcnt = 1;
4034	auio.uio_rw = UIO_READ;
4035	auio.uio_segflg = UIO_USERSPACE;
4036	auio.uio_td = td;
4037	auio.uio_resid = uap->count;
4038	vn_lock(vp, LK_SHARED | LK_RETRY);
4039	loff = auio.uio_offset = foffset;
4040#ifdef MAC
4041	error = mac_vnode_check_readdir(td->td_ucred, vp);
4042	if (error != 0) {
4043		VOP_UNLOCK(vp, 0);
4044		foffset_unlock(fp, foffset, FOF_NOUPDATE);
4045		fdrop(fp, td);
4046		return (error);
4047	}
4048#endif
4049#	if (BYTE_ORDER != LITTLE_ENDIAN)
4050		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
4051			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
4052			    NULL, NULL);
4053			foffset = auio.uio_offset;
4054		} else
4055#	endif
4056	{
4057		kuio = auio;
4058		kuio.uio_iov = &kiov;
4059		kuio.uio_segflg = UIO_SYSSPACE;
4060		kiov.iov_len = uap->count;
4061		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
4062		kiov.iov_base = dirbuf;
4063		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
4064			    NULL, NULL);
4065		foffset = kuio.uio_offset;
4066		if (error == 0) {
4067			readcnt = uap->count - kuio.uio_resid;
4068			edp = (struct dirent *)&dirbuf[readcnt];
4069			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
4070#				if (BYTE_ORDER == LITTLE_ENDIAN)
4071					/*
4072					 * The expected low byte of
4073					 * dp->d_namlen is our dp->d_type.
4074					 * The high MBZ byte of dp->d_namlen
4075					 * is our dp->d_namlen.
4076					 */
4077					dp->d_type = dp->d_namlen;
4078					dp->d_namlen = 0;
4079#				else
4080					/*
4081					 * The dp->d_type is the high byte
4082					 * of the expected dp->d_namlen,
4083					 * so must be zero'ed.
4084					 */
4085					dp->d_type = 0;
4086#				endif
4087				if (dp->d_reclen > 0) {
4088					dp = (struct dirent *)
4089					    ((char *)dp + dp->d_reclen);
4090				} else {
4091					error = EIO;
4092					break;
4093				}
4094			}
4095			if (dp >= edp)
4096				error = uiomove(dirbuf, readcnt, &auio);
4097		}
4098		free(dirbuf, M_TEMP);
4099	}
4100	if (error != 0) {
4101		VOP_UNLOCK(vp, 0);
4102		foffset_unlock(fp, foffset, 0);
4103		fdrop(fp, td);
4104		return (error);
4105	}
4106	if (uap->count == auio.uio_resid &&
4107	    (vp->v_vflag & VV_ROOT) &&
4108	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4109		struct vnode *tvp = vp;
4110		vp = vp->v_mount->mnt_vnodecovered;
4111		VREF(vp);
4112		fp->f_vnode = vp;
4113		fp->f_data = vp;
4114		foffset = 0;
4115		vput(tvp);
4116		goto unionread;
4117	}
4118	VOP_UNLOCK(vp, 0);
4119	foffset_unlock(fp, foffset, 0);
4120	fdrop(fp, td);
4121	td->td_retval[0] = uap->count - auio.uio_resid;
4122	if (error == 0)
4123		*ploff = loff;
4124	return (error);
4125}
4126#endif /* COMPAT_43 */
4127
4128/*
4129 * Read a block of directory entries in a filesystem independent format.
4130 */
4131#ifndef _SYS_SYSPROTO_H_
4132struct getdirentries_args {
4133	int	fd;
4134	char	*buf;
4135	u_int	count;
4136	long	*basep;
4137};
4138#endif
4139int
4140sys_getdirentries(td, uap)
4141	struct thread *td;
4142	register struct getdirentries_args /* {
4143		int fd;
4144		char *buf;
4145		u_int count;
4146		long *basep;
4147	} */ *uap;
4148{
4149	long base;
4150	int error;
4151
4152	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4153	    NULL, UIO_USERSPACE);
4154	if (error != 0)
4155		return (error);
4156	if (uap->basep != NULL)
4157		error = copyout(&base, uap->basep, sizeof(long));
4158	return (error);
4159}
4160
4161int
4162kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4163    long *basep, ssize_t *residp, enum uio_seg bufseg)
4164{
4165	struct vnode *vp;
4166	struct file *fp;
4167	struct uio auio;
4168	struct iovec aiov;
4169	cap_rights_t rights;
4170	long loff;
4171	int error, eofflag;
4172	off_t foffset;
4173
4174	AUDIT_ARG_FD(fd);
4175	if (count > IOSIZE_MAX)
4176		return (EINVAL);
4177	auio.uio_resid = count;
4178	error = getvnode(td->td_proc->p_fd, fd,
4179	    cap_rights_init(&rights, CAP_READ), &fp);
4180	if (error != 0)
4181		return (error);
4182	if ((fp->f_flag & FREAD) == 0) {
4183		fdrop(fp, td);
4184		return (EBADF);
4185	}
4186	vp = fp->f_vnode;
4187	foffset = foffset_lock(fp, 0);
4188unionread:
4189	if (vp->v_type != VDIR) {
4190		error = EINVAL;
4191		goto fail;
4192	}
4193	aiov.iov_base = buf;
4194	aiov.iov_len = count;
4195	auio.uio_iov = &aiov;
4196	auio.uio_iovcnt = 1;
4197	auio.uio_rw = UIO_READ;
4198	auio.uio_segflg = bufseg;
4199	auio.uio_td = td;
4200	vn_lock(vp, LK_SHARED | LK_RETRY);
4201	AUDIT_ARG_VNODE1(vp);
4202	loff = auio.uio_offset = foffset;
4203#ifdef MAC
4204	error = mac_vnode_check_readdir(td->td_ucred, vp);
4205	if (error == 0)
4206#endif
4207		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4208		    NULL);
4209	foffset = auio.uio_offset;
4210	if (error != 0) {
4211		VOP_UNLOCK(vp, 0);
4212		goto fail;
4213	}
4214	if (count == auio.uio_resid &&
4215	    (vp->v_vflag & VV_ROOT) &&
4216	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4217		struct vnode *tvp = vp;
4218
4219		vp = vp->v_mount->mnt_vnodecovered;
4220		VREF(vp);
4221		fp->f_vnode = vp;
4222		fp->f_data = vp;
4223		foffset = 0;
4224		vput(tvp);
4225		goto unionread;
4226	}
4227	VOP_UNLOCK(vp, 0);
4228	*basep = loff;
4229	if (residp != NULL)
4230		*residp = auio.uio_resid;
4231	td->td_retval[0] = count - auio.uio_resid;
4232fail:
4233	foffset_unlock(fp, foffset, 0);
4234	fdrop(fp, td);
4235	return (error);
4236}
4237
4238#ifndef _SYS_SYSPROTO_H_
4239struct getdents_args {
4240	int fd;
4241	char *buf;
4242	size_t count;
4243};
4244#endif
4245int
4246sys_getdents(td, uap)
4247	struct thread *td;
4248	register struct getdents_args /* {
4249		int fd;
4250		char *buf;
4251		u_int count;
4252	} */ *uap;
4253{
4254	struct getdirentries_args ap;
4255
4256	ap.fd = uap->fd;
4257	ap.buf = uap->buf;
4258	ap.count = uap->count;
4259	ap.basep = NULL;
4260	return (sys_getdirentries(td, &ap));
4261}
4262
4263/*
4264 * Set the mode mask for creation of filesystem nodes.
4265 */
4266#ifndef _SYS_SYSPROTO_H_
4267struct umask_args {
4268	int	newmask;
4269};
4270#endif
4271int
4272sys_umask(td, uap)
4273	struct thread *td;
4274	struct umask_args /* {
4275		int newmask;
4276	} */ *uap;
4277{
4278	register struct filedesc *fdp;
4279
4280	FILEDESC_XLOCK(td->td_proc->p_fd);
4281	fdp = td->td_proc->p_fd;
4282	td->td_retval[0] = fdp->fd_cmask;
4283	fdp->fd_cmask = uap->newmask & ALLPERMS;
4284	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4285	return (0);
4286}
4287
4288/*
4289 * Void all references to file by ripping underlying filesystem away from
4290 * vnode.
4291 */
4292#ifndef _SYS_SYSPROTO_H_
4293struct revoke_args {
4294	char	*path;
4295};
4296#endif
4297int
4298sys_revoke(td, uap)
4299	struct thread *td;
4300	register struct revoke_args /* {
4301		char *path;
4302	} */ *uap;
4303{
4304	struct vnode *vp;
4305	struct vattr vattr;
4306	struct nameidata nd;
4307	int error;
4308
4309	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4310	    uap->path, td);
4311	if ((error = namei(&nd)) != 0)
4312		return (error);
4313	vp = nd.ni_vp;
4314	NDFREE(&nd, NDF_ONLY_PNBUF);
4315	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4316		error = EINVAL;
4317		goto out;
4318	}
4319#ifdef MAC
4320	error = mac_vnode_check_revoke(td->td_ucred, vp);
4321	if (error != 0)
4322		goto out;
4323#endif
4324	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4325	if (error != 0)
4326		goto out;
4327	if (td->td_ucred->cr_uid != vattr.va_uid) {
4328		error = priv_check(td, PRIV_VFS_ADMIN);
4329		if (error != 0)
4330			goto out;
4331	}
4332	if (vcount(vp) > 1)
4333		VOP_REVOKE(vp, REVOKEALL);
4334out:
4335	vput(vp);
4336	return (error);
4337}
4338
4339/*
4340 * Convert a user file descriptor to a kernel file entry and check that, if it
4341 * is a capability, the correct rights are present. A reference on the file
4342 * entry is held upon returning.
4343 */
4344int
4345getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
4346{
4347	struct file *fp;
4348	int error;
4349
4350	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
4351	if (error != 0)
4352		return (error);
4353
4354	/*
4355	 * The file could be not of the vnode type, or it may be not
4356	 * yet fully initialized, in which case the f_vnode pointer
4357	 * may be set, but f_ops is still badfileops.  E.g.,
4358	 * devfs_open() transiently create such situation to
4359	 * facilitate csw d_fdopen().
4360	 *
4361	 * Dupfdopen() handling in kern_openat() installs the
4362	 * half-baked file into the process descriptor table, allowing
4363	 * other thread to dereference it. Guard against the race by
4364	 * checking f_ops.
4365	 */
4366	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4367		fdrop(fp, curthread);
4368		return (EINVAL);
4369	}
4370	*fpp = fp;
4371	return (0);
4372}
4373
4374
4375/*
4376 * Get an (NFS) file handle.
4377 */
4378#ifndef _SYS_SYSPROTO_H_
4379struct lgetfh_args {
4380	char	*fname;
4381	fhandle_t *fhp;
4382};
4383#endif
4384int
4385sys_lgetfh(td, uap)
4386	struct thread *td;
4387	register struct lgetfh_args *uap;
4388{
4389	struct nameidata nd;
4390	fhandle_t fh;
4391	register struct vnode *vp;
4392	int error;
4393
4394	error = priv_check(td, PRIV_VFS_GETFH);
4395	if (error != 0)
4396		return (error);
4397	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4398	    uap->fname, td);
4399	error = namei(&nd);
4400	if (error != 0)
4401		return (error);
4402	NDFREE(&nd, NDF_ONLY_PNBUF);
4403	vp = nd.ni_vp;
4404	bzero(&fh, sizeof(fh));
4405	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4406	error = VOP_VPTOFH(vp, &fh.fh_fid);
4407	vput(vp);
4408	if (error == 0)
4409		error = copyout(&fh, uap->fhp, sizeof (fh));
4410	return (error);
4411}
4412
4413#ifndef _SYS_SYSPROTO_H_
4414struct getfh_args {
4415	char	*fname;
4416	fhandle_t *fhp;
4417};
4418#endif
4419int
4420sys_getfh(td, uap)
4421	struct thread *td;
4422	register struct getfh_args *uap;
4423{
4424	struct nameidata nd;
4425	fhandle_t fh;
4426	register struct vnode *vp;
4427	int error;
4428
4429	error = priv_check(td, PRIV_VFS_GETFH);
4430	if (error != 0)
4431		return (error);
4432	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4433	    uap->fname, td);
4434	error = namei(&nd);
4435	if (error != 0)
4436		return (error);
4437	NDFREE(&nd, NDF_ONLY_PNBUF);
4438	vp = nd.ni_vp;
4439	bzero(&fh, sizeof(fh));
4440	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4441	error = VOP_VPTOFH(vp, &fh.fh_fid);
4442	vput(vp);
4443	if (error == 0)
4444		error = copyout(&fh, uap->fhp, sizeof (fh));
4445	return (error);
4446}
4447
4448/*
4449 * syscall for the rpc.lockd to use to translate a NFS file handle into an
4450 * open descriptor.
4451 *
4452 * warning: do not remove the priv_check() call or this becomes one giant
4453 * security hole.
4454 */
4455#ifndef _SYS_SYSPROTO_H_
4456struct fhopen_args {
4457	const struct fhandle *u_fhp;
4458	int flags;
4459};
4460#endif
4461int
4462sys_fhopen(td, uap)
4463	struct thread *td;
4464	struct fhopen_args /* {
4465		const struct fhandle *u_fhp;
4466		int flags;
4467	} */ *uap;
4468{
4469	struct mount *mp;
4470	struct vnode *vp;
4471	struct fhandle fhp;
4472	struct file *fp;
4473	int fmode, error;
4474	int indx;
4475
4476	error = priv_check(td, PRIV_VFS_FHOPEN);
4477	if (error != 0)
4478		return (error);
4479	indx = -1;
4480	fmode = FFLAGS(uap->flags);
4481	/* why not allow a non-read/write open for our lockd? */
4482	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4483		return (EINVAL);
4484	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4485	if (error != 0)
4486		return(error);
4487	/* find the mount point */
4488	mp = vfs_busyfs(&fhp.fh_fsid);
4489	if (mp == NULL)
4490		return (ESTALE);
4491	/* now give me my vnode, it gets returned to me locked */
4492	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4493	vfs_unbusy(mp);
4494	if (error != 0)
4495		return (error);
4496
4497	error = falloc_noinstall(td, &fp);
4498	if (error != 0) {
4499		vput(vp);
4500		return (error);
4501	}
4502	/*
4503	 * An extra reference on `fp' has been held for us by
4504	 * falloc_noinstall().
4505	 */
4506
4507#ifdef INVARIANTS
4508	td->td_dupfd = -1;
4509#endif
4510	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4511	if (error != 0) {
4512		KASSERT(fp->f_ops == &badfileops,
4513		    ("VOP_OPEN in fhopen() set f_ops"));
4514		KASSERT(td->td_dupfd < 0,
4515		    ("fhopen() encountered fdopen()"));
4516
4517		vput(vp);
4518		goto bad;
4519	}
4520#ifdef INVARIANTS
4521	td->td_dupfd = 0;
4522#endif
4523	fp->f_vnode = vp;
4524	fp->f_seqcount = 1;
4525	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4526	    &vnops);
4527	VOP_UNLOCK(vp, 0);
4528	if ((fmode & O_TRUNC) != 0) {
4529		error = fo_truncate(fp, 0, td->td_ucred, td);
4530		if (error != 0)
4531			goto bad;
4532	}
4533
4534	error = finstall(td, fp, &indx, fmode, NULL);
4535bad:
4536	fdrop(fp, td);
4537	td->td_retval[0] = indx;
4538	return (error);
4539}
4540
4541/*
4542 * Stat an (NFS) file handle.
4543 */
4544#ifndef _SYS_SYSPROTO_H_
4545struct fhstat_args {
4546	struct fhandle *u_fhp;
4547	struct stat *sb;
4548};
4549#endif
4550int
4551sys_fhstat(td, uap)
4552	struct thread *td;
4553	register struct fhstat_args /* {
4554		struct fhandle *u_fhp;
4555		struct stat *sb;
4556	} */ *uap;
4557{
4558	struct stat sb;
4559	struct fhandle fh;
4560	int error;
4561
4562	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4563	if (error != 0)
4564		return (error);
4565	error = kern_fhstat(td, fh, &sb);
4566	if (error == 0)
4567		error = copyout(&sb, uap->sb, sizeof(sb));
4568	return (error);
4569}
4570
4571int
4572kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4573{
4574	struct mount *mp;
4575	struct vnode *vp;
4576	int error;
4577
4578	error = priv_check(td, PRIV_VFS_FHSTAT);
4579	if (error != 0)
4580		return (error);
4581	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4582		return (ESTALE);
4583	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4584	vfs_unbusy(mp);
4585	if (error != 0)
4586		return (error);
4587	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4588	vput(vp);
4589	return (error);
4590}
4591
4592/*
4593 * Implement fstatfs() for (NFS) file handles.
4594 */
4595#ifndef _SYS_SYSPROTO_H_
4596struct fhstatfs_args {
4597	struct fhandle *u_fhp;
4598	struct statfs *buf;
4599};
4600#endif
4601int
4602sys_fhstatfs(td, uap)
4603	struct thread *td;
4604	struct fhstatfs_args /* {
4605		struct fhandle *u_fhp;
4606		struct statfs *buf;
4607	} */ *uap;
4608{
4609	struct statfs sf;
4610	fhandle_t fh;
4611	int error;
4612
4613	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4614	if (error != 0)
4615		return (error);
4616	error = kern_fhstatfs(td, fh, &sf);
4617	if (error != 0)
4618		return (error);
4619	return (copyout(&sf, uap->buf, sizeof(sf)));
4620}
4621
4622int
4623kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4624{
4625	struct statfs *sp;
4626	struct mount *mp;
4627	struct vnode *vp;
4628	int error;
4629
4630	error = priv_check(td, PRIV_VFS_FHSTATFS);
4631	if (error != 0)
4632		return (error);
4633	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4634		return (ESTALE);
4635	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4636	if (error != 0) {
4637		vfs_unbusy(mp);
4638		return (error);
4639	}
4640	vput(vp);
4641	error = prison_canseemount(td->td_ucred, mp);
4642	if (error != 0)
4643		goto out;
4644#ifdef MAC
4645	error = mac_mount_check_stat(td->td_ucred, mp);
4646	if (error != 0)
4647		goto out;
4648#endif
4649	/*
4650	 * Set these in case the underlying filesystem fails to do so.
4651	 */
4652	sp = &mp->mnt_stat;
4653	sp->f_version = STATFS_VERSION;
4654	sp->f_namemax = NAME_MAX;
4655	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4656	error = VFS_STATFS(mp, sp);
4657	if (error == 0)
4658		*buf = *sp;
4659out:
4660	vfs_unbusy(mp);
4661	return (error);
4662}
4663
4664int
4665kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4666{
4667	struct file *fp;
4668	struct mount *mp;
4669	struct vnode *vp;
4670	cap_rights_t rights;
4671	off_t olen, ooffset;
4672	int error;
4673
4674	if (offset < 0 || len <= 0)
4675		return (EINVAL);
4676	/* Check for wrap. */
4677	if (offset > OFF_MAX - len)
4678		return (EFBIG);
4679	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4680	if (error != 0)
4681		return (error);
4682	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4683		error = ESPIPE;
4684		goto out;
4685	}
4686	if ((fp->f_flag & FWRITE) == 0) {
4687		error = EBADF;
4688		goto out;
4689	}
4690	if (fp->f_type != DTYPE_VNODE) {
4691		error = ENODEV;
4692		goto out;
4693	}
4694	vp = fp->f_vnode;
4695	if (vp->v_type != VREG) {
4696		error = ENODEV;
4697		goto out;
4698	}
4699
4700	/* Allocating blocks may take a long time, so iterate. */
4701	for (;;) {
4702		olen = len;
4703		ooffset = offset;
4704
4705		bwillwrite();
4706		mp = NULL;
4707		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4708		if (error != 0)
4709			break;
4710		error = vn_lock(vp, LK_EXCLUSIVE);
4711		if (error != 0) {
4712			vn_finished_write(mp);
4713			break;
4714		}
4715#ifdef MAC
4716		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4717		if (error == 0)
4718#endif
4719			error = VOP_ALLOCATE(vp, &offset, &len);
4720		VOP_UNLOCK(vp, 0);
4721		vn_finished_write(mp);
4722
4723		if (olen + ooffset != offset + len) {
4724			panic("offset + len changed from %jx/%jx to %jx/%jx",
4725			    ooffset, olen, offset, len);
4726		}
4727		if (error != 0 || len == 0)
4728			break;
4729		KASSERT(olen > len, ("Iteration did not make progress?"));
4730		maybe_yield();
4731	}
4732 out:
4733	fdrop(fp, td);
4734	return (error);
4735}
4736
4737int
4738sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4739{
4740
4741	td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
4742	    uap->len);
4743	return (0);
4744}
4745
4746/*
4747 * Unlike madvise(2), we do not make a best effort to remember every
4748 * possible caching hint.  Instead, we remember the last setting with
4749 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4750 * region of any current setting.
4751 */
4752int
4753kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4754    int advice)
4755{
4756	struct fadvise_info *fa, *new;
4757	struct file *fp;
4758	struct vnode *vp;
4759	cap_rights_t rights;
4760	off_t end;
4761	int error;
4762
4763	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4764		return (EINVAL);
4765	switch (advice) {
4766	case POSIX_FADV_SEQUENTIAL:
4767	case POSIX_FADV_RANDOM:
4768	case POSIX_FADV_NOREUSE:
4769		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4770		break;
4771	case POSIX_FADV_NORMAL:
4772	case POSIX_FADV_WILLNEED:
4773	case POSIX_FADV_DONTNEED:
4774		new = NULL;
4775		break;
4776	default:
4777		return (EINVAL);
4778	}
4779	/* XXX: CAP_POSIX_FADVISE? */
4780	error = fget(td, fd, cap_rights_init(&rights), &fp);
4781	if (error != 0)
4782		goto out;
4783	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4784		error = ESPIPE;
4785		goto out;
4786	}
4787	if (fp->f_type != DTYPE_VNODE) {
4788		error = ENODEV;
4789		goto out;
4790	}
4791	vp = fp->f_vnode;
4792	if (vp->v_type != VREG) {
4793		error = ENODEV;
4794		goto out;
4795	}
4796	if (len == 0)
4797		end = OFF_MAX;
4798	else
4799		end = offset + len - 1;
4800	switch (advice) {
4801	case POSIX_FADV_SEQUENTIAL:
4802	case POSIX_FADV_RANDOM:
4803	case POSIX_FADV_NOREUSE:
4804		/*
4805		 * Try to merge any existing non-standard region with
4806		 * this new region if possible, otherwise create a new
4807		 * non-standard region for this request.
4808		 */
4809		mtx_pool_lock(mtxpool_sleep, fp);
4810		fa = fp->f_advice;
4811		if (fa != NULL && fa->fa_advice == advice &&
4812		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4813		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4814		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4815			if (offset < fa->fa_start)
4816				fa->fa_start = offset;
4817			if (end > fa->fa_end)
4818				fa->fa_end = end;
4819		} else {
4820			new->fa_advice = advice;
4821			new->fa_start = offset;
4822			new->fa_end = end;
4823			new->fa_prevstart = 0;
4824			new->fa_prevend = 0;
4825			fp->f_advice = new;
4826			new = fa;
4827		}
4828		mtx_pool_unlock(mtxpool_sleep, fp);
4829		break;
4830	case POSIX_FADV_NORMAL:
4831		/*
4832		 * If a the "normal" region overlaps with an existing
4833		 * non-standard region, trim or remove the
4834		 * non-standard region.
4835		 */
4836		mtx_pool_lock(mtxpool_sleep, fp);
4837		fa = fp->f_advice;
4838		if (fa != NULL) {
4839			if (offset <= fa->fa_start && end >= fa->fa_end) {
4840				new = fa;
4841				fp->f_advice = NULL;
4842			} else if (offset <= fa->fa_start &&
4843			    end >= fa->fa_start)
4844				fa->fa_start = end + 1;
4845			else if (offset <= fa->fa_end && end >= fa->fa_end)
4846				fa->fa_end = offset - 1;
4847			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4848				/*
4849				 * If the "normal" region is a middle
4850				 * portion of the existing
4851				 * non-standard region, just remove
4852				 * the whole thing rather than picking
4853				 * one side or the other to
4854				 * preserve.
4855				 */
4856				new = fa;
4857				fp->f_advice = NULL;
4858			}
4859		}
4860		mtx_pool_unlock(mtxpool_sleep, fp);
4861		break;
4862	case POSIX_FADV_WILLNEED:
4863	case POSIX_FADV_DONTNEED:
4864		error = VOP_ADVISE(vp, offset, end, advice);
4865		break;
4866	}
4867out:
4868	if (fp != NULL)
4869		fdrop(fp, td);
4870	free(new, M_FADVISE);
4871	return (error);
4872}
4873
4874int
4875sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4876{
4877
4878	td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
4879	    uap->len, uap->advice);
4880	return (0);
4881}
4882