vfs_syscalls.c revision 301054
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_syscalls.c 301054 2016-05-31 16:57:42Z glebius $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_kdtrace.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/capsicum.h>
50#include <sys/disk.h>
51#include <sys/sysent.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/mutex.h>
55#include <sys/sysproto.h>
56#include <sys/namei.h>
57#include <sys/filedesc.h>
58#include <sys/kernel.h>
59#include <sys/fcntl.h>
60#include <sys/file.h>
61#include <sys/filio.h>
62#include <sys/limits.h>
63#include <sys/linker.h>
64#include <sys/rwlock.h>
65#include <sys/sdt.h>
66#include <sys/stat.h>
67#include <sys/sx.h>
68#include <sys/unistd.h>
69#include <sys/vnode.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/dirent.h>
73#include <sys/jail.h>
74#include <sys/syscallsubr.h>
75#include <sys/sysctl.h>
76#ifdef KTRACE
77#include <sys/ktrace.h>
78#endif
79
80#include <machine/stdarg.h>
81
82#include <security/audit/audit.h>
83#include <security/mac/mac_framework.h>
84
85#include <vm/vm.h>
86#include <vm/vm_object.h>
87#include <vm/vm_page.h>
88#include <vm/uma.h>
89
90#include <ufs/ufs/quota.h>
91
/* Malloc type describing posix_fadvise(2) information. */
MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");

/*
 * SDT provider "vfs" and its two stat probes ("mode" and "reg").  Each
 * probe is declared with a "char *" and an "int" argument.
 */
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");

/* Prototypes for helpers that are private to this file. */
static int chroot_refuse_vdir_fds(struct filedesc *fdp);
static int kern_chflags(struct thread *td, const char *path,
    enum uio_seg pathseg, u_long flags);
static int kern_chflagsat(struct thread *td, int fd, const char *path,
    enum uio_seg pathseg, u_long flags, int atflag);
static int setfflags(struct thread *td, struct vnode *, u_long);
static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
static int getutimens(const struct timespec *, enum uio_seg,
    struct timespec *, int *);
static int setutimes(struct thread *td, struct vnode *,
    const struct timespec *, int, int);
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
    struct thread *td);

/*
 * The module initialization routine for POSIX asynchronous I/O will
 * set this to the version of AIO that it implements.  (Zero means
 * that it is not implemented.)  This value is used here by pathconf()
 * and in kern_descrip.c by fpathconf().
 */
int async_io_version;
119
120/*
121 * Sync each mounted filesystem.
122 */
123#ifndef _SYS_SYSPROTO_H_
124struct sync_args {
125	int     dummy;
126};
127#endif
128/* ARGSUSED */
129int
130sys_sync(td, uap)
131	struct thread *td;
132	struct sync_args *uap;
133{
134	struct mount *mp, *nmp;
135	int save;
136
137	mtx_lock(&mountlist_mtx);
138	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
139		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
140			nmp = TAILQ_NEXT(mp, mnt_list);
141			continue;
142		}
143		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
144		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
145			save = curthread_pflags_set(TDP_SYNCIO);
146			vfs_msync(mp, MNT_NOWAIT);
147			VFS_SYNC(mp, MNT_NOWAIT);
148			curthread_pflags_restore(save);
149			vn_finished_write(mp);
150		}
151		mtx_lock(&mountlist_mtx);
152		nmp = TAILQ_NEXT(mp, mnt_list);
153		vfs_unbusy(mp);
154	}
155	mtx_unlock(&mountlist_mtx);
156	return (0);
157}
158
159/*
160 * Change filesystem quotas.
161 */
162#ifndef _SYS_SYSPROTO_H_
163struct quotactl_args {
164	char *path;
165	int cmd;
166	int uid;
167	caddr_t arg;
168};
169#endif
170int
171sys_quotactl(td, uap)
172	struct thread *td;
173	register struct quotactl_args /* {
174		char *path;
175		int cmd;
176		int uid;
177		caddr_t arg;
178	} */ *uap;
179{
180	struct mount *mp;
181	struct nameidata nd;
182	int error;
183
184	AUDIT_ARG_CMD(uap->cmd);
185	AUDIT_ARG_UID(uap->uid);
186	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
187		return (EPERM);
188	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
189	    uap->path, td);
190	if ((error = namei(&nd)) != 0)
191		return (error);
192	NDFREE(&nd, NDF_ONLY_PNBUF);
193	mp = nd.ni_vp->v_mount;
194	vfs_ref(mp);
195	vput(nd.ni_vp);
196	error = vfs_busy(mp, 0);
197	vfs_rel(mp);
198	if (error != 0)
199		return (error);
200	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
201
202	/*
203	 * Since quota on operation typically needs to open quota
204	 * file, the Q_QUOTAON handler needs to unbusy the mount point
205	 * before calling into namei.  Otherwise, unmount might be
206	 * started between two vfs_busy() invocations (first is our,
207	 * second is from mount point cross-walk code in lookup()),
208	 * causing deadlock.
209	 *
210	 * Require that Q_QUOTAON handles the vfs_busy() reference on
211	 * its own, always returning with ubusied mount point.
212	 */
213	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
214		vfs_unbusy(mp);
215	return (error);
216}
217
218/*
219 * Used by statfs conversion routines to scale the block size up if
220 * necessary so that all of the block counts are <= 'max_size'.  Note
221 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
222 * value of 'n'.
223 */
224void
225statfs_scale_blocks(struct statfs *sf, long max_size)
226{
227	uint64_t count;
228	int shift;
229
230	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
231
232	/*
233	 * Attempt to scale the block counts to give a more accurate
234	 * overview to userland of the ratio of free space to used
235	 * space.  To do this, find the largest block count and compute
236	 * a divisor that lets it fit into a signed integer <= max_size.
237	 */
238	if (sf->f_bavail < 0)
239		count = -sf->f_bavail;
240	else
241		count = sf->f_bavail;
242	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
243	if (count <= max_size)
244		return;
245
246	count >>= flsl(max_size);
247	shift = 0;
248	while (count > 0) {
249		shift++;
250		count >>=1;
251	}
252
253	sf->f_bsize <<= shift;
254	sf->f_blocks >>= shift;
255	sf->f_bfree >>= shift;
256	sf->f_bavail >>= shift;
257}
258
259/*
260 * Get filesystem statistics.
261 */
262#ifndef _SYS_SYSPROTO_H_
263struct statfs_args {
264	char *path;
265	struct statfs *buf;
266};
267#endif
268int
269sys_statfs(td, uap)
270	struct thread *td;
271	register struct statfs_args /* {
272		char *path;
273		struct statfs *buf;
274	} */ *uap;
275{
276	struct statfs sf;
277	int error;
278
279	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
280	if (error == 0)
281		error = copyout(&sf, uap->buf, sizeof(sf));
282	return (error);
283}
284
285int
286kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
287    struct statfs *buf)
288{
289	struct mount *mp;
290	struct statfs *sp, sb;
291	struct nameidata nd;
292	int error;
293
294	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
295	    pathseg, path, td);
296	error = namei(&nd);
297	if (error != 0)
298		return (error);
299	mp = nd.ni_vp->v_mount;
300	vfs_ref(mp);
301	NDFREE(&nd, NDF_ONLY_PNBUF);
302	vput(nd.ni_vp);
303	error = vfs_busy(mp, 0);
304	vfs_rel(mp);
305	if (error != 0)
306		return (error);
307#ifdef MAC
308	error = mac_mount_check_stat(td->td_ucred, mp);
309	if (error != 0)
310		goto out;
311#endif
312	/*
313	 * Set these in case the underlying filesystem fails to do so.
314	 */
315	sp = &mp->mnt_stat;
316	sp->f_version = STATFS_VERSION;
317	sp->f_namemax = NAME_MAX;
318	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
319	error = VFS_STATFS(mp, sp);
320	if (error != 0)
321		goto out;
322	if (priv_check(td, PRIV_VFS_GENERATION)) {
323		bcopy(sp, &sb, sizeof(sb));
324		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
325		prison_enforce_statfs(td->td_ucred, mp, &sb);
326		sp = &sb;
327	}
328	*buf = *sp;
329out:
330	vfs_unbusy(mp);
331	return (error);
332}
333
334/*
335 * Get filesystem statistics.
336 */
337#ifndef _SYS_SYSPROTO_H_
338struct fstatfs_args {
339	int fd;
340	struct statfs *buf;
341};
342#endif
343int
344sys_fstatfs(td, uap)
345	struct thread *td;
346	register struct fstatfs_args /* {
347		int fd;
348		struct statfs *buf;
349	} */ *uap;
350{
351	struct statfs sf;
352	int error;
353
354	error = kern_fstatfs(td, uap->fd, &sf);
355	if (error == 0)
356		error = copyout(&sf, uap->buf, sizeof(sf));
357	return (error);
358}
359
360int
361kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
362{
363	struct file *fp;
364	struct mount *mp;
365	struct statfs *sp, sb;
366	struct vnode *vp;
367	cap_rights_t rights;
368	int error;
369
370	AUDIT_ARG_FD(fd);
371	error = getvnode(td->td_proc->p_fd, fd,
372	    cap_rights_init(&rights, CAP_FSTATFS), &fp);
373	if (error != 0)
374		return (error);
375	vp = fp->f_vnode;
376	vn_lock(vp, LK_SHARED | LK_RETRY);
377#ifdef AUDIT
378	AUDIT_ARG_VNODE1(vp);
379#endif
380	mp = vp->v_mount;
381	if (mp)
382		vfs_ref(mp);
383	VOP_UNLOCK(vp, 0);
384	fdrop(fp, td);
385	if (mp == NULL) {
386		error = EBADF;
387		goto out;
388	}
389	error = vfs_busy(mp, 0);
390	vfs_rel(mp);
391	if (error != 0)
392		return (error);
393#ifdef MAC
394	error = mac_mount_check_stat(td->td_ucred, mp);
395	if (error != 0)
396		goto out;
397#endif
398	/*
399	 * Set these in case the underlying filesystem fails to do so.
400	 */
401	sp = &mp->mnt_stat;
402	sp->f_version = STATFS_VERSION;
403	sp->f_namemax = NAME_MAX;
404	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
405	error = VFS_STATFS(mp, sp);
406	if (error != 0)
407		goto out;
408	if (priv_check(td, PRIV_VFS_GENERATION)) {
409		bcopy(sp, &sb, sizeof(sb));
410		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
411		prison_enforce_statfs(td->td_ucred, mp, &sb);
412		sp = &sb;
413	}
414	*buf = *sp;
415out:
416	if (mp)
417		vfs_unbusy(mp);
418	return (error);
419}
420
421/*
422 * Get statistics on all filesystems.
423 */
424#ifndef _SYS_SYSPROTO_H_
425struct getfsstat_args {
426	struct statfs *buf;
427	long bufsize;
428	int flags;
429};
430#endif
431int
432sys_getfsstat(td, uap)
433	struct thread *td;
434	register struct getfsstat_args /* {
435		struct statfs *buf;
436		long bufsize;
437		int flags;
438	} */ *uap;
439{
440
441	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
442	    uap->flags));
443}
444
/*
 * Collect statistics for the mounted filesystems visible to 'td'.
 *
 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
 *	The caller is responsible for freeing memory which will be allocated
 *	in '*buf'.
 *
 * With bufsize == 0 nothing is stored and the mounts are only counted.
 * With bufseg == UIO_USERSPACE the entries are copied out to '*buf'.
 *
 * Mounts rejected by prison_canseemount(), by the MAC check (when MAC is
 * configured), or that cannot be busied with MBF_NOWAIT are skipped.  A
 * mount whose VFS_STATFS() call fails is skipped as well and is not
 * counted.
 *
 * Returns 0, or the copyout() error.  On a 0 return td->td_retval[0]
 * holds the number of entries: capped at the buffer capacity when a
 * buffer was supplied, otherwise the number of mounts counted.
 */
int
kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
    enum uio_seg bufseg, int flags)
{
	struct mount *mp, *nmp;
	struct statfs *sfsp, *sp, sb;
	size_t count, maxcount;
	int error;

	maxcount = bufsize / sizeof(struct statfs);
	if (bufsize == 0)
		sfsp = NULL;
	else if (bufseg == UIO_USERSPACE)
		sfsp = *buf;
	else /* if (bufseg == UIO_SYSSPACE) */ {
		/*
		 * Size the kernel buffer from the current length of the
		 * mount list, but never larger than the caller asked for.
		 */
		count = 0;
		mtx_lock(&mountlist_mtx);
		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
			count++;
		}
		mtx_unlock(&mountlist_mtx);
		if (maxcount > count)
			maxcount = count;
		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
		    M_WAITOK);
	}
	count = 0;
	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (prison_canseemount(td->td_ucred, mp) != 0) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
#ifdef MAC
		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
#endif
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* Only store an entry while there is room left for one. */
		if (sfsp && count < maxcount) {
			sp = &mp->mnt_stat;
			/*
			 * Set these in case the underlying filesystem
			 * fails to do so.
			 */
			sp->f_version = STATFS_VERSION;
			sp->f_namemax = NAME_MAX;
			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
			/*
			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
			 * overrides MNT_WAIT.
			 */
			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
			    (flags & MNT_WAIT)) &&
			    (error = VFS_STATFS(mp, sp))) {
				/* Refresh failed: skip without counting. */
				mtx_lock(&mountlist_mtx);
				nmp = TAILQ_NEXT(mp, mnt_list);
				vfs_unbusy(mp);
				continue;
			}
			/*
			 * Callers without PRIV_VFS_GENERATION get a copy
			 * with the fsid zeroed and the prison statfs
			 * policy applied.
			 */
			if (priv_check(td, PRIV_VFS_GENERATION)) {
				bcopy(sp, &sb, sizeof(sb));
				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
				prison_enforce_statfs(td->td_ucred, mp, &sb);
				sp = &sb;
			}
			if (bufseg == UIO_SYSSPACE)
				bcopy(sp, sfsp, sizeof(*sp));
			else /* if (bufseg == UIO_USERSPACE) */ {
				error = copyout(sp, sfsp, sizeof(*sp));
				if (error != 0) {
					vfs_unbusy(mp);
					return (error);
				}
			}
			sfsp++;
		}
		count++;
		/*
		 * Take the list lock again before reading the successor
		 * and dropping the busy reference on this mount.
		 */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
	/* Never report more entries than the buffer could hold. */
	if (sfsp && count > maxcount)
		td->td_retval[0] = maxcount;
	else
		td->td_retval[0] = count;
	return (0);
}
544
545#ifdef COMPAT_FREEBSD4
546/*
547 * Get old format filesystem statistics.
548 */
549static void cvtstatfs(struct statfs *, struct ostatfs *);
550
551#ifndef _SYS_SYSPROTO_H_
552struct freebsd4_statfs_args {
553	char *path;
554	struct ostatfs *buf;
555};
556#endif
557int
558freebsd4_statfs(td, uap)
559	struct thread *td;
560	struct freebsd4_statfs_args /* {
561		char *path;
562		struct ostatfs *buf;
563	} */ *uap;
564{
565	struct ostatfs osb;
566	struct statfs sf;
567	int error;
568
569	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
570	if (error != 0)
571		return (error);
572	cvtstatfs(&sf, &osb);
573	return (copyout(&osb, uap->buf, sizeof(osb)));
574}
575
576/*
577 * Get filesystem statistics.
578 */
579#ifndef _SYS_SYSPROTO_H_
580struct freebsd4_fstatfs_args {
581	int fd;
582	struct ostatfs *buf;
583};
584#endif
585int
586freebsd4_fstatfs(td, uap)
587	struct thread *td;
588	struct freebsd4_fstatfs_args /* {
589		int fd;
590		struct ostatfs *buf;
591	} */ *uap;
592{
593	struct ostatfs osb;
594	struct statfs sf;
595	int error;
596
597	error = kern_fstatfs(td, uap->fd, &sf);
598	if (error != 0)
599		return (error);
600	cvtstatfs(&sf, &osb);
601	return (copyout(&osb, uap->buf, sizeof(osb)));
602}
603
604/*
605 * Get statistics on all filesystems.
606 */
607#ifndef _SYS_SYSPROTO_H_
608struct freebsd4_getfsstat_args {
609	struct ostatfs *buf;
610	long bufsize;
611	int flags;
612};
613#endif
614int
615freebsd4_getfsstat(td, uap)
616	struct thread *td;
617	register struct freebsd4_getfsstat_args /* {
618		struct ostatfs *buf;
619		long bufsize;
620		int flags;
621	} */ *uap;
622{
623	struct statfs *buf, *sp;
624	struct ostatfs osb;
625	size_t count, size;
626	int error;
627
628	count = uap->bufsize / sizeof(struct ostatfs);
629	size = count * sizeof(struct statfs);
630	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
631	if (size > 0) {
632		count = td->td_retval[0];
633		sp = buf;
634		while (count > 0 && error == 0) {
635			cvtstatfs(sp, &osb);
636			error = copyout(&osb, uap->buf, sizeof(osb));
637			sp++;
638			uap->buf++;
639			count--;
640		}
641		free(buf, M_TEMP);
642	}
643	return (error);
644}
645
646/*
647 * Implement fstatfs() for (NFS) file handles.
648 */
649#ifndef _SYS_SYSPROTO_H_
650struct freebsd4_fhstatfs_args {
651	struct fhandle *u_fhp;
652	struct ostatfs *buf;
653};
654#endif
655int
656freebsd4_fhstatfs(td, uap)
657	struct thread *td;
658	struct freebsd4_fhstatfs_args /* {
659		struct fhandle *u_fhp;
660		struct ostatfs *buf;
661	} */ *uap;
662{
663	struct ostatfs osb;
664	struct statfs sf;
665	fhandle_t fh;
666	int error;
667
668	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
669	if (error != 0)
670		return (error);
671	error = kern_fhstatfs(td, fh, &sf);
672	if (error != 0)
673		return (error);
674	cvtstatfs(&sf, &osb);
675	return (copyout(&osb, uap->buf, sizeof(osb)));
676}
677
678/*
679 * Convert a new format statfs structure to an old format statfs structure.
680 */
681static void
682cvtstatfs(nsp, osp)
683	struct statfs *nsp;
684	struct ostatfs *osp;
685{
686
687	statfs_scale_blocks(nsp, LONG_MAX);
688	bzero(osp, sizeof(*osp));
689	osp->f_bsize = nsp->f_bsize;
690	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
691	osp->f_blocks = nsp->f_blocks;
692	osp->f_bfree = nsp->f_bfree;
693	osp->f_bavail = nsp->f_bavail;
694	osp->f_files = MIN(nsp->f_files, LONG_MAX);
695	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
696	osp->f_owner = nsp->f_owner;
697	osp->f_type = nsp->f_type;
698	osp->f_flags = nsp->f_flags;
699	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
700	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
701	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
702	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
703	strlcpy(osp->f_fstypename, nsp->f_fstypename,
704	    MIN(MFSNAMELEN, OMFSNAMELEN));
705	strlcpy(osp->f_mntonname, nsp->f_mntonname,
706	    MIN(MNAMELEN, OMNAMELEN));
707	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
708	    MIN(MNAMELEN, OMNAMELEN));
709	osp->f_fsid = nsp->f_fsid;
710}
711#endif /* COMPAT_FREEBSD4 */
712
713/*
714 * Change current working directory to a given file descriptor.
715 */
716#ifndef _SYS_SYSPROTO_H_
717struct fchdir_args {
718	int	fd;
719};
720#endif
721int
722sys_fchdir(td, uap)
723	struct thread *td;
724	struct fchdir_args /* {
725		int fd;
726	} */ *uap;
727{
728	register struct filedesc *fdp = td->td_proc->p_fd;
729	struct vnode *vp, *tdp, *vpold;
730	struct mount *mp;
731	struct file *fp;
732	cap_rights_t rights;
733	int error;
734
735	AUDIT_ARG_FD(uap->fd);
736	error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
737	    &fp);
738	if (error != 0)
739		return (error);
740	vp = fp->f_vnode;
741	VREF(vp);
742	fdrop(fp, td);
743	vn_lock(vp, LK_SHARED | LK_RETRY);
744	AUDIT_ARG_VNODE1(vp);
745	error = change_dir(vp, td);
746	while (!error && (mp = vp->v_mountedhere) != NULL) {
747		if (vfs_busy(mp, 0))
748			continue;
749		error = VFS_ROOT(mp, LK_SHARED, &tdp);
750		vfs_unbusy(mp);
751		if (error != 0)
752			break;
753		vput(vp);
754		vp = tdp;
755	}
756	if (error != 0) {
757		vput(vp);
758		return (error);
759	}
760	VOP_UNLOCK(vp, 0);
761	FILEDESC_XLOCK(fdp);
762	vpold = fdp->fd_cdir;
763	fdp->fd_cdir = vp;
764	FILEDESC_XUNLOCK(fdp);
765	vrele(vpold);
766	return (0);
767}
768
769/*
770 * Change current working directory (``.'').
771 */
772#ifndef _SYS_SYSPROTO_H_
773struct chdir_args {
774	char	*path;
775};
776#endif
777int
778sys_chdir(td, uap)
779	struct thread *td;
780	struct chdir_args /* {
781		char *path;
782	} */ *uap;
783{
784
785	return (kern_chdir(td, uap->path, UIO_USERSPACE));
786}
787
788int
789kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
790{
791	register struct filedesc *fdp = td->td_proc->p_fd;
792	struct nameidata nd;
793	struct vnode *vp;
794	int error;
795
796	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
797	    pathseg, path, td);
798	if ((error = namei(&nd)) != 0)
799		return (error);
800	if ((error = change_dir(nd.ni_vp, td)) != 0) {
801		vput(nd.ni_vp);
802		NDFREE(&nd, NDF_ONLY_PNBUF);
803		return (error);
804	}
805	VOP_UNLOCK(nd.ni_vp, 0);
806	NDFREE(&nd, NDF_ONLY_PNBUF);
807	FILEDESC_XLOCK(fdp);
808	vp = fdp->fd_cdir;
809	fdp->fd_cdir = nd.ni_vp;
810	FILEDESC_XUNLOCK(fdp);
811	vrele(vp);
812	return (0);
813}
814
815/*
816 * Helper function for raised chroot(2) security function:  Refuse if
817 * any filedescriptors are open directories.
818 */
819static int
820chroot_refuse_vdir_fds(fdp)
821	struct filedesc *fdp;
822{
823	struct vnode *vp;
824	struct file *fp;
825	int fd;
826
827	FILEDESC_LOCK_ASSERT(fdp);
828
829	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
830		fp = fget_locked(fdp, fd);
831		if (fp == NULL)
832			continue;
833		if (fp->f_type == DTYPE_VNODE) {
834			vp = fp->f_vnode;
835			if (vp->v_type == VDIR)
836				return (EPERM);
837		}
838	}
839	return (0);
840}
841
/*
 * This sysctl determines if we will allow a process to chroot(2) if it
 * has a directory open:
 *	0: disallowed for all processes.
 *	1: allowed for processes that were not already chroot(2)'ed.
 *	2: allowed for all processes.
 *
 * The value is consulted by change_root(), which calls
 * chroot_refuse_vdir_fds() when the policy requires the check.  It is
 * exported read-write as kern.chroot_allow_open_directories and
 * defaults to 1.
 */

static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
855
856/*
857 * Change notion of root (``/'') directory.
858 */
859#ifndef _SYS_SYSPROTO_H_
860struct chroot_args {
861	char	*path;
862};
863#endif
864int
865sys_chroot(td, uap)
866	struct thread *td;
867	struct chroot_args /* {
868		char *path;
869	} */ *uap;
870{
871	struct nameidata nd;
872	int error;
873
874	error = priv_check(td, PRIV_VFS_CHROOT);
875	if (error != 0)
876		return (error);
877	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
878	    UIO_USERSPACE, uap->path, td);
879	error = namei(&nd);
880	if (error != 0)
881		goto error;
882	error = change_dir(nd.ni_vp, td);
883	if (error != 0)
884		goto e_vunlock;
885#ifdef MAC
886	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
887	if (error != 0)
888		goto e_vunlock;
889#endif
890	VOP_UNLOCK(nd.ni_vp, 0);
891	error = change_root(nd.ni_vp, td);
892	vrele(nd.ni_vp);
893	NDFREE(&nd, NDF_ONLY_PNBUF);
894	return (error);
895e_vunlock:
896	vput(nd.ni_vp);
897error:
898	NDFREE(&nd, NDF_ONLY_PNBUF);
899	return (error);
900}
901
902/*
903 * Common routine for chroot and chdir.  Callers must provide a locked vnode
904 * instance.
905 */
906int
907change_dir(vp, td)
908	struct vnode *vp;
909	struct thread *td;
910{
911#ifdef MAC
912	int error;
913#endif
914
915	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
916	if (vp->v_type != VDIR)
917		return (ENOTDIR);
918#ifdef MAC
919	error = mac_vnode_check_chdir(td->td_ucred, vp);
920	if (error != 0)
921		return (error);
922#endif
923	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
924}
925
926/*
927 * Common routine for kern_chroot() and jail_attach().  The caller is
928 * responsible for invoking priv_check() and mac_vnode_check_chroot() to
929 * authorize this operation.
930 */
931int
932change_root(vp, td)
933	struct vnode *vp;
934	struct thread *td;
935{
936	struct filedesc *fdp;
937	struct vnode *oldvp;
938	int error;
939
940	fdp = td->td_proc->p_fd;
941	FILEDESC_XLOCK(fdp);
942	if (chroot_allow_open_directories == 0 ||
943	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
944		error = chroot_refuse_vdir_fds(fdp);
945		if (error != 0) {
946			FILEDESC_XUNLOCK(fdp);
947			return (error);
948		}
949	}
950	oldvp = fdp->fd_rdir;
951	fdp->fd_rdir = vp;
952	VREF(fdp->fd_rdir);
953	if (!fdp->fd_jdir) {
954		fdp->fd_jdir = vp;
955		VREF(fdp->fd_jdir);
956	}
957	FILEDESC_XUNLOCK(fdp);
958	vrele(oldvp);
959	return (0);
960}
961
962static __inline void
963flags_to_rights(int flags, cap_rights_t *rightsp)
964{
965
966	if (flags & O_EXEC) {
967		cap_rights_set(rightsp, CAP_FEXECVE);
968	} else {
969		switch ((flags & O_ACCMODE)) {
970		case O_RDONLY:
971			cap_rights_set(rightsp, CAP_READ);
972			break;
973		case O_RDWR:
974			cap_rights_set(rightsp, CAP_READ);
975			/* FALLTHROUGH */
976		case O_WRONLY:
977			cap_rights_set(rightsp, CAP_WRITE);
978			if (!(flags & (O_APPEND | O_TRUNC)))
979				cap_rights_set(rightsp, CAP_SEEK);
980			break;
981		}
982	}
983
984	if (flags & O_CREAT)
985		cap_rights_set(rightsp, CAP_CREATE);
986
987	if (flags & O_TRUNC)
988		cap_rights_set(rightsp, CAP_FTRUNCATE);
989
990	if (flags & (O_SYNC | O_FSYNC))
991		cap_rights_set(rightsp, CAP_FSYNC);
992
993	if (flags & (O_EXLOCK | O_SHLOCK))
994		cap_rights_set(rightsp, CAP_FLOCK);
995}
996
997/*
998 * Check permissions, allocate an open file structure, and call the device
999 * open routine if any.
1000 */
1001#ifndef _SYS_SYSPROTO_H_
1002struct open_args {
1003	char	*path;
1004	int	flags;
1005	int	mode;
1006};
1007#endif
1008int
1009sys_open(td, uap)
1010	struct thread *td;
1011	register struct open_args /* {
1012		char *path;
1013		int flags;
1014		int mode;
1015	} */ *uap;
1016{
1017
1018	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1019}
1020
1021#ifndef _SYS_SYSPROTO_H_
1022struct openat_args {
1023	int	fd;
1024	char	*path;
1025	int	flag;
1026	int	mode;
1027};
1028#endif
1029int
1030sys_openat(struct thread *td, struct openat_args *uap)
1031{
1032
1033	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1034	    uap->mode));
1035}
1036
1037int
1038kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1039    int mode)
1040{
1041
1042	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1043}
1044
/*
 * Open 'path' relative to descriptor 'fd' and install the resulting file
 * in the calling process's descriptor table.
 *
 * 'flags' are open(2) flags; 'mode' is filtered through the process
 * umask (fd_cmask) and restricted to ALLPERMS without S_ISTXT before
 * being passed to vn_open().
 *
 * Returns 0 on success with the new descriptor number stored in
 * td->td_retval[0].  Returns EINVAL for an invalid combination of
 * access-mode flags, otherwise the error from falloc_noinstall(),
 * vn_open(), dupfdopen(), fo_truncate() or finstall().
 */
int
kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    int flags, int mode)
{
	struct proc *p = td->td_proc;
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	struct vnode *vp;
	struct nameidata nd;
	cap_rights_t rights;
	int cmode, error, indx;

	/* -1 means "no descriptor installed yet". */
	indx = -1;

	AUDIT_ARG_FFLAGS(flags);
	AUDIT_ARG_MODE(mode);
	/* XXX: audit dirfd */
	cap_rights_init(&rights, CAP_LOOKUP);
	flags_to_rights(flags, &rights);
	/*
	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
	 * may be specified.
	 */
	if (flags & O_EXEC) {
		if (flags & O_ACCMODE)
			return (EINVAL);
	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
		return (EINVAL);
	} else {
		flags = FFLAGS(flags);
	}

	/*
	 * Allocate the file descriptor, but don't install a descriptor yet.
	 */
	error = falloc_noinstall(td, &fp);
	if (error != 0)
		return (error);
	/*
	 * An extra reference on `fp' has been held for us by
	 * falloc_noinstall().
	 */
	/* Set the flags early so the finit in devfs can pick them up. */
	fp->f_flag = flags & FMASK;
	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
	    &rights, td);
	td->td_dupfd = -1;		/* XXX check for fdopen */
	error = vn_open(&nd, &flags, cmode, fp);
	if (error != 0) {
		/*
		 * If the vn_open replaced the method vector, something
		 * wondrous happened deep below and we just pass it up
		 * pretending we know what we do.
		 */
		if (error == ENXIO && fp->f_ops != &badfileops)
			goto success;

		/*
		 * Handle special fdopen() case. bleh.
		 *
		 * Don't do this for relative (capability) lookups; we don't
		 * understand exactly what would happen, and we don't think
		 * that it ever should.
		 */
		if (nd.ni_strictrelative == 0 &&
		    (error == ENODEV || error == ENXIO) &&
		    td->td_dupfd >= 0) {
			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
			    &indx);
			if (error == 0)
				goto success;
		}

		goto bad;
	}
	td->td_dupfd = 0;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp = nd.ni_vp;

	/*
	 * Store the vnode, for any f_type. Typically, the vnode use
	 * count is decremented by direct call to vn_closefile() for
	 * files that switched type in the cdevsw fdopen() method.
	 */
	fp->f_vnode = vp;
	/*
	 * If the file wasn't claimed by devfs bind it to the normal
	 * vnode operations here.
	 */
	if (fp->f_ops == &badfileops) {
		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
		fp->f_seqcount = 1;
		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
		    DTYPE_VNODE, vp, &vnops);
	}

	VOP_UNLOCK(vp, 0);
	/* Truncation is performed through the file's own operations. */
	if (flags & O_TRUNC) {
		error = fo_truncate(fp, 0, td->td_ucred, td);
		if (error != 0)
			goto bad;
	}
success:
	/*
	 * If we haven't already installed the FD (for dupfdopen), do so now.
	 */
	if (indx == -1) {
		struct filecaps *fcaps;

#ifdef CAPABILITIES
		if (nd.ni_strictrelative == 1)
			fcaps = &nd.ni_filecaps;
		else
#endif
			fcaps = NULL;
		error = finstall(td, fp, &indx, flags, fcaps);
		/* On success finstall() consumes fcaps. */
		if (error != 0) {
			filecaps_free(&nd.ni_filecaps);
			goto bad;
		}
	} else {
		filecaps_free(&nd.ni_filecaps);
	}

	/*
	 * Release our private reference, leaving the one associated with
	 * the descriptor table intact.
	 */
	fdrop(fp, td);
	td->td_retval[0] = indx;
	return (0);
bad:
	/* No descriptor was installed; drop our reference on the file. */
	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
	fdrop(fp, td);
	return (error);
}
1183
1184#ifdef COMPAT_43
1185/*
1186 * Create a file.
1187 */
1188#ifndef _SYS_SYSPROTO_H_
1189struct ocreat_args {
1190	char	*path;
1191	int	mode;
1192};
1193#endif
1194int
1195ocreat(td, uap)
1196	struct thread *td;
1197	register struct ocreat_args /* {
1198		char *path;
1199		int mode;
1200	} */ *uap;
1201{
1202
1203	return (kern_open(td, uap->path, UIO_USERSPACE,
1204	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1205}
1206#endif /* COMPAT_43 */
1207
1208/*
1209 * Create a special file.
1210 */
1211#ifndef _SYS_SYSPROTO_H_
1212struct mknod_args {
1213	char	*path;
1214	int	mode;
1215	int	dev;
1216};
1217#endif
1218int
1219sys_mknod(td, uap)
1220	struct thread *td;
1221	register struct mknod_args /* {
1222		char *path;
1223		int mode;
1224		int dev;
1225	} */ *uap;
1226{
1227
1228	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1229}
1230
1231#ifndef _SYS_SYSPROTO_H_
1232struct mknodat_args {
1233	int	fd;
1234	char	*path;
1235	mode_t	mode;
1236	dev_t	dev;
1237};
1238#endif
1239int
1240sys_mknodat(struct thread *td, struct mknodat_args *uap)
1241{
1242
1243	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1244	    uap->dev));
1245}
1246
1247int
1248kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1249    int dev)
1250{
1251
1252	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1253}
1254
/*
 * Create a special file at path, which namei() resolves relative to fd.
 *
 * The file type bits of mode select a privilege check first:
 * S_IFCHR/S_IFBLK require PRIV_VFS_MKNOD_DEV, S_IFMT requires
 * PRIV_VFS_MKNOD_BAD and S_IFWHT requires PRIV_VFS_MKNOD_WHT.  S_IFIFO
 * with dev == 0 is handed off to kern_mkfifoat(); any other type fails
 * with EINVAL.
 *
 * Returns 0 on success, EEXIST if the name already exists, or an errno
 * value from the privilege check, the lookup, the MAC check (when MAC is
 * configured) or the filesystem operation.
 */
int
kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    int mode, int dev)
{
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	struct nameidata nd;
	cap_rights_t rights;
	int error, whiteout = 0;

	AUDIT_ARG_MODE(mode);
	AUDIT_ARG_DEV(dev);
	/* Validate the requested type and run the matching privilege check. */
	switch (mode & S_IFMT) {
	case S_IFCHR:
	case S_IFBLK:
		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
		break;
	case S_IFMT:
		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
		break;
	case S_IFWHT:
		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
		break;
	case S_IFIFO:
		/* A FIFO with no device number is really a mkfifo request. */
		if (dev == 0)
			return (kern_mkfifoat(td, fd, path, pathseg, mode));
		/* FALLTHROUGH */
	default:
		error = EINVAL;
		break;
	}
	if (error != 0)
		return (error);
restart:
	bwillwrite();
	/* Look the name up for creation, keeping the parent locked. */
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
	    td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp != NULL) {
		/*
		 * The name already exists.  Drop the parent (vrele() only
		 * when it is the same vnode as the target, vput() otherwise)
		 * and the target, then fail.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		return (EEXIST);
	} else {
		/* Build the attributes; the permission bits honour the umask. */
		VATTR_NULL(&vattr);
		vattr.va_mode = (mode & ALLPERMS) &
		    ~td->td_proc->p_fd->fd_cmask;
		vattr.va_rdev = dev;
		whiteout = 0;

		switch (mode & S_IFMT) {
		case S_IFMT:	/* used by badsect to flag bad sectors */
			vattr.va_type = VBAD;
			break;
		case S_IFCHR:
			vattr.va_type = VCHR;
			break;
		case S_IFBLK:
			vattr.va_type = VBLK;
			break;
		case S_IFWHT:
			/* Whiteouts go through VOP_WHITEOUT(), not VOP_MKNOD(). */
			whiteout = 1;
			break;
		default:
			/* Every other type was rejected by the first switch. */
			panic("kern_mknod: invalid mode");
		}
	}
	/*
	 * Try to start the write without sleeping.  On failure release the
	 * parent, wait via vn_start_write(NULL, ...) and redo the lookup
	 * from scratch.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
#ifdef MAC
	/* The MAC create check is skipped for whiteouts. */
	if (error == 0 && !whiteout)
		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
		    &nd.ni_cnd, &vattr);
#endif
	if (error == 0) {
		if (whiteout)
			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
		else {
			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
						&nd.ni_cnd, &vattr);
			/* On success release the vnode handed back in ni_vp. */
			if (error == 0)
				vput(nd.ni_vp);
		}
	}
	/* Common exit: free the path buffer, drop the parent, end the write. */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	vn_finished_write(mp);
	return (error);
}
1356
1357/*
1358 * Create a named pipe.
1359 */
1360#ifndef _SYS_SYSPROTO_H_
1361struct mkfifo_args {
1362	char	*path;
1363	int	mode;
1364};
1365#endif
1366int
1367sys_mkfifo(td, uap)
1368	struct thread *td;
1369	register struct mkfifo_args /* {
1370		char *path;
1371		int mode;
1372	} */ *uap;
1373{
1374
1375	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1376}
1377
1378#ifndef _SYS_SYSPROTO_H_
1379struct mkfifoat_args {
1380	int	fd;
1381	char	*path;
1382	mode_t	mode;
1383};
1384#endif
1385int
1386sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1387{
1388
1389	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1390	    uap->mode));
1391}
1392
1393int
1394kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1395{
1396
1397	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1398}
1399
1400int
1401kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1402    int mode)
1403{
1404	struct mount *mp;
1405	struct vattr vattr;
1406	struct nameidata nd;
1407	cap_rights_t rights;
1408	int error;
1409
1410	AUDIT_ARG_MODE(mode);
1411restart:
1412	bwillwrite();
1413	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1414	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
1415	    td);
1416	if ((error = namei(&nd)) != 0)
1417		return (error);
1418	if (nd.ni_vp != NULL) {
1419		NDFREE(&nd, NDF_ONLY_PNBUF);
1420		if (nd.ni_vp == nd.ni_dvp)
1421			vrele(nd.ni_dvp);
1422		else
1423			vput(nd.ni_dvp);
1424		vrele(nd.ni_vp);
1425		return (EEXIST);
1426	}
1427	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1428		NDFREE(&nd, NDF_ONLY_PNBUF);
1429		vput(nd.ni_dvp);
1430		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1431			return (error);
1432		goto restart;
1433	}
1434	VATTR_NULL(&vattr);
1435	vattr.va_type = VFIFO;
1436	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1437#ifdef MAC
1438	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1439	    &vattr);
1440	if (error != 0)
1441		goto out;
1442#endif
1443	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1444	if (error == 0)
1445		vput(nd.ni_vp);
1446#ifdef MAC
1447out:
1448#endif
1449	vput(nd.ni_dvp);
1450	vn_finished_write(mp);
1451	NDFREE(&nd, NDF_ONLY_PNBUF);
1452	return (error);
1453}
1454
1455/*
1456 * Make a hard file link.
1457 */
1458#ifndef _SYS_SYSPROTO_H_
1459struct link_args {
1460	char	*path;
1461	char	*link;
1462};
1463#endif
1464int
1465sys_link(td, uap)
1466	struct thread *td;
1467	register struct link_args /* {
1468		char *path;
1469		char *link;
1470	} */ *uap;
1471{
1472
1473	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1474}
1475
1476#ifndef _SYS_SYSPROTO_H_
1477struct linkat_args {
1478	int	fd1;
1479	char	*path1;
1480	int	fd2;
1481	char	*path2;
1482	int	flag;
1483};
1484#endif
1485int
1486sys_linkat(struct thread *td, struct linkat_args *uap)
1487{
1488	int flag;
1489
1490	flag = uap->flag;
1491	if (flag & ~AT_SYMLINK_FOLLOW)
1492		return (EINVAL);
1493
1494	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1495	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1496}
1497
/*
 * Policy knob read by can_hardlink(): when non-zero, a caller whose cr_uid
 * differs from the file's owner must pass the PRIV_VFS_LINK privilege
 * check to create a hard link.  Exported read-write as a sysctl.
 */
int hardlink_check_uid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
/*
 * Policy knob read by can_hardlink(): when non-zero, a caller that is not
 * a member of the file's group must pass the PRIV_VFS_LINK privilege
 * check to create a hard link.  Exported read-write as a sysctl.
 */
static int hardlink_check_gid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
1508
1509static int
1510can_hardlink(struct vnode *vp, struct ucred *cred)
1511{
1512	struct vattr va;
1513	int error;
1514
1515	if (!hardlink_check_uid && !hardlink_check_gid)
1516		return (0);
1517
1518	error = VOP_GETATTR(vp, &va, cred);
1519	if (error != 0)
1520		return (error);
1521
1522	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1523		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1524		if (error != 0)
1525			return (error);
1526	}
1527
1528	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1529		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1530		if (error != 0)
1531			return (error);
1532	}
1533
1534	return (0);
1535}
1536
1537int
1538kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1539{
1540
1541	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1542}
1543
/*
 * Create a hard link named path2 (looked up relative to fd2) to the
 * existing file path1 (looked up relative to fd1).  The follow argument is
 * OR-ed into the lookup flags for path1.
 *
 * Returns 0 on success or an errno value: EPERM when the source is a
 * directory, EEXIST when the target name already exists, EXDEV when source
 * and target directory are on different mounts, or an error from the
 * lookups, can_hardlink(), the MAC check (when configured),
 * vn_start_write() or VOP_LINK().
 */
int
kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
    enum uio_seg segflg, int follow)
{
	struct vnode *vp;
	struct mount *mp;
	struct nameidata nd;
	cap_rights_t rights;
	int error;

again:
	bwillwrite();
	/* Look up the source; no LOCKLEAF is requested for this lookup. */
	NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);

	if ((error = namei(&nd)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp = nd.ni_vp;
	if (vp->v_type == VDIR) {
		vrele(vp);
		return (EPERM);		/* POSIX */
	}
	/* Look up the target name for creation, keeping its parent locked. */
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 |
	    NOCACHE, segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT),
	    td);
	if ((error = namei(&nd)) == 0) {
		if (nd.ni_vp != NULL) {
			/* Target name exists: release everything and fail. */
			NDFREE(&nd, NDF_ONLY_PNBUF);
			if (nd.ni_dvp == nd.ni_vp)
				vrele(nd.ni_dvp);
			else
				vput(nd.ni_dvp);
			vrele(nd.ni_vp);
			vrele(vp);
			return (EEXIST);
		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
			/*
			 * Cross-device link.  No need to recheck
			 * vp->v_type, since it cannot change, except
			 * to VBAD.
			 */
			NDFREE(&nd, NDF_ONLY_PNBUF);
			vput(nd.ni_dvp);
			vrele(vp);
			return (EXDEV);
		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
			/* Source is now locked; run the policy checks. */
			error = can_hardlink(vp, td->td_ucred);
#ifdef MAC
			if (error == 0)
				error = mac_vnode_check_link(td->td_ucred,
				    nd.ni_dvp, vp, &nd.ni_cnd);
#endif
			if (error != 0) {
				/* vput() drops both the lock and our reference. */
				vput(vp);
				vput(nd.ni_dvp);
				NDFREE(&nd, NDF_ONLY_PNBUF);
				return (error);
			}
			error = vn_start_write(vp, &mp, V_NOWAIT);
			if (error != 0) {
				/*
				 * Cannot start the write without sleeping:
				 * release both vnodes, wait, then restart
				 * with fresh lookups.
				 */
				vput(vp);
				vput(nd.ni_dvp);
				NDFREE(&nd, NDF_ONLY_PNBUF);
				error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH);
				if (error != 0)
					return (error);
				goto again;
			}
			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
			/* Only unlock vp here; the vrele() below drops it. */
			VOP_UNLOCK(vp, 0);
			vput(nd.ni_dvp);
			vn_finished_write(mp);
			NDFREE(&nd, NDF_ONLY_PNBUF);
		} else {
			/* Locking the source failed: release and start over. */
			vput(nd.ni_dvp);
			NDFREE(&nd, NDF_ONLY_PNBUF);
			vrele(vp);
			goto again;
		}
	}
	/* Reached after a failed target lookup or after VOP_LINK(). */
	vrele(vp);
	return (error);
}
1628
1629/*
1630 * Make a symbolic link.
1631 */
1632#ifndef _SYS_SYSPROTO_H_
1633struct symlink_args {
1634	char	*path;
1635	char	*link;
1636};
1637#endif
1638int
1639sys_symlink(td, uap)
1640	struct thread *td;
1641	register struct symlink_args /* {
1642		char *path;
1643		char *link;
1644	} */ *uap;
1645{
1646
1647	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1648}
1649
1650#ifndef _SYS_SYSPROTO_H_
1651struct symlinkat_args {
1652	char	*path;
1653	int	fd;
1654	char	*path2;
1655};
1656#endif
1657int
1658sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1659{
1660
1661	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1662	    UIO_USERSPACE));
1663}
1664
1665int
1666kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1667{
1668
1669	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1670}
1671
/*
 * Create a symbolic link named path2 (looked up relative to fd) whose
 * contents are the string path1.  segflg describes the address space of
 * both path1 and path2.
 *
 * Returns 0 on success, EEXIST if path2 already exists, or an errno value
 * from copyinstr(), the lookup, vn_start_write(), the MAC check (when
 * configured) or VOP_SYMLINK().
 */
int
kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
    enum uio_seg segflg)
{
	struct mount *mp;
	struct vattr vattr;
	char *syspath;
	struct nameidata nd;
	int error;
	cap_rights_t rights;

	/*
	 * Get the link contents into kernel space.  A kernel-space string is
	 * used directly; otherwise it is copied into a buffer from
	 * namei_zone, which the out: label frees.
	 */
	if (segflg == UIO_SYSSPACE) {
		syspath = path1;
	} else {
		syspath = uma_zalloc(namei_zone, M_WAITOK);
		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
			goto out;
	}
	AUDIT_ARG_TEXT(syspath);
restart:
	bwillwrite();
	/* Look the link name up for creation, keeping the parent locked. */
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
	    NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
	    td);
	if ((error = namei(&nd)) != 0)
		goto out;
	if (nd.ni_vp) {
		/* Name is taken: release the vnodes and fail. */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(nd.ni_vp);
		error = EEXIST;
		goto out;
	}
	/*
	 * Try to start the write without sleeping; on failure drop the
	 * parent, wait, and redo the lookup.  syspath is kept across the
	 * restart.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			goto out;
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
	/* va_type is only filled in here, for the MAC create check. */
	vattr.va_type = VLNK;
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error != 0)
		goto out2;
#endif
	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
	/* On success release the vnode handed back in ni_vp. */
	if (error == 0)
		vput(nd.ni_vp);
#ifdef MAC
out2:
#endif
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	vn_finished_write(mp);
out:
	/* Free the kernel copy of the link contents if one was allocated. */
	if (segflg != UIO_SYSSPACE)
		uma_zfree(namei_zone, syspath);
	return (error);
}
1738
1739/*
1740 * Delete a whiteout from the filesystem.
1741 */
1742int
1743sys_undelete(td, uap)
1744	struct thread *td;
1745	register struct undelete_args /* {
1746		char *path;
1747	} */ *uap;
1748{
1749	struct mount *mp;
1750	struct nameidata nd;
1751	int error;
1752
1753restart:
1754	bwillwrite();
1755	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1756	    UIO_USERSPACE, uap->path, td);
1757	error = namei(&nd);
1758	if (error != 0)
1759		return (error);
1760
1761	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1762		NDFREE(&nd, NDF_ONLY_PNBUF);
1763		if (nd.ni_vp == nd.ni_dvp)
1764			vrele(nd.ni_dvp);
1765		else
1766			vput(nd.ni_dvp);
1767		if (nd.ni_vp)
1768			vrele(nd.ni_vp);
1769		return (EEXIST);
1770	}
1771	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1772		NDFREE(&nd, NDF_ONLY_PNBUF);
1773		vput(nd.ni_dvp);
1774		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1775			return (error);
1776		goto restart;
1777	}
1778	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1779	NDFREE(&nd, NDF_ONLY_PNBUF);
1780	vput(nd.ni_dvp);
1781	vn_finished_write(mp);
1782	return (error);
1783}
1784
1785/*
1786 * Delete a name from the filesystem.
1787 */
1788#ifndef _SYS_SYSPROTO_H_
1789struct unlink_args {
1790	char	*path;
1791};
1792#endif
1793int
1794sys_unlink(td, uap)
1795	struct thread *td;
1796	struct unlink_args /* {
1797		char *path;
1798	} */ *uap;
1799{
1800
1801	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1802}
1803
1804#ifndef _SYS_SYSPROTO_H_
1805struct unlinkat_args {
1806	int	fd;
1807	char	*path;
1808	int	flag;
1809};
1810#endif
1811int
1812sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1813{
1814	int flag = uap->flag;
1815	int fd = uap->fd;
1816	char *path = uap->path;
1817
1818	if (flag & ~AT_REMOVEDIR)
1819		return (EINVAL);
1820
1821	if (flag & AT_REMOVEDIR)
1822		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1823	else
1824		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1825}
1826
1827int
1828kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1829{
1830
1831	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1832}
1833
/*
 * Remove the name path, looked up relative to fd.
 *
 * oldinum, when non-zero, is compared with the inode number reported by
 * vn_stat() for the vnode that was found; a mismatch fails with EIDRM.
 * When oldinum is zero, directories are refused with EPERM.
 *
 * Returns 0 on success or an errno value.  EINVAL from the lookup is
 * reported as EPERM, and a vnode flagged VV_ROOT yields EBUSY.
 */
int
kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
    ino_t oldinum)
{
	struct mount *mp;
	struct vnode *vp;
	struct nameidata nd;
	struct stat sb;
	cap_rights_t rights;
	int error;

restart:
	bwillwrite();
	/* Look up for deletion with both parent and leaf locked. */
	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
	if ((error = namei(&nd)) != 0)
		return (error == EINVAL ? EPERM : error);
	vp = nd.ni_vp;
	/*
	 * error is 0 on entry to this chain.  If vn_stat() fails, its error
	 * is kept and control reaches the final else branch, where it can
	 * still be replaced by EBUSY.
	 */
	if (vp->v_type == VDIR && oldinum == 0) {
		error = EPERM;		/* POSIX */
	} else if (oldinum != 0 &&
		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
		  sb.st_ino != oldinum) {
			error = EIDRM;	/* Identifier removed */
	} else {
		/*
		 * The root of a mounted filesystem cannot be deleted.
		 *
		 * XXX: can this only be a VDIR case?
		 */
		if (vp->v_vflag & VV_ROOT)
			error = EBUSY;
	}
	if (error == 0) {
		/*
		 * Try to start the write without sleeping; on failure release
		 * both vnodes, wait, and redo the lookup.
		 */
		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
			NDFREE(&nd, NDF_ONLY_PNBUF);
			vput(nd.ni_dvp);
			if (vp == nd.ni_dvp)
				vrele(vp);
			else
				vput(vp);
			if ((error = vn_start_write(NULL, &mp,
			    V_XSLEEP | PCATCH)) != 0)
				return (error);
			goto restart;
		}
#ifdef MAC
		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
		    &nd.ni_cnd);
		if (error != 0)
			goto out;
#endif
		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
#ifdef MAC
out:
#endif
		vn_finished_write(mp);
	}
	/*
	 * Common exit.  The parent is released with vput(); the leaf gets
	 * vrele() when it is the same vnode as the parent and vput()
	 * otherwise.
	 */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	if (vp == nd.ni_dvp)
		vrele(vp);
	else
		vput(vp);
	return (error);
}
1901
1902/*
1903 * Reposition read/write file offset.
1904 */
1905#ifndef _SYS_SYSPROTO_H_
1906struct lseek_args {
1907	int	fd;
1908	int	pad;
1909	off_t	offset;
1910	int	whence;
1911};
1912#endif
1913int
1914sys_lseek(td, uap)
1915	struct thread *td;
1916	register struct lseek_args /* {
1917		int fd;
1918		int pad;
1919		off_t offset;
1920		int whence;
1921	} */ *uap;
1922{
1923	struct file *fp;
1924	cap_rights_t rights;
1925	int error;
1926
1927	AUDIT_ARG_FD(uap->fd);
1928	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1929	if (error != 0)
1930		return (error);
1931	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1932	    fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
1933	fdrop(fp, td);
1934	return (error);
1935}
1936
1937#if defined(COMPAT_43)
1938/*
1939 * Reposition read/write file offset.
1940 */
1941#ifndef _SYS_SYSPROTO_H_
1942struct olseek_args {
1943	int	fd;
1944	long	offset;
1945	int	whence;
1946};
1947#endif
1948int
1949olseek(td, uap)
1950	struct thread *td;
1951	register struct olseek_args /* {
1952		int fd;
1953		long offset;
1954		int whence;
1955	} */ *uap;
1956{
1957	struct lseek_args /* {
1958		int fd;
1959		int pad;
1960		off_t offset;
1961		int whence;
1962	} */ nuap;
1963
1964	nuap.fd = uap->fd;
1965	nuap.offset = uap->offset;
1966	nuap.whence = uap->whence;
1967	return (sys_lseek(td, &nuap));
1968}
1969#endif /* COMPAT_43 */
1970
1971/* Version with the 'pad' argument */
1972int
1973freebsd6_lseek(td, uap)
1974	struct thread *td;
1975	register struct freebsd6_lseek_args *uap;
1976{
1977	struct lseek_args ouap;
1978
1979	ouap.fd = uap->fd;
1980	ouap.offset = uap->offset;
1981	ouap.whence = uap->whence;
1982	return (sys_lseek(td, &ouap));
1983}
1984
1985/*
1986 * Check access permissions using passed credentials.
1987 */
1988static int
1989vn_access(vp, user_flags, cred, td)
1990	struct vnode	*vp;
1991	int		user_flags;
1992	struct ucred	*cred;
1993	struct thread	*td;
1994{
1995	accmode_t accmode;
1996	int error;
1997
1998	/* Flags == 0 means only check for existence. */
1999	error = 0;
2000	if (user_flags) {
2001		accmode = 0;
2002		if (user_flags & R_OK)
2003			accmode |= VREAD;
2004		if (user_flags & W_OK)
2005			accmode |= VWRITE;
2006		if (user_flags & X_OK)
2007			accmode |= VEXEC;
2008#ifdef MAC
2009		error = mac_vnode_check_access(cred, vp, accmode);
2010		if (error != 0)
2011			return (error);
2012#endif
2013		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2014			error = VOP_ACCESS(vp, accmode, cred, td);
2015	}
2016	return (error);
2017}
2018
2019/*
2020 * Check access permissions using "real" credentials.
2021 */
2022#ifndef _SYS_SYSPROTO_H_
2023struct access_args {
2024	char	*path;
2025	int	amode;
2026};
2027#endif
2028int
2029sys_access(td, uap)
2030	struct thread *td;
2031	register struct access_args /* {
2032		char *path;
2033		int amode;
2034	} */ *uap;
2035{
2036
2037	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2038}
2039
2040#ifndef _SYS_SYSPROTO_H_
2041struct faccessat_args {
2042	int	dirfd;
2043	char	*path;
2044	int	amode;
2045	int	flag;
2046}
2047#endif
2048int
2049sys_faccessat(struct thread *td, struct faccessat_args *uap)
2050{
2051
2052	if (uap->flag & ~AT_EACCESS)
2053		return (EINVAL);
2054	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2055	    uap->amode));
2056}
2057
2058int
2059kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2060{
2061
2062	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2063}
2064
2065int
2066kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2067    int flag, int amode)
2068{
2069	struct ucred *cred, *tmpcred;
2070	struct vnode *vp;
2071	struct nameidata nd;
2072	cap_rights_t rights;
2073	int error;
2074
2075	/*
2076	 * Create and modify a temporary credential instead of one that
2077	 * is potentially shared.
2078	 */
2079	if (!(flag & AT_EACCESS)) {
2080		cred = td->td_ucred;
2081		tmpcred = crdup(cred);
2082		tmpcred->cr_uid = cred->cr_ruid;
2083		tmpcred->cr_groups[0] = cred->cr_rgid;
2084		td->td_ucred = tmpcred;
2085	} else
2086		cred = tmpcred = td->td_ucred;
2087	AUDIT_ARG_VALUE(amode);
2088	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
2089	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
2090	    td);
2091	if ((error = namei(&nd)) != 0)
2092		goto out1;
2093	vp = nd.ni_vp;
2094
2095	error = vn_access(vp, amode, tmpcred, td);
2096	NDFREE(&nd, NDF_ONLY_PNBUF);
2097	vput(vp);
2098out1:
2099	if (!(flag & AT_EACCESS)) {
2100		td->td_ucred = cred;
2101		crfree(tmpcred);
2102	}
2103	return (error);
2104}
2105
2106/*
2107 * Check access permissions using "effective" credentials.
2108 */
2109#ifndef _SYS_SYSPROTO_H_
2110struct eaccess_args {
2111	char	*path;
2112	int	amode;
2113};
2114#endif
2115int
2116sys_eaccess(td, uap)
2117	struct thread *td;
2118	register struct eaccess_args /* {
2119		char *path;
2120		int amode;
2121	} */ *uap;
2122{
2123
2124	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2125}
2126
2127int
2128kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2129{
2130
2131	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2132}
2133
2134#if defined(COMPAT_43)
2135/*
2136 * Get file status; this version follows links.
2137 */
2138#ifndef _SYS_SYSPROTO_H_
2139struct ostat_args {
2140	char	*path;
2141	struct ostat *ub;
2142};
2143#endif
2144int
2145ostat(td, uap)
2146	struct thread *td;
2147	register struct ostat_args /* {
2148		char *path;
2149		struct ostat *ub;
2150	} */ *uap;
2151{
2152	struct stat sb;
2153	struct ostat osb;
2154	int error;
2155
2156	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2157	if (error != 0)
2158		return (error);
2159	cvtstat(&sb, &osb);
2160	return (copyout(&osb, uap->ub, sizeof (osb)));
2161}
2162
2163/*
2164 * Get file status; this version does not follow links.
2165 */
2166#ifndef _SYS_SYSPROTO_H_
2167struct olstat_args {
2168	char	*path;
2169	struct ostat *ub;
2170};
2171#endif
2172int
2173olstat(td, uap)
2174	struct thread *td;
2175	register struct olstat_args /* {
2176		char *path;
2177		struct ostat *ub;
2178	} */ *uap;
2179{
2180	struct stat sb;
2181	struct ostat osb;
2182	int error;
2183
2184	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2185	if (error != 0)
2186		return (error);
2187	cvtstat(&sb, &osb);
2188	return (copyout(&osb, uap->ub, sizeof (osb)));
2189}
2190
2191/*
2192 * Convert from an old to a new stat structure.
2193 */
2194void
2195cvtstat(st, ost)
2196	struct stat *st;
2197	struct ostat *ost;
2198{
2199
2200	bzero(ost, sizeof(*ost));
2201	ost->st_dev = st->st_dev;
2202	ost->st_ino = st->st_ino;
2203	ost->st_mode = st->st_mode;
2204	ost->st_nlink = st->st_nlink;
2205	ost->st_uid = st->st_uid;
2206	ost->st_gid = st->st_gid;
2207	ost->st_rdev = st->st_rdev;
2208	if (st->st_size < (quad_t)1 << 32)
2209		ost->st_size = st->st_size;
2210	else
2211		ost->st_size = -2;
2212	ost->st_atim = st->st_atim;
2213	ost->st_mtim = st->st_mtim;
2214	ost->st_ctim = st->st_ctim;
2215	ost->st_blksize = st->st_blksize;
2216	ost->st_blocks = st->st_blocks;
2217	ost->st_flags = st->st_flags;
2218	ost->st_gen = st->st_gen;
2219}
2220#endif /* COMPAT_43 */
2221
2222/*
2223 * Get file status; this version follows links.
2224 */
2225#ifndef _SYS_SYSPROTO_H_
2226struct stat_args {
2227	char	*path;
2228	struct stat *ub;
2229};
2230#endif
2231int
2232sys_stat(td, uap)
2233	struct thread *td;
2234	register struct stat_args /* {
2235		char *path;
2236		struct stat *ub;
2237	} */ *uap;
2238{
2239	struct stat sb;
2240	int error;
2241
2242	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2243	if (error == 0)
2244		error = copyout(&sb, uap->ub, sizeof (sb));
2245	return (error);
2246}
2247
2248#ifndef _SYS_SYSPROTO_H_
2249struct fstatat_args {
2250	int	fd;
2251	char	*path;
2252	struct stat	*buf;
2253	int	flag;
2254}
2255#endif
2256int
2257sys_fstatat(struct thread *td, struct fstatat_args *uap)
2258{
2259	struct stat sb;
2260	int error;
2261
2262	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2263	    UIO_USERSPACE, &sb);
2264	if (error == 0)
2265		error = copyout(&sb, uap->buf, sizeof (sb));
2266	return (error);
2267}
2268
2269int
2270kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2271{
2272
2273	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2274}
2275
2276int
2277kern_statat(struct thread *td, int flag, int fd, char *path,
2278    enum uio_seg pathseg, struct stat *sbp)
2279{
2280
2281	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2282}
2283
2284int
2285kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2286    enum uio_seg pathseg, struct stat *sbp,
2287    void (*hook)(struct vnode *vp, struct stat *sbp))
2288{
2289	struct nameidata nd;
2290	struct stat sb;
2291	cap_rights_t rights;
2292	int error;
2293
2294	if (flag & ~AT_SYMLINK_NOFOLLOW)
2295		return (EINVAL);
2296
2297	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2298	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2299	    cap_rights_init(&rights, CAP_FSTAT), td);
2300
2301	if ((error = namei(&nd)) != 0)
2302		return (error);
2303	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2304	if (error == 0) {
2305		SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
2306		if (S_ISREG(sb.st_mode))
2307			SDT_PROBE2(vfs, , stat, reg, path, pathseg);
2308		if (__predict_false(hook != NULL))
2309			hook(nd.ni_vp, &sb);
2310	}
2311	NDFREE(&nd, NDF_ONLY_PNBUF);
2312	vput(nd.ni_vp);
2313	if (error != 0)
2314		return (error);
2315	*sbp = sb;
2316#ifdef KTRACE
2317	if (KTRPOINT(td, KTR_STRUCT))
2318		ktrstat(&sb);
2319#endif
2320	return (0);
2321}
2322
2323/*
2324 * Get file status; this version does not follow links.
2325 */
2326#ifndef _SYS_SYSPROTO_H_
2327struct lstat_args {
2328	char	*path;
2329	struct stat *ub;
2330};
2331#endif
2332int
2333sys_lstat(td, uap)
2334	struct thread *td;
2335	register struct lstat_args /* {
2336		char *path;
2337		struct stat *ub;
2338	} */ *uap;
2339{
2340	struct stat sb;
2341	int error;
2342
2343	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2344	if (error == 0)
2345		error = copyout(&sb, uap->ub, sizeof (sb));
2346	return (error);
2347}
2348
2349int
2350kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2351{
2352
2353	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2354	    sbp));
2355}
2356
2357/*
2358 * Implementation of the NetBSD [l]stat() functions.
2359 */
2360void
2361cvtnstat(sb, nsb)
2362	struct stat *sb;
2363	struct nstat *nsb;
2364{
2365
2366	bzero(nsb, sizeof *nsb);
2367	nsb->st_dev = sb->st_dev;
2368	nsb->st_ino = sb->st_ino;
2369	nsb->st_mode = sb->st_mode;
2370	nsb->st_nlink = sb->st_nlink;
2371	nsb->st_uid = sb->st_uid;
2372	nsb->st_gid = sb->st_gid;
2373	nsb->st_rdev = sb->st_rdev;
2374	nsb->st_atim = sb->st_atim;
2375	nsb->st_mtim = sb->st_mtim;
2376	nsb->st_ctim = sb->st_ctim;
2377	nsb->st_size = sb->st_size;
2378	nsb->st_blocks = sb->st_blocks;
2379	nsb->st_blksize = sb->st_blksize;
2380	nsb->st_flags = sb->st_flags;
2381	nsb->st_gen = sb->st_gen;
2382	nsb->st_birthtim = sb->st_birthtim;
2383}
2384
2385#ifndef _SYS_SYSPROTO_H_
2386struct nstat_args {
2387	char	*path;
2388	struct nstat *ub;
2389};
2390#endif
2391int
2392sys_nstat(td, uap)
2393	struct thread *td;
2394	register struct nstat_args /* {
2395		char *path;
2396		struct nstat *ub;
2397	} */ *uap;
2398{
2399	struct stat sb;
2400	struct nstat nsb;
2401	int error;
2402
2403	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2404	if (error != 0)
2405		return (error);
2406	cvtnstat(&sb, &nsb);
2407	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2408}
2409
2410/*
2411 * NetBSD lstat.  Get file status; this version does not follow links.
2412 */
2413#ifndef _SYS_SYSPROTO_H_
2414struct lstat_args {
2415	char	*path;
2416	struct stat *ub;
2417};
2418#endif
2419int
2420sys_nlstat(td, uap)
2421	struct thread *td;
2422	register struct nlstat_args /* {
2423		char *path;
2424		struct nstat *ub;
2425	} */ *uap;
2426{
2427	struct stat sb;
2428	struct nstat nsb;
2429	int error;
2430
2431	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2432	if (error != 0)
2433		return (error);
2434	cvtnstat(&sb, &nsb);
2435	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2436}
2437
2438/*
2439 * Get configurable pathname variables.
2440 */
2441#ifndef _SYS_SYSPROTO_H_
2442struct pathconf_args {
2443	char	*path;
2444	int	name;
2445};
2446#endif
2447int
2448sys_pathconf(td, uap)
2449	struct thread *td;
2450	register struct pathconf_args /* {
2451		char *path;
2452		int name;
2453	} */ *uap;
2454{
2455
2456	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2457}
2458
2459#ifndef _SYS_SYSPROTO_H_
2460struct lpathconf_args {
2461	char	*path;
2462	int	name;
2463};
2464#endif
2465int
2466sys_lpathconf(td, uap)
2467	struct thread *td;
2468	register struct lpathconf_args /* {
2469		char *path;
2470		int name;
2471	} */ *uap;
2472{
2473
2474	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2475	    NOFOLLOW));
2476}
2477
2478int
2479kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2480    u_long flags)
2481{
2482	struct nameidata nd;
2483	int error;
2484
2485	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2486	    pathseg, path, td);
2487	if ((error = namei(&nd)) != 0)
2488		return (error);
2489	NDFREE(&nd, NDF_ONLY_PNBUF);
2490
2491	/* If asynchronous I/O is available, it works for all files. */
2492	if (name == _PC_ASYNC_IO)
2493		td->td_retval[0] = async_io_version;
2494	else
2495		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2496	vput(nd.ni_vp);
2497	return (error);
2498}
2499
2500/*
2501 * Return target name of a symbolic link.
2502 */
2503#ifndef _SYS_SYSPROTO_H_
2504struct readlink_args {
2505	char	*path;
2506	char	*buf;
2507	size_t	count;
2508};
2509#endif
2510int
2511sys_readlink(td, uap)
2512	struct thread *td;
2513	register struct readlink_args /* {
2514		char *path;
2515		char *buf;
2516		size_t count;
2517	} */ *uap;
2518{
2519
2520	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2521	    UIO_USERSPACE, uap->count));
2522}
2523#ifndef _SYS_SYSPROTO_H_
2524struct readlinkat_args {
2525	int	fd;
2526	char	*path;
2527	char	*buf;
2528	size_t	bufsize;
2529};
2530#endif
2531int
2532sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2533{
2534
2535	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2536	    uap->buf, UIO_USERSPACE, uap->bufsize));
2537}
2538
2539int
2540kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2541    enum uio_seg bufseg, size_t count)
2542{
2543
2544	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2545	    count));
2546}
2547
2548int
2549kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2550    char *buf, enum uio_seg bufseg, size_t count)
2551{
2552	struct vnode *vp;
2553	struct iovec aiov;
2554	struct uio auio;
2555	struct nameidata nd;
2556	int error;
2557
2558	if (count > IOSIZE_MAX)
2559		return (EINVAL);
2560
2561	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2562	    pathseg, path, fd, td);
2563
2564	if ((error = namei(&nd)) != 0)
2565		return (error);
2566	NDFREE(&nd, NDF_ONLY_PNBUF);
2567	vp = nd.ni_vp;
2568#ifdef MAC
2569	error = mac_vnode_check_readlink(td->td_ucred, vp);
2570	if (error != 0) {
2571		vput(vp);
2572		return (error);
2573	}
2574#endif
2575	if (vp->v_type != VLNK)
2576		error = EINVAL;
2577	else {
2578		aiov.iov_base = buf;
2579		aiov.iov_len = count;
2580		auio.uio_iov = &aiov;
2581		auio.uio_iovcnt = 1;
2582		auio.uio_offset = 0;
2583		auio.uio_rw = UIO_READ;
2584		auio.uio_segflg = bufseg;
2585		auio.uio_td = td;
2586		auio.uio_resid = count;
2587		error = VOP_READLINK(vp, &auio, td->td_ucred);
2588		td->td_retval[0] = count - auio.uio_resid;
2589	}
2590	vput(vp);
2591	return (error);
2592}
2593
2594/*
2595 * Common implementation code for chflags() and fchflags().
2596 */
2597static int
2598setfflags(td, vp, flags)
2599	struct thread *td;
2600	struct vnode *vp;
2601	u_long flags;
2602{
2603	struct mount *mp;
2604	struct vattr vattr;
2605	int error;
2606
2607	/* We can't support the value matching VNOVAL. */
2608	if (flags == VNOVAL)
2609		return (EOPNOTSUPP);
2610
2611	/*
2612	 * Prevent non-root users from setting flags on devices.  When
2613	 * a device is reused, users can retain ownership of the device
2614	 * if they are allowed to set flags and programs assume that
2615	 * chown can't fail when done as root.
2616	 */
2617	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2618		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2619		if (error != 0)
2620			return (error);
2621	}
2622
2623	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2624		return (error);
2625	VATTR_NULL(&vattr);
2626	vattr.va_flags = flags;
2627	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2628#ifdef MAC
2629	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2630	if (error == 0)
2631#endif
2632		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2633	VOP_UNLOCK(vp, 0);
2634	vn_finished_write(mp);
2635	return (error);
2636}
2637
2638/*
2639 * Change flags of a file given a path name.
2640 */
2641#ifndef _SYS_SYSPROTO_H_
2642struct chflags_args {
2643	const char *path;
2644	u_long	flags;
2645};
2646#endif
2647int
2648sys_chflags(td, uap)
2649	struct thread *td;
2650	register struct chflags_args /* {
2651		const char *path;
2652		u_long flags;
2653	} */ *uap;
2654{
2655
2656	return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
2657}
2658
2659#ifndef _SYS_SYSPROTO_H_
2660struct chflagsat_args {
2661	int	fd;
2662	const char *path;
2663	u_long	flags;
2664	int	atflag;
2665}
2666#endif
2667int
2668sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2669{
2670	int fd = uap->fd;
2671	const char *path = uap->path;
2672	u_long flags = uap->flags;
2673	int atflag = uap->atflag;
2674
2675	if (atflag & ~AT_SYMLINK_NOFOLLOW)
2676		return (EINVAL);
2677
2678	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2679}
2680
2681static int
2682kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
2683    u_long flags)
2684{
2685
2686	return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
2687}
2688
2689/*
2690 * Same as chflags() but doesn't follow symlinks.
2691 */
2692int
2693sys_lchflags(td, uap)
2694	struct thread *td;
2695	register struct lchflags_args /* {
2696		const char *path;
2697		u_long flags;
2698	} */ *uap;
2699{
2700
2701	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2702	    uap->flags, AT_SYMLINK_NOFOLLOW));
2703}
2704
2705static int
2706kern_chflagsat(struct thread *td, int fd, const char *path,
2707    enum uio_seg pathseg, u_long flags, int atflag)
2708{
2709	struct nameidata nd;
2710	cap_rights_t rights;
2711	int error, follow;
2712
2713	AUDIT_ARG_FFLAGS(flags);
2714	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2715	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2716	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
2717	if ((error = namei(&nd)) != 0)
2718		return (error);
2719	NDFREE(&nd, NDF_ONLY_PNBUF);
2720	error = setfflags(td, nd.ni_vp, flags);
2721	vrele(nd.ni_vp);
2722	return (error);
2723}
2724
2725/*
2726 * Change flags of a file given a file descriptor.
2727 */
2728#ifndef _SYS_SYSPROTO_H_
2729struct fchflags_args {
2730	int	fd;
2731	u_long	flags;
2732};
2733#endif
2734int
2735sys_fchflags(td, uap)
2736	struct thread *td;
2737	register struct fchflags_args /* {
2738		int fd;
2739		u_long flags;
2740	} */ *uap;
2741{
2742	struct file *fp;
2743	cap_rights_t rights;
2744	int error;
2745
2746	AUDIT_ARG_FD(uap->fd);
2747	AUDIT_ARG_FFLAGS(uap->flags);
2748	error = getvnode(td->td_proc->p_fd, uap->fd,
2749	    cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
2750	if (error != 0)
2751		return (error);
2752#ifdef AUDIT
2753	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2754	AUDIT_ARG_VNODE1(fp->f_vnode);
2755	VOP_UNLOCK(fp->f_vnode, 0);
2756#endif
2757	error = setfflags(td, fp->f_vnode, uap->flags);
2758	fdrop(fp, td);
2759	return (error);
2760}
2761
2762/*
2763 * Common implementation code for chmod(), lchmod() and fchmod().
2764 */
2765int
2766setfmode(td, cred, vp, mode)
2767	struct thread *td;
2768	struct ucred *cred;
2769	struct vnode *vp;
2770	int mode;
2771{
2772	struct mount *mp;
2773	struct vattr vattr;
2774	int error;
2775
2776	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2777		return (error);
2778	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2779	VATTR_NULL(&vattr);
2780	vattr.va_mode = mode & ALLPERMS;
2781#ifdef MAC
2782	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2783	if (error == 0)
2784#endif
2785		error = VOP_SETATTR(vp, &vattr, cred);
2786	VOP_UNLOCK(vp, 0);
2787	vn_finished_write(mp);
2788	return (error);
2789}
2790
2791/*
2792 * Change mode of a file given path name.
2793 */
2794#ifndef _SYS_SYSPROTO_H_
2795struct chmod_args {
2796	char	*path;
2797	int	mode;
2798};
2799#endif
2800int
2801sys_chmod(td, uap)
2802	struct thread *td;
2803	register struct chmod_args /* {
2804		char *path;
2805		int mode;
2806	} */ *uap;
2807{
2808
2809	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2810}
2811
2812#ifndef _SYS_SYSPROTO_H_
2813struct fchmodat_args {
2814	int	dirfd;
2815	char	*path;
2816	mode_t	mode;
2817	int	flag;
2818}
2819#endif
2820int
2821sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2822{
2823	int flag = uap->flag;
2824	int fd = uap->fd;
2825	char *path = uap->path;
2826	mode_t mode = uap->mode;
2827
2828	if (flag & ~AT_SYMLINK_NOFOLLOW)
2829		return (EINVAL);
2830
2831	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2832}
2833
2834int
2835kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2836{
2837
2838	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2839}
2840
2841/*
2842 * Change mode of a file given path name (don't follow links.)
2843 */
2844#ifndef _SYS_SYSPROTO_H_
2845struct lchmod_args {
2846	char	*path;
2847	int	mode;
2848};
2849#endif
2850int
2851sys_lchmod(td, uap)
2852	struct thread *td;
2853	register struct lchmod_args /* {
2854		char *path;
2855		int mode;
2856	} */ *uap;
2857{
2858
2859	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2860	    uap->mode, AT_SYMLINK_NOFOLLOW));
2861}
2862
2863int
2864kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2865    mode_t mode, int flag)
2866{
2867	struct nameidata nd;
2868	cap_rights_t rights;
2869	int error, follow;
2870
2871	AUDIT_ARG_MODE(mode);
2872	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2873	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2874	    cap_rights_init(&rights, CAP_FCHMOD), td);
2875	if ((error = namei(&nd)) != 0)
2876		return (error);
2877	NDFREE(&nd, NDF_ONLY_PNBUF);
2878	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2879	vrele(nd.ni_vp);
2880	return (error);
2881}
2882
2883/*
2884 * Change mode of a file given a file descriptor.
2885 */
2886#ifndef _SYS_SYSPROTO_H_
2887struct fchmod_args {
2888	int	fd;
2889	int	mode;
2890};
2891#endif
2892int
2893sys_fchmod(struct thread *td, struct fchmod_args *uap)
2894{
2895	struct file *fp;
2896	cap_rights_t rights;
2897	int error;
2898
2899	AUDIT_ARG_FD(uap->fd);
2900	AUDIT_ARG_MODE(uap->mode);
2901
2902	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2903	if (error != 0)
2904		return (error);
2905	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2906	fdrop(fp, td);
2907	return (error);
2908}
2909
2910/*
2911 * Common implementation for chown(), lchown(), and fchown()
2912 */
2913int
2914setfown(td, cred, vp, uid, gid)
2915	struct thread *td;
2916	struct ucred *cred;
2917	struct vnode *vp;
2918	uid_t uid;
2919	gid_t gid;
2920{
2921	struct mount *mp;
2922	struct vattr vattr;
2923	int error;
2924
2925	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2926		return (error);
2927	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2928	VATTR_NULL(&vattr);
2929	vattr.va_uid = uid;
2930	vattr.va_gid = gid;
2931#ifdef MAC
2932	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2933	    vattr.va_gid);
2934	if (error == 0)
2935#endif
2936		error = VOP_SETATTR(vp, &vattr, cred);
2937	VOP_UNLOCK(vp, 0);
2938	vn_finished_write(mp);
2939	return (error);
2940}
2941
2942/*
2943 * Set ownership given a path name.
2944 */
2945#ifndef _SYS_SYSPROTO_H_
2946struct chown_args {
2947	char	*path;
2948	int	uid;
2949	int	gid;
2950};
2951#endif
2952int
2953sys_chown(td, uap)
2954	struct thread *td;
2955	register struct chown_args /* {
2956		char *path;
2957		int uid;
2958		int gid;
2959	} */ *uap;
2960{
2961
2962	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
2963}
2964
2965#ifndef _SYS_SYSPROTO_H_
2966struct fchownat_args {
2967	int fd;
2968	const char * path;
2969	uid_t uid;
2970	gid_t gid;
2971	int flag;
2972};
2973#endif
2974int
2975sys_fchownat(struct thread *td, struct fchownat_args *uap)
2976{
2977	int flag;
2978
2979	flag = uap->flag;
2980	if (flag & ~AT_SYMLINK_NOFOLLOW)
2981		return (EINVAL);
2982
2983	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2984	    uap->gid, uap->flag));
2985}
2986
2987int
2988kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
2989    int gid)
2990{
2991
2992	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
2993}
2994
2995int
2996kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2997    int uid, int gid, int flag)
2998{
2999	struct nameidata nd;
3000	cap_rights_t rights;
3001	int error, follow;
3002
3003	AUDIT_ARG_OWNER(uid, gid);
3004	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3005	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
3006	    cap_rights_init(&rights, CAP_FCHOWN), td);
3007
3008	if ((error = namei(&nd)) != 0)
3009		return (error);
3010	NDFREE(&nd, NDF_ONLY_PNBUF);
3011	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3012	vrele(nd.ni_vp);
3013	return (error);
3014}
3015
3016/*
3017 * Set ownership given a path name, do not cross symlinks.
3018 */
3019#ifndef _SYS_SYSPROTO_H_
3020struct lchown_args {
3021	char	*path;
3022	int	uid;
3023	int	gid;
3024};
3025#endif
3026int
3027sys_lchown(td, uap)
3028	struct thread *td;
3029	register struct lchown_args /* {
3030		char *path;
3031		int uid;
3032		int gid;
3033	} */ *uap;
3034{
3035
3036	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3037}
3038
3039int
3040kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3041    int gid)
3042{
3043
3044	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3045	    AT_SYMLINK_NOFOLLOW));
3046}
3047
3048/*
3049 * Set ownership given a file descriptor.
3050 */
3051#ifndef _SYS_SYSPROTO_H_
3052struct fchown_args {
3053	int	fd;
3054	int	uid;
3055	int	gid;
3056};
3057#endif
3058int
3059sys_fchown(td, uap)
3060	struct thread *td;
3061	register struct fchown_args /* {
3062		int fd;
3063		int uid;
3064		int gid;
3065	} */ *uap;
3066{
3067	struct file *fp;
3068	cap_rights_t rights;
3069	int error;
3070
3071	AUDIT_ARG_FD(uap->fd);
3072	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3073	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
3074	if (error != 0)
3075		return (error);
3076	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3077	fdrop(fp, td);
3078	return (error);
3079}
3080
3081/*
3082 * Common implementation code for utimes(), lutimes(), and futimes().
3083 */
3084static int
3085getutimes(usrtvp, tvpseg, tsp)
3086	const struct timeval *usrtvp;
3087	enum uio_seg tvpseg;
3088	struct timespec *tsp;
3089{
3090	struct timeval tv[2];
3091	const struct timeval *tvp;
3092	int error;
3093
3094	if (usrtvp == NULL) {
3095		vfs_timestamp(&tsp[0]);
3096		tsp[1] = tsp[0];
3097	} else {
3098		if (tvpseg == UIO_SYSSPACE) {
3099			tvp = usrtvp;
3100		} else {
3101			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3102				return (error);
3103			tvp = tv;
3104		}
3105
3106		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3107		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3108			return (EINVAL);
3109		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3110		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3111	}
3112	return (0);
3113}
3114
3115/*
3116 * Common implementation code for futimens(), utimensat().
3117 */
3118#define	UTIMENS_NULL	0x1
3119#define	UTIMENS_EXIT	0x2
3120static int
3121getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
3122    struct timespec *tsp, int *retflags)
3123{
3124	struct timespec tsnow;
3125	int error;
3126
3127	vfs_timestamp(&tsnow);
3128	*retflags = 0;
3129	if (usrtsp == NULL) {
3130		tsp[0] = tsnow;
3131		tsp[1] = tsnow;
3132		*retflags |= UTIMENS_NULL;
3133		return (0);
3134	}
3135	if (tspseg == UIO_SYSSPACE) {
3136		tsp[0] = usrtsp[0];
3137		tsp[1] = usrtsp[1];
3138	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
3139		return (error);
3140	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
3141		*retflags |= UTIMENS_EXIT;
3142	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
3143		*retflags |= UTIMENS_NULL;
3144	if (tsp[0].tv_nsec == UTIME_OMIT)
3145		tsp[0].tv_sec = VNOVAL;
3146	else if (tsp[0].tv_nsec == UTIME_NOW)
3147		tsp[0] = tsnow;
3148	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
3149		return (EINVAL);
3150	if (tsp[1].tv_nsec == UTIME_OMIT)
3151		tsp[1].tv_sec = VNOVAL;
3152	else if (tsp[1].tv_nsec == UTIME_NOW)
3153		tsp[1] = tsnow;
3154	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
3155		return (EINVAL);
3156
3157	return (0);
3158}
3159
3160/*
3161 * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
3162 * and utimensat().
3163 */
3164static int
3165setutimes(td, vp, ts, numtimes, nullflag)
3166	struct thread *td;
3167	struct vnode *vp;
3168	const struct timespec *ts;
3169	int numtimes;
3170	int nullflag;
3171{
3172	struct mount *mp;
3173	struct vattr vattr;
3174	int error, setbirthtime;
3175
3176	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3177		return (error);
3178	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3179	setbirthtime = 0;
3180	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3181	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3182		setbirthtime = 1;
3183	VATTR_NULL(&vattr);
3184	vattr.va_atime = ts[0];
3185	vattr.va_mtime = ts[1];
3186	if (setbirthtime)
3187		vattr.va_birthtime = ts[1];
3188	if (numtimes > 2)
3189		vattr.va_birthtime = ts[2];
3190	if (nullflag)
3191		vattr.va_vaflags |= VA_UTIMES_NULL;
3192#ifdef MAC
3193	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3194	    vattr.va_mtime);
3195#endif
3196	if (error == 0)
3197		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3198	VOP_UNLOCK(vp, 0);
3199	vn_finished_write(mp);
3200	return (error);
3201}
3202
3203/*
3204 * Set the access and modification times of a file.
3205 */
3206#ifndef _SYS_SYSPROTO_H_
3207struct utimes_args {
3208	char	*path;
3209	struct	timeval *tptr;
3210};
3211#endif
3212int
3213sys_utimes(td, uap)
3214	struct thread *td;
3215	register struct utimes_args /* {
3216		char *path;
3217		struct timeval *tptr;
3218	} */ *uap;
3219{
3220
3221	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3222	    UIO_USERSPACE));
3223}
3224
3225#ifndef _SYS_SYSPROTO_H_
3226struct futimesat_args {
3227	int fd;
3228	const char * path;
3229	const struct timeval * times;
3230};
3231#endif
3232int
3233sys_futimesat(struct thread *td, struct futimesat_args *uap)
3234{
3235
3236	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3237	    uap->times, UIO_USERSPACE));
3238}
3239
3240int
3241kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3242    struct timeval *tptr, enum uio_seg tptrseg)
3243{
3244
3245	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3246}
3247
3248int
3249kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3250    struct timeval *tptr, enum uio_seg tptrseg)
3251{
3252	struct nameidata nd;
3253	struct timespec ts[2];
3254	cap_rights_t rights;
3255	int error;
3256
3257	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3258		return (error);
3259	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3260	    cap_rights_init(&rights, CAP_FUTIMES), td);
3261
3262	if ((error = namei(&nd)) != 0)
3263		return (error);
3264	NDFREE(&nd, NDF_ONLY_PNBUF);
3265	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3266	vrele(nd.ni_vp);
3267	return (error);
3268}
3269
3270/*
3271 * Set the access and modification times of a file.
3272 */
3273#ifndef _SYS_SYSPROTO_H_
3274struct lutimes_args {
3275	char	*path;
3276	struct	timeval *tptr;
3277};
3278#endif
3279int
3280sys_lutimes(td, uap)
3281	struct thread *td;
3282	register struct lutimes_args /* {
3283		char *path;
3284		struct timeval *tptr;
3285	} */ *uap;
3286{
3287
3288	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3289	    UIO_USERSPACE));
3290}
3291
3292int
3293kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3294    struct timeval *tptr, enum uio_seg tptrseg)
3295{
3296	struct timespec ts[2];
3297	struct nameidata nd;
3298	int error;
3299
3300	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3301		return (error);
3302	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3303	if ((error = namei(&nd)) != 0)
3304		return (error);
3305	NDFREE(&nd, NDF_ONLY_PNBUF);
3306	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3307	vrele(nd.ni_vp);
3308	return (error);
3309}
3310
3311/*
3312 * Set the access and modification times of a file.
3313 */
3314#ifndef _SYS_SYSPROTO_H_
3315struct futimes_args {
3316	int	fd;
3317	struct	timeval *tptr;
3318};
3319#endif
3320int
3321sys_futimes(td, uap)
3322	struct thread *td;
3323	register struct futimes_args /* {
3324		int  fd;
3325		struct timeval *tptr;
3326	} */ *uap;
3327{
3328
3329	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3330}
3331
3332int
3333kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3334    enum uio_seg tptrseg)
3335{
3336	struct timespec ts[2];
3337	struct file *fp;
3338	cap_rights_t rights;
3339	int error;
3340
3341	AUDIT_ARG_FD(fd);
3342	error = getutimes(tptr, tptrseg, ts);
3343	if (error != 0)
3344		return (error);
3345	error = getvnode(td->td_proc->p_fd, fd,
3346	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
3347	if (error != 0)
3348		return (error);
3349#ifdef AUDIT
3350	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3351	AUDIT_ARG_VNODE1(fp->f_vnode);
3352	VOP_UNLOCK(fp->f_vnode, 0);
3353#endif
3354	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3355	fdrop(fp, td);
3356	return (error);
3357}
3358
3359int
3360sys_futimens(struct thread *td, struct futimens_args *uap)
3361{
3362
3363	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
3364}
3365
3366int
3367kern_futimens(struct thread *td, int fd, struct timespec *tptr,
3368    enum uio_seg tptrseg)
3369{
3370	struct timespec ts[2];
3371	struct file *fp;
3372	cap_rights_t rights;
3373	int error, flags;
3374
3375	AUDIT_ARG_FD(fd);
3376	error = getutimens(tptr, tptrseg, ts, &flags);
3377	if (error != 0)
3378		return (error);
3379	if (flags & UTIMENS_EXIT)
3380		return (0);
3381	error = getvnode(td->td_proc->p_fd, fd,
3382	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
3383	if (error != 0)
3384		return (error);
3385#ifdef AUDIT
3386	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3387	AUDIT_ARG_VNODE1(fp->f_vnode);
3388	VOP_UNLOCK(fp->f_vnode, 0);
3389#endif
3390	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
3391	fdrop(fp, td);
3392	return (error);
3393}
3394
3395int
3396sys_utimensat(struct thread *td, struct utimensat_args *uap)
3397{
3398
3399	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
3400	    uap->times, UIO_USERSPACE, uap->flag));
3401}
3402
3403int
3404kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3405    struct timespec *tptr, enum uio_seg tptrseg, int flag)
3406{
3407	struct nameidata nd;
3408	struct timespec ts[2];
3409	cap_rights_t rights;
3410	int error, flags;
3411
3412	if (flag & ~AT_SYMLINK_NOFOLLOW)
3413		return (EINVAL);
3414
3415	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
3416		return (error);
3417	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
3418	    FOLLOW) | AUDITVNODE1, pathseg, path, fd,
3419	    cap_rights_init(&rights, CAP_FUTIMES), td);
3420	if ((error = namei(&nd)) != 0)
3421		return (error);
3422	/*
3423	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
3424	 * POSIX states:
3425	 * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
3426	 * "Search permission is denied by a component of the path prefix."
3427	 */
3428	NDFREE(&nd, NDF_ONLY_PNBUF);
3429	if ((flags & UTIMENS_EXIT) == 0)
3430		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
3431	vrele(nd.ni_vp);
3432	return (error);
3433}
3434
3435/*
3436 * Truncate a file given its path name.
3437 */
3438#ifndef _SYS_SYSPROTO_H_
3439struct truncate_args {
3440	char	*path;
3441	int	pad;
3442	off_t	length;
3443};
3444#endif
3445int
3446sys_truncate(td, uap)
3447	struct thread *td;
3448	register struct truncate_args /* {
3449		char *path;
3450		int pad;
3451		off_t length;
3452	} */ *uap;
3453{
3454
3455	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3456}
3457
3458int
3459kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3460{
3461	struct mount *mp;
3462	struct vnode *vp;
3463	void *rl_cookie;
3464	struct vattr vattr;
3465	struct nameidata nd;
3466	int error;
3467
3468	if (length < 0)
3469		return(EINVAL);
3470	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3471	if ((error = namei(&nd)) != 0)
3472		return (error);
3473	vp = nd.ni_vp;
3474	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3475	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3476		vn_rangelock_unlock(vp, rl_cookie);
3477		vrele(vp);
3478		return (error);
3479	}
3480	NDFREE(&nd, NDF_ONLY_PNBUF);
3481	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3482	if (vp->v_type == VDIR)
3483		error = EISDIR;
3484#ifdef MAC
3485	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3486	}
3487#endif
3488	else if ((error = vn_writechk(vp)) == 0 &&
3489	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3490		VATTR_NULL(&vattr);
3491		vattr.va_size = length;
3492		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3493	}
3494	VOP_UNLOCK(vp, 0);
3495	vn_finished_write(mp);
3496	vn_rangelock_unlock(vp, rl_cookie);
3497	vrele(vp);
3498	return (error);
3499}
3500
#if defined(COMPAT_43)
/*
 * Compatibility truncate taking a 'long' length: repackages the arguments
 * as a truncate_args and calls sys_truncate().  The pad member of the new
 * argument structure is left unset.
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args targs;

	targs.path = uap->path;
	targs.length = uap->length;
	return (sys_truncate(td, &targs));
}
#endif /* COMPAT_43 */
3530
3531/* Versions with the pad argument */
3532int
3533freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3534{
3535	struct truncate_args ouap;
3536
3537	ouap.path = uap->path;
3538	ouap.length = uap->length;
3539	return (sys_truncate(td, &ouap));
3540}
3541
3542int
3543freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3544{
3545	struct ftruncate_args ouap;
3546
3547	ouap.fd = uap->fd;
3548	ouap.length = uap->length;
3549	return (sys_ftruncate(td, &ouap));
3550}
3551
3552/*
3553 * Sync an open file.
3554 */
3555#ifndef _SYS_SYSPROTO_H_
3556struct fsync_args {
3557	int	fd;
3558};
3559#endif
3560int
3561sys_fsync(td, uap)
3562	struct thread *td;
3563	struct fsync_args /* {
3564		int fd;
3565	} */ *uap;
3566{
3567	struct vnode *vp;
3568	struct mount *mp;
3569	struct file *fp;
3570	cap_rights_t rights;
3571	int error, lock_flags;
3572
3573	AUDIT_ARG_FD(uap->fd);
3574	error = getvnode(td->td_proc->p_fd, uap->fd,
3575	    cap_rights_init(&rights, CAP_FSYNC), &fp);
3576	if (error != 0)
3577		return (error);
3578	vp = fp->f_vnode;
3579	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3580	if (error != 0)
3581		goto drop;
3582	if (MNT_SHARED_WRITES(mp) ||
3583	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3584		lock_flags = LK_SHARED;
3585	} else {
3586		lock_flags = LK_EXCLUSIVE;
3587	}
3588	vn_lock(vp, lock_flags | LK_RETRY);
3589	AUDIT_ARG_VNODE1(vp);
3590	if (vp->v_object != NULL) {
3591		VM_OBJECT_WLOCK(vp->v_object);
3592		vm_object_page_clean(vp->v_object, 0, 0, 0);
3593		VM_OBJECT_WUNLOCK(vp->v_object);
3594	}
3595	error = VOP_FSYNC(vp, MNT_WAIT, td);
3596
3597	VOP_UNLOCK(vp, 0);
3598	vn_finished_write(mp);
3599drop:
3600	fdrop(fp, td);
3601	return (error);
3602}
3603
3604/*
3605 * Rename files.  Source and destination must either both be directories, or
3606 * both not be directories.  If target is a directory, it must be empty.
3607 */
3608#ifndef _SYS_SYSPROTO_H_
3609struct rename_args {
3610	char	*from;
3611	char	*to;
3612};
3613#endif
3614int
3615sys_rename(td, uap)
3616	struct thread *td;
3617	register struct rename_args /* {
3618		char *from;
3619		char *to;
3620	} */ *uap;
3621{
3622
3623	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3624}
3625
3626#ifndef _SYS_SYSPROTO_H_
3627struct renameat_args {
3628	int	oldfd;
3629	char	*old;
3630	int	newfd;
3631	char	*new;
3632};
3633#endif
3634int
3635sys_renameat(struct thread *td, struct renameat_args *uap)
3636{
3637
3638	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3639	    UIO_USERSPACE));
3640}
3641
3642int
3643kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3644{
3645
3646	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3647}
3648
3649int
3650kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3651    enum uio_seg pathseg)
3652{
3653	struct mount *mp = NULL;
3654	struct vnode *tvp, *fvp, *tdvp;
3655	struct nameidata fromnd, tond;
3656	cap_rights_t rights;
3657	int error;
3658
3659again:
3660	bwillwrite();
3661#ifdef MAC
3662	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3663	    AUDITVNODE1, pathseg, old, oldfd,
3664	    cap_rights_init(&rights, CAP_RENAMEAT), td);
3665#else
3666	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3667	    pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
3668#endif
3669
3670	if ((error = namei(&fromnd)) != 0)
3671		return (error);
3672#ifdef MAC
3673	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3674	    fromnd.ni_vp, &fromnd.ni_cnd);
3675	VOP_UNLOCK(fromnd.ni_dvp, 0);
3676	if (fromnd.ni_dvp != fromnd.ni_vp)
3677		VOP_UNLOCK(fromnd.ni_vp, 0);
3678#endif
3679	fvp = fromnd.ni_vp;
3680	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3681	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3682	    cap_rights_init(&rights, CAP_LINKAT), td);
3683	if (fromnd.ni_vp->v_type == VDIR)
3684		tond.ni_cnd.cn_flags |= WILLBEDIR;
3685	if ((error = namei(&tond)) != 0) {
3686		/* Translate error code for rename("dir1", "dir2/."). */
3687		if (error == EISDIR && fvp->v_type == VDIR)
3688			error = EINVAL;
3689		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3690		vrele(fromnd.ni_dvp);
3691		vrele(fvp);
3692		goto out1;
3693	}
3694	tdvp = tond.ni_dvp;
3695	tvp = tond.ni_vp;
3696	error = vn_start_write(fvp, &mp, V_NOWAIT);
3697	if (error != 0) {
3698		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3699		NDFREE(&tond, NDF_ONLY_PNBUF);
3700		if (tvp != NULL)
3701			vput(tvp);
3702		if (tdvp == tvp)
3703			vrele(tdvp);
3704		else
3705			vput(tdvp);
3706		vrele(fromnd.ni_dvp);
3707		vrele(fvp);
3708		vrele(tond.ni_startdir);
3709		if (fromnd.ni_startdir != NULL)
3710			vrele(fromnd.ni_startdir);
3711		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
3712		if (error != 0)
3713			return (error);
3714		goto again;
3715	}
3716	if (tvp != NULL) {
3717		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3718			error = ENOTDIR;
3719			goto out;
3720		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3721			error = EISDIR;
3722			goto out;
3723		}
3724#ifdef CAPABILITIES
3725		if (newfd != AT_FDCWD) {
3726			/*
3727			 * If the target already exists we require CAP_UNLINKAT
3728			 * from 'newfd'.
3729			 */
3730			error = cap_check(&tond.ni_filecaps.fc_rights,
3731			    cap_rights_init(&rights, CAP_UNLINKAT));
3732			if (error != 0)
3733				goto out;
3734		}
3735#endif
3736	}
3737	if (fvp == tdvp) {
3738		error = EINVAL;
3739		goto out;
3740	}
3741	/*
3742	 * If the source is the same as the destination (that is, if they
3743	 * are links to the same vnode), then there is nothing to do.
3744	 */
3745	if (fvp == tvp)
3746		error = -1;
3747#ifdef MAC
3748	else
3749		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3750		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3751#endif
3752out:
3753	if (error == 0) {
3754		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3755		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3756		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3757		NDFREE(&tond, NDF_ONLY_PNBUF);
3758	} else {
3759		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3760		NDFREE(&tond, NDF_ONLY_PNBUF);
3761		if (tvp != NULL)
3762			vput(tvp);
3763		if (tdvp == tvp)
3764			vrele(tdvp);
3765		else
3766			vput(tdvp);
3767		vrele(fromnd.ni_dvp);
3768		vrele(fvp);
3769	}
3770	vrele(tond.ni_startdir);
3771	vn_finished_write(mp);
3772out1:
3773	if (fromnd.ni_startdir)
3774		vrele(fromnd.ni_startdir);
3775	if (error == -1)
3776		return (0);
3777	return (error);
3778}
3779
3780/*
3781 * Make a directory file.
3782 */
3783#ifndef _SYS_SYSPROTO_H_
3784struct mkdir_args {
3785	char	*path;
3786	int	mode;
3787};
3788#endif
3789int
3790sys_mkdir(td, uap)
3791	struct thread *td;
3792	register struct mkdir_args /* {
3793		char *path;
3794		int mode;
3795	} */ *uap;
3796{
3797
3798	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3799}
3800
3801#ifndef _SYS_SYSPROTO_H_
3802struct mkdirat_args {
3803	int	fd;
3804	char	*path;
3805	mode_t	mode;
3806};
3807#endif
3808int
3809sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3810{
3811
3812	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3813}
3814
3815int
3816kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3817{
3818
3819	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3820}
3821
3822int
3823kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3824    int mode)
3825{
3826	struct mount *mp;
3827	struct vnode *vp;
3828	struct vattr vattr;
3829	struct nameidata nd;
3830	cap_rights_t rights;
3831	int error;
3832
3833	AUDIT_ARG_MODE(mode);
3834restart:
3835	bwillwrite();
3836	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
3837	    NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
3838	    td);
3839	nd.ni_cnd.cn_flags |= WILLBEDIR;
3840	if ((error = namei(&nd)) != 0)
3841		return (error);
3842	vp = nd.ni_vp;
3843	if (vp != NULL) {
3844		NDFREE(&nd, NDF_ONLY_PNBUF);
3845		/*
3846		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
3847		 * the strange behaviour of leaving the vnode unlocked
3848		 * if the target is the same vnode as the parent.
3849		 */
3850		if (vp == nd.ni_dvp)
3851			vrele(nd.ni_dvp);
3852		else
3853			vput(nd.ni_dvp);
3854		vrele(vp);
3855		return (EEXIST);
3856	}
3857	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3858		NDFREE(&nd, NDF_ONLY_PNBUF);
3859		vput(nd.ni_dvp);
3860		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3861			return (error);
3862		goto restart;
3863	}
3864	VATTR_NULL(&vattr);
3865	vattr.va_type = VDIR;
3866	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3867#ifdef MAC
3868	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3869	    &vattr);
3870	if (error != 0)
3871		goto out;
3872#endif
3873	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3874#ifdef MAC
3875out:
3876#endif
3877	NDFREE(&nd, NDF_ONLY_PNBUF);
3878	vput(nd.ni_dvp);
3879	if (error == 0)
3880		vput(nd.ni_vp);
3881	vn_finished_write(mp);
3882	return (error);
3883}
3884
3885/*
3886 * Remove a directory file.
3887 */
3888#ifndef _SYS_SYSPROTO_H_
3889struct rmdir_args {
3890	char	*path;
3891};
3892#endif
3893int
3894sys_rmdir(td, uap)
3895	struct thread *td;
3896	struct rmdir_args /* {
3897		char *path;
3898	} */ *uap;
3899{
3900
3901	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3902}
3903
3904int
3905kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3906{
3907
3908	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3909}
3910
3911int
3912kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3913{
3914	struct mount *mp;
3915	struct vnode *vp;
3916	struct nameidata nd;
3917	cap_rights_t rights;
3918	int error;
3919
3920restart:
3921	bwillwrite();
3922	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3923	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
3924	if ((error = namei(&nd)) != 0)
3925		return (error);
3926	vp = nd.ni_vp;
3927	if (vp->v_type != VDIR) {
3928		error = ENOTDIR;
3929		goto out;
3930	}
3931	/*
3932	 * No rmdir "." please.
3933	 */
3934	if (nd.ni_dvp == vp) {
3935		error = EINVAL;
3936		goto out;
3937	}
3938	/*
3939	 * The root of a mounted filesystem cannot be deleted.
3940	 */
3941	if (vp->v_vflag & VV_ROOT) {
3942		error = EBUSY;
3943		goto out;
3944	}
3945#ifdef MAC
3946	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3947	    &nd.ni_cnd);
3948	if (error != 0)
3949		goto out;
3950#endif
3951	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3952		NDFREE(&nd, NDF_ONLY_PNBUF);
3953		vput(vp);
3954		if (nd.ni_dvp == vp)
3955			vrele(nd.ni_dvp);
3956		else
3957			vput(nd.ni_dvp);
3958		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3959			return (error);
3960		goto restart;
3961	}
3962	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
3963	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3964	vn_finished_write(mp);
3965out:
3966	NDFREE(&nd, NDF_ONLY_PNBUF);
3967	vput(vp);
3968	if (nd.ni_dvp == vp)
3969		vrele(nd.ni_dvp);
3970	else
3971		vput(nd.ni_dvp);
3972	return (error);
3973}
3974
3975#ifdef COMPAT_43
3976/*
3977 * Read a block of directory entries in a filesystem independent format.
3978 */
3979#ifndef _SYS_SYSPROTO_H_
3980struct ogetdirentries_args {
3981	int	fd;
3982	char	*buf;
3983	u_int	count;
3984	long	*basep;
3985};
3986#endif
3987int
3988ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3989{
3990	long loff;
3991	int error;
3992
3993	error = kern_ogetdirentries(td, uap, &loff);
3994	if (error == 0)
3995		error = copyout(&loff, uap->basep, sizeof(long));
3996	return (error);
3997}
3998
3999int
4000kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
4001    long *ploff)
4002{
4003	struct vnode *vp;
4004	struct file *fp;
4005	struct uio auio, kuio;
4006	struct iovec aiov, kiov;
4007	struct dirent *dp, *edp;
4008	cap_rights_t rights;
4009	caddr_t dirbuf;
4010	int error, eofflag, readcnt;
4011	long loff;
4012	off_t foffset;
4013
4014	/* XXX arbitrary sanity limit on `count'. */
4015	if (uap->count > 64 * 1024)
4016		return (EINVAL);
4017	error = getvnode(td->td_proc->p_fd, uap->fd,
4018	    cap_rights_init(&rights, CAP_READ), &fp);
4019	if (error != 0)
4020		return (error);
4021	if ((fp->f_flag & FREAD) == 0) {
4022		fdrop(fp, td);
4023		return (EBADF);
4024	}
4025	vp = fp->f_vnode;
4026	foffset = foffset_lock(fp, 0);
4027unionread:
4028	if (vp->v_type != VDIR) {
4029		foffset_unlock(fp, foffset, 0);
4030		fdrop(fp, td);
4031		return (EINVAL);
4032	}
4033	aiov.iov_base = uap->buf;
4034	aiov.iov_len = uap->count;
4035	auio.uio_iov = &aiov;
4036	auio.uio_iovcnt = 1;
4037	auio.uio_rw = UIO_READ;
4038	auio.uio_segflg = UIO_USERSPACE;
4039	auio.uio_td = td;
4040	auio.uio_resid = uap->count;
4041	vn_lock(vp, LK_SHARED | LK_RETRY);
4042	loff = auio.uio_offset = foffset;
4043#ifdef MAC
4044	error = mac_vnode_check_readdir(td->td_ucred, vp);
4045	if (error != 0) {
4046		VOP_UNLOCK(vp, 0);
4047		foffset_unlock(fp, foffset, FOF_NOUPDATE);
4048		fdrop(fp, td);
4049		return (error);
4050	}
4051#endif
4052#	if (BYTE_ORDER != LITTLE_ENDIAN)
4053		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
4054			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
4055			    NULL, NULL);
4056			foffset = auio.uio_offset;
4057		} else
4058#	endif
4059	{
4060		kuio = auio;
4061		kuio.uio_iov = &kiov;
4062		kuio.uio_segflg = UIO_SYSSPACE;
4063		kiov.iov_len = uap->count;
4064		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
4065		kiov.iov_base = dirbuf;
4066		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
4067			    NULL, NULL);
4068		foffset = kuio.uio_offset;
4069		if (error == 0) {
4070			readcnt = uap->count - kuio.uio_resid;
4071			edp = (struct dirent *)&dirbuf[readcnt];
4072			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
4073#				if (BYTE_ORDER == LITTLE_ENDIAN)
4074					/*
4075					 * The expected low byte of
4076					 * dp->d_namlen is our dp->d_type.
4077					 * The high MBZ byte of dp->d_namlen
4078					 * is our dp->d_namlen.
4079					 */
4080					dp->d_type = dp->d_namlen;
4081					dp->d_namlen = 0;
4082#				else
4083					/*
4084					 * The dp->d_type is the high byte
4085					 * of the expected dp->d_namlen,
4086					 * so must be zero'ed.
4087					 */
4088					dp->d_type = 0;
4089#				endif
4090				if (dp->d_reclen > 0) {
4091					dp = (struct dirent *)
4092					    ((char *)dp + dp->d_reclen);
4093				} else {
4094					error = EIO;
4095					break;
4096				}
4097			}
4098			if (dp >= edp)
4099				error = uiomove(dirbuf, readcnt, &auio);
4100		}
4101		free(dirbuf, M_TEMP);
4102	}
4103	if (error != 0) {
4104		VOP_UNLOCK(vp, 0);
4105		foffset_unlock(fp, foffset, 0);
4106		fdrop(fp, td);
4107		return (error);
4108	}
4109	if (uap->count == auio.uio_resid &&
4110	    (vp->v_vflag & VV_ROOT) &&
4111	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4112		struct vnode *tvp = vp;
4113		vp = vp->v_mount->mnt_vnodecovered;
4114		VREF(vp);
4115		fp->f_vnode = vp;
4116		fp->f_data = vp;
4117		foffset = 0;
4118		vput(tvp);
4119		goto unionread;
4120	}
4121	VOP_UNLOCK(vp, 0);
4122	foffset_unlock(fp, foffset, 0);
4123	fdrop(fp, td);
4124	td->td_retval[0] = uap->count - auio.uio_resid;
4125	if (error == 0)
4126		*ploff = loff;
4127	return (error);
4128}
4129#endif /* COMPAT_43 */
4130
4131/*
4132 * Read a block of directory entries in a filesystem independent format.
4133 */
4134#ifndef _SYS_SYSPROTO_H_
4135struct getdirentries_args {
4136	int	fd;
4137	char	*buf;
4138	u_int	count;
4139	long	*basep;
4140};
4141#endif
4142int
4143sys_getdirentries(td, uap)
4144	struct thread *td;
4145	register struct getdirentries_args /* {
4146		int fd;
4147		char *buf;
4148		u_int count;
4149		long *basep;
4150	} */ *uap;
4151{
4152	long base;
4153	int error;
4154
4155	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4156	    NULL, UIO_USERSPACE);
4157	if (error != 0)
4158		return (error);
4159	if (uap->basep != NULL)
4160		error = copyout(&base, uap->basep, sizeof(long));
4161	return (error);
4162}
4163
4164int
4165kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4166    long *basep, ssize_t *residp, enum uio_seg bufseg)
4167{
4168	struct vnode *vp;
4169	struct file *fp;
4170	struct uio auio;
4171	struct iovec aiov;
4172	cap_rights_t rights;
4173	long loff;
4174	int error, eofflag;
4175	off_t foffset;
4176
4177	AUDIT_ARG_FD(fd);
4178	if (count > IOSIZE_MAX)
4179		return (EINVAL);
4180	auio.uio_resid = count;
4181	error = getvnode(td->td_proc->p_fd, fd,
4182	    cap_rights_init(&rights, CAP_READ), &fp);
4183	if (error != 0)
4184		return (error);
4185	if ((fp->f_flag & FREAD) == 0) {
4186		fdrop(fp, td);
4187		return (EBADF);
4188	}
4189	vp = fp->f_vnode;
4190	foffset = foffset_lock(fp, 0);
4191unionread:
4192	if (vp->v_type != VDIR) {
4193		error = EINVAL;
4194		goto fail;
4195	}
4196	aiov.iov_base = buf;
4197	aiov.iov_len = count;
4198	auio.uio_iov = &aiov;
4199	auio.uio_iovcnt = 1;
4200	auio.uio_rw = UIO_READ;
4201	auio.uio_segflg = bufseg;
4202	auio.uio_td = td;
4203	vn_lock(vp, LK_SHARED | LK_RETRY);
4204	AUDIT_ARG_VNODE1(vp);
4205	loff = auio.uio_offset = foffset;
4206#ifdef MAC
4207	error = mac_vnode_check_readdir(td->td_ucred, vp);
4208	if (error == 0)
4209#endif
4210		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4211		    NULL);
4212	foffset = auio.uio_offset;
4213	if (error != 0) {
4214		VOP_UNLOCK(vp, 0);
4215		goto fail;
4216	}
4217	if (count == auio.uio_resid &&
4218	    (vp->v_vflag & VV_ROOT) &&
4219	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4220		struct vnode *tvp = vp;
4221
4222		vp = vp->v_mount->mnt_vnodecovered;
4223		VREF(vp);
4224		fp->f_vnode = vp;
4225		fp->f_data = vp;
4226		foffset = 0;
4227		vput(tvp);
4228		goto unionread;
4229	}
4230	VOP_UNLOCK(vp, 0);
4231	*basep = loff;
4232	if (residp != NULL)
4233		*residp = auio.uio_resid;
4234	td->td_retval[0] = count - auio.uio_resid;
4235fail:
4236	foffset_unlock(fp, foffset, 0);
4237	fdrop(fp, td);
4238	return (error);
4239}
4240
4241#ifndef _SYS_SYSPROTO_H_
4242struct getdents_args {
4243	int fd;
4244	char *buf;
4245	size_t count;
4246};
4247#endif
4248int
4249sys_getdents(td, uap)
4250	struct thread *td;
4251	register struct getdents_args /* {
4252		int fd;
4253		char *buf;
4254		u_int count;
4255	} */ *uap;
4256{
4257	struct getdirentries_args ap;
4258
4259	ap.fd = uap->fd;
4260	ap.buf = uap->buf;
4261	ap.count = uap->count;
4262	ap.basep = NULL;
4263	return (sys_getdirentries(td, &ap));
4264}
4265
4266/*
4267 * Set the mode mask for creation of filesystem nodes.
4268 */
4269#ifndef _SYS_SYSPROTO_H_
4270struct umask_args {
4271	int	newmask;
4272};
4273#endif
4274int
4275sys_umask(td, uap)
4276	struct thread *td;
4277	struct umask_args /* {
4278		int newmask;
4279	} */ *uap;
4280{
4281	register struct filedesc *fdp;
4282
4283	FILEDESC_XLOCK(td->td_proc->p_fd);
4284	fdp = td->td_proc->p_fd;
4285	td->td_retval[0] = fdp->fd_cmask;
4286	fdp->fd_cmask = uap->newmask & ALLPERMS;
4287	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4288	return (0);
4289}
4290
4291/*
4292 * Void all references to file by ripping underlying filesystem away from
4293 * vnode.
4294 */
4295#ifndef _SYS_SYSPROTO_H_
4296struct revoke_args {
4297	char	*path;
4298};
4299#endif
4300int
4301sys_revoke(td, uap)
4302	struct thread *td;
4303	register struct revoke_args /* {
4304		char *path;
4305	} */ *uap;
4306{
4307	struct vnode *vp;
4308	struct vattr vattr;
4309	struct nameidata nd;
4310	int error;
4311
4312	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4313	    uap->path, td);
4314	if ((error = namei(&nd)) != 0)
4315		return (error);
4316	vp = nd.ni_vp;
4317	NDFREE(&nd, NDF_ONLY_PNBUF);
4318	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4319		error = EINVAL;
4320		goto out;
4321	}
4322#ifdef MAC
4323	error = mac_vnode_check_revoke(td->td_ucred, vp);
4324	if (error != 0)
4325		goto out;
4326#endif
4327	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4328	if (error != 0)
4329		goto out;
4330	if (td->td_ucred->cr_uid != vattr.va_uid) {
4331		error = priv_check(td, PRIV_VFS_ADMIN);
4332		if (error != 0)
4333			goto out;
4334	}
4335	if (vcount(vp) > 1)
4336		VOP_REVOKE(vp, REVOKEALL);
4337out:
4338	vput(vp);
4339	return (error);
4340}
4341
4342/*
4343 * Convert a user file descriptor to a kernel file entry and check that, if it
4344 * is a capability, the correct rights are present. A reference on the file
4345 * entry is held upon returning.
4346 */
4347int
4348getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
4349{
4350	struct file *fp;
4351	int error;
4352
4353	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
4354	if (error != 0)
4355		return (error);
4356
4357	/*
4358	 * The file could be not of the vnode type, or it may be not
4359	 * yet fully initialized, in which case the f_vnode pointer
4360	 * may be set, but f_ops is still badfileops.  E.g.,
4361	 * devfs_open() transiently create such situation to
4362	 * facilitate csw d_fdopen().
4363	 *
4364	 * Dupfdopen() handling in kern_openat() installs the
4365	 * half-baked file into the process descriptor table, allowing
4366	 * other thread to dereference it. Guard against the race by
4367	 * checking f_ops.
4368	 */
4369	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4370		fdrop(fp, curthread);
4371		return (EINVAL);
4372	}
4373	*fpp = fp;
4374	return (0);
4375}
4376
4377
4378/*
4379 * Get an (NFS) file handle.
4380 */
4381#ifndef _SYS_SYSPROTO_H_
4382struct lgetfh_args {
4383	char	*fname;
4384	fhandle_t *fhp;
4385};
4386#endif
4387int
4388sys_lgetfh(td, uap)
4389	struct thread *td;
4390	register struct lgetfh_args *uap;
4391{
4392	struct nameidata nd;
4393	fhandle_t fh;
4394	register struct vnode *vp;
4395	int error;
4396
4397	error = priv_check(td, PRIV_VFS_GETFH);
4398	if (error != 0)
4399		return (error);
4400	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4401	    uap->fname, td);
4402	error = namei(&nd);
4403	if (error != 0)
4404		return (error);
4405	NDFREE(&nd, NDF_ONLY_PNBUF);
4406	vp = nd.ni_vp;
4407	bzero(&fh, sizeof(fh));
4408	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4409	error = VOP_VPTOFH(vp, &fh.fh_fid);
4410	vput(vp);
4411	if (error == 0)
4412		error = copyout(&fh, uap->fhp, sizeof (fh));
4413	return (error);
4414}
4415
4416#ifndef _SYS_SYSPROTO_H_
4417struct getfh_args {
4418	char	*fname;
4419	fhandle_t *fhp;
4420};
4421#endif
4422int
4423sys_getfh(td, uap)
4424	struct thread *td;
4425	register struct getfh_args *uap;
4426{
4427	struct nameidata nd;
4428	fhandle_t fh;
4429	register struct vnode *vp;
4430	int error;
4431
4432	error = priv_check(td, PRIV_VFS_GETFH);
4433	if (error != 0)
4434		return (error);
4435	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4436	    uap->fname, td);
4437	error = namei(&nd);
4438	if (error != 0)
4439		return (error);
4440	NDFREE(&nd, NDF_ONLY_PNBUF);
4441	vp = nd.ni_vp;
4442	bzero(&fh, sizeof(fh));
4443	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4444	error = VOP_VPTOFH(vp, &fh.fh_fid);
4445	vput(vp);
4446	if (error == 0)
4447		error = copyout(&fh, uap->fhp, sizeof (fh));
4448	return (error);
4449}
4450
4451/*
4452 * syscall for the rpc.lockd to use to translate a NFS file handle into an
4453 * open descriptor.
4454 *
4455 * warning: do not remove the priv_check() call or this becomes one giant
4456 * security hole.
4457 */
4458#ifndef _SYS_SYSPROTO_H_
4459struct fhopen_args {
4460	const struct fhandle *u_fhp;
4461	int flags;
4462};
4463#endif
4464int
4465sys_fhopen(td, uap)
4466	struct thread *td;
4467	struct fhopen_args /* {
4468		const struct fhandle *u_fhp;
4469		int flags;
4470	} */ *uap;
4471{
4472	struct mount *mp;
4473	struct vnode *vp;
4474	struct fhandle fhp;
4475	struct file *fp;
4476	int fmode, error;
4477	int indx;
4478
4479	error = priv_check(td, PRIV_VFS_FHOPEN);
4480	if (error != 0)
4481		return (error);
4482	indx = -1;
4483	fmode = FFLAGS(uap->flags);
4484	/* why not allow a non-read/write open for our lockd? */
4485	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4486		return (EINVAL);
4487	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4488	if (error != 0)
4489		return(error);
4490	/* find the mount point */
4491	mp = vfs_busyfs(&fhp.fh_fsid);
4492	if (mp == NULL)
4493		return (ESTALE);
4494	/* now give me my vnode, it gets returned to me locked */
4495	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4496	vfs_unbusy(mp);
4497	if (error != 0)
4498		return (error);
4499
4500	error = falloc_noinstall(td, &fp);
4501	if (error != 0) {
4502		vput(vp);
4503		return (error);
4504	}
4505	/*
4506	 * An extra reference on `fp' has been held for us by
4507	 * falloc_noinstall().
4508	 */
4509
4510#ifdef INVARIANTS
4511	td->td_dupfd = -1;
4512#endif
4513	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4514	if (error != 0) {
4515		KASSERT(fp->f_ops == &badfileops,
4516		    ("VOP_OPEN in fhopen() set f_ops"));
4517		KASSERT(td->td_dupfd < 0,
4518		    ("fhopen() encountered fdopen()"));
4519
4520		vput(vp);
4521		goto bad;
4522	}
4523#ifdef INVARIANTS
4524	td->td_dupfd = 0;
4525#endif
4526	fp->f_vnode = vp;
4527	fp->f_seqcount = 1;
4528	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4529	    &vnops);
4530	VOP_UNLOCK(vp, 0);
4531	if ((fmode & O_TRUNC) != 0) {
4532		error = fo_truncate(fp, 0, td->td_ucred, td);
4533		if (error != 0)
4534			goto bad;
4535	}
4536
4537	error = finstall(td, fp, &indx, fmode, NULL);
4538bad:
4539	fdrop(fp, td);
4540	td->td_retval[0] = indx;
4541	return (error);
4542}
4543
4544/*
4545 * Stat an (NFS) file handle.
4546 */
4547#ifndef _SYS_SYSPROTO_H_
4548struct fhstat_args {
4549	struct fhandle *u_fhp;
4550	struct stat *sb;
4551};
4552#endif
4553int
4554sys_fhstat(td, uap)
4555	struct thread *td;
4556	register struct fhstat_args /* {
4557		struct fhandle *u_fhp;
4558		struct stat *sb;
4559	} */ *uap;
4560{
4561	struct stat sb;
4562	struct fhandle fh;
4563	int error;
4564
4565	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4566	if (error != 0)
4567		return (error);
4568	error = kern_fhstat(td, fh, &sb);
4569	if (error == 0)
4570		error = copyout(&sb, uap->sb, sizeof(sb));
4571	return (error);
4572}
4573
4574int
4575kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4576{
4577	struct mount *mp;
4578	struct vnode *vp;
4579	int error;
4580
4581	error = priv_check(td, PRIV_VFS_FHSTAT);
4582	if (error != 0)
4583		return (error);
4584	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4585		return (ESTALE);
4586	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4587	vfs_unbusy(mp);
4588	if (error != 0)
4589		return (error);
4590	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4591	vput(vp);
4592	return (error);
4593}
4594
4595/*
4596 * Implement fstatfs() for (NFS) file handles.
4597 */
4598#ifndef _SYS_SYSPROTO_H_
4599struct fhstatfs_args {
4600	struct fhandle *u_fhp;
4601	struct statfs *buf;
4602};
4603#endif
4604int
4605sys_fhstatfs(td, uap)
4606	struct thread *td;
4607	struct fhstatfs_args /* {
4608		struct fhandle *u_fhp;
4609		struct statfs *buf;
4610	} */ *uap;
4611{
4612	struct statfs sf;
4613	fhandle_t fh;
4614	int error;
4615
4616	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4617	if (error != 0)
4618		return (error);
4619	error = kern_fhstatfs(td, fh, &sf);
4620	if (error != 0)
4621		return (error);
4622	return (copyout(&sf, uap->buf, sizeof(sf)));
4623}
4624
4625int
4626kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4627{
4628	struct statfs *sp;
4629	struct mount *mp;
4630	struct vnode *vp;
4631	int error;
4632
4633	error = priv_check(td, PRIV_VFS_FHSTATFS);
4634	if (error != 0)
4635		return (error);
4636	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4637		return (ESTALE);
4638	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4639	if (error != 0) {
4640		vfs_unbusy(mp);
4641		return (error);
4642	}
4643	vput(vp);
4644	error = prison_canseemount(td->td_ucred, mp);
4645	if (error != 0)
4646		goto out;
4647#ifdef MAC
4648	error = mac_mount_check_stat(td->td_ucred, mp);
4649	if (error != 0)
4650		goto out;
4651#endif
4652	/*
4653	 * Set these in case the underlying filesystem fails to do so.
4654	 */
4655	sp = &mp->mnt_stat;
4656	sp->f_version = STATFS_VERSION;
4657	sp->f_namemax = NAME_MAX;
4658	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4659	error = VFS_STATFS(mp, sp);
4660	if (error == 0)
4661		*buf = *sp;
4662out:
4663	vfs_unbusy(mp);
4664	return (error);
4665}
4666
4667int
4668kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4669{
4670	struct file *fp;
4671	struct mount *mp;
4672	struct vnode *vp;
4673	cap_rights_t rights;
4674	off_t olen, ooffset;
4675	int error;
4676
4677	if (offset < 0 || len <= 0)
4678		return (EINVAL);
4679	/* Check for wrap. */
4680	if (offset > OFF_MAX - len)
4681		return (EFBIG);
4682	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4683	if (error != 0)
4684		return (error);
4685	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4686		error = ESPIPE;
4687		goto out;
4688	}
4689	if ((fp->f_flag & FWRITE) == 0) {
4690		error = EBADF;
4691		goto out;
4692	}
4693	if (fp->f_type != DTYPE_VNODE) {
4694		error = ENODEV;
4695		goto out;
4696	}
4697	vp = fp->f_vnode;
4698	if (vp->v_type != VREG) {
4699		error = ENODEV;
4700		goto out;
4701	}
4702
4703	/* Allocating blocks may take a long time, so iterate. */
4704	for (;;) {
4705		olen = len;
4706		ooffset = offset;
4707
4708		bwillwrite();
4709		mp = NULL;
4710		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4711		if (error != 0)
4712			break;
4713		error = vn_lock(vp, LK_EXCLUSIVE);
4714		if (error != 0) {
4715			vn_finished_write(mp);
4716			break;
4717		}
4718#ifdef MAC
4719		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4720		if (error == 0)
4721#endif
4722			error = VOP_ALLOCATE(vp, &offset, &len);
4723		VOP_UNLOCK(vp, 0);
4724		vn_finished_write(mp);
4725
4726		if (olen + ooffset != offset + len) {
4727			panic("offset + len changed from %jx/%jx to %jx/%jx",
4728			    ooffset, olen, offset, len);
4729		}
4730		if (error != 0 || len == 0)
4731			break;
4732		KASSERT(olen > len, ("Iteration did not make progress?"));
4733		maybe_yield();
4734	}
4735 out:
4736	fdrop(fp, td);
4737	return (error);
4738}
4739
4740int
4741sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4742{
4743
4744	td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
4745	    uap->len);
4746	return (0);
4747}
4748
4749/*
4750 * Unlike madvise(2), we do not make a best effort to remember every
4751 * possible caching hint.  Instead, we remember the last setting with
4752 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4753 * region of any current setting.
4754 */
4755int
4756kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4757    int advice)
4758{
4759	struct fadvise_info *fa, *new;
4760	struct file *fp;
4761	struct vnode *vp;
4762	cap_rights_t rights;
4763	off_t end;
4764	int error;
4765
4766	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4767		return (EINVAL);
4768	switch (advice) {
4769	case POSIX_FADV_SEQUENTIAL:
4770	case POSIX_FADV_RANDOM:
4771	case POSIX_FADV_NOREUSE:
4772		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4773		break;
4774	case POSIX_FADV_NORMAL:
4775	case POSIX_FADV_WILLNEED:
4776	case POSIX_FADV_DONTNEED:
4777		new = NULL;
4778		break;
4779	default:
4780		return (EINVAL);
4781	}
4782	/* XXX: CAP_POSIX_FADVISE? */
4783	error = fget(td, fd, cap_rights_init(&rights), &fp);
4784	if (error != 0)
4785		goto out;
4786	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4787		error = ESPIPE;
4788		goto out;
4789	}
4790	if (fp->f_type != DTYPE_VNODE) {
4791		error = ENODEV;
4792		goto out;
4793	}
4794	vp = fp->f_vnode;
4795	if (vp->v_type != VREG) {
4796		error = ENODEV;
4797		goto out;
4798	}
4799	if (len == 0)
4800		end = OFF_MAX;
4801	else
4802		end = offset + len - 1;
4803	switch (advice) {
4804	case POSIX_FADV_SEQUENTIAL:
4805	case POSIX_FADV_RANDOM:
4806	case POSIX_FADV_NOREUSE:
4807		/*
4808		 * Try to merge any existing non-standard region with
4809		 * this new region if possible, otherwise create a new
4810		 * non-standard region for this request.
4811		 */
4812		mtx_pool_lock(mtxpool_sleep, fp);
4813		fa = fp->f_advice;
4814		if (fa != NULL && fa->fa_advice == advice &&
4815		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4816		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4817		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4818			if (offset < fa->fa_start)
4819				fa->fa_start = offset;
4820			if (end > fa->fa_end)
4821				fa->fa_end = end;
4822		} else {
4823			new->fa_advice = advice;
4824			new->fa_start = offset;
4825			new->fa_end = end;
4826			new->fa_prevstart = 0;
4827			new->fa_prevend = 0;
4828			fp->f_advice = new;
4829			new = fa;
4830		}
4831		mtx_pool_unlock(mtxpool_sleep, fp);
4832		break;
4833	case POSIX_FADV_NORMAL:
4834		/*
4835		 * If a the "normal" region overlaps with an existing
4836		 * non-standard region, trim or remove the
4837		 * non-standard region.
4838		 */
4839		mtx_pool_lock(mtxpool_sleep, fp);
4840		fa = fp->f_advice;
4841		if (fa != NULL) {
4842			if (offset <= fa->fa_start && end >= fa->fa_end) {
4843				new = fa;
4844				fp->f_advice = NULL;
4845			} else if (offset <= fa->fa_start &&
4846			    end >= fa->fa_start)
4847				fa->fa_start = end + 1;
4848			else if (offset <= fa->fa_end && end >= fa->fa_end)
4849				fa->fa_end = offset - 1;
4850			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4851				/*
4852				 * If the "normal" region is a middle
4853				 * portion of the existing
4854				 * non-standard region, just remove
4855				 * the whole thing rather than picking
4856				 * one side or the other to
4857				 * preserve.
4858				 */
4859				new = fa;
4860				fp->f_advice = NULL;
4861			}
4862		}
4863		mtx_pool_unlock(mtxpool_sleep, fp);
4864		break;
4865	case POSIX_FADV_WILLNEED:
4866	case POSIX_FADV_DONTNEED:
4867		error = VOP_ADVISE(vp, offset, end, advice);
4868		break;
4869	}
4870out:
4871	if (fp != NULL)
4872		fdrop(fp, td);
4873	free(new, M_FADVISE);
4874	return (error);
4875}
4876
4877int
4878sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4879{
4880
4881	td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
4882	    uap->len, uap->advice);
4883	return (0);
4884}
4885