vfs_syscalls.c revision 289798
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_syscalls.c 289798 2015-10-23 07:40:43Z avg $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_kdtrace.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/capsicum.h>
50#include <sys/disk.h>
51#include <sys/sysent.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/mutex.h>
55#include <sys/sysproto.h>
56#include <sys/namei.h>
57#include <sys/filedesc.h>
58#include <sys/kernel.h>
59#include <sys/fcntl.h>
60#include <sys/file.h>
61#include <sys/filio.h>
62#include <sys/limits.h>
63#include <sys/linker.h>
64#include <sys/rwlock.h>
65#include <sys/sdt.h>
66#include <sys/stat.h>
67#include <sys/sx.h>
68#include <sys/unistd.h>
69#include <sys/vnode.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/dirent.h>
73#include <sys/jail.h>
74#include <sys/syscallsubr.h>
75#include <sys/sysctl.h>
76#ifdef KTRACE
77#include <sys/ktrace.h>
78#endif
79
80#include <machine/stdarg.h>
81
82#include <security/audit/audit.h>
83#include <security/mac/mac_framework.h>
84
85#include <vm/vm.h>
86#include <vm/vm_object.h>
87#include <vm/vm_page.h>
88#include <vm/uma.h>
89
90#include <ufs/ufs/quota.h>
91
/* Malloc type M_FADVISE, described as holding posix_fadvise(2) information. */
92MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
93
/*
 * Statically defined tracing: the "vfs" provider and two stat probes,
 * "mode" and "reg", each declared with a "char *" and an "int" argument.
 */
94SDT_PROVIDER_DEFINE(vfs);
95SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
96SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
97
/*
 * Forward declarations of helpers private to this file (all static), so
 * that they can be used before the point where they are defined.
 */
98static int chroot_refuse_vdir_fds(struct filedesc *fdp);
99static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
100static int kern_chflags(struct thread *td, const char *path,
101    enum uio_seg pathseg, u_long flags);
102static int kern_chflagsat(struct thread *td, int fd, const char *path,
103    enum uio_seg pathseg, u_long flags, int atflag);
104static int setfflags(struct thread *td, struct vnode *, u_long);
105static int setutimes(struct thread *td, struct vnode *,
106    const struct timespec *, int, int);
107static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
108    struct thread *td);
109
110/*
111 * The module initialization routine for POSIX asynchronous I/O will
112 * set this to the version of AIO that it implements.  (Zero means
113 * that it is not implemented.)  This value is used here by pathconf()
114 * and in kern_descrip.c by fpathconf().
115 */
116int async_io_version;
117
118/*
119 * Sync each mounted filesystem.
120 */
121#ifndef _SYS_SYSPROTO_H_
122struct sync_args {
123	int     dummy;
124};
125#endif
126/* ARGSUSED */
127int
128sys_sync(td, uap)
129	struct thread *td;
130	struct sync_args *uap;
131{
132	struct mount *mp, *nmp;
133	int save;
134
135	mtx_lock(&mountlist_mtx);
136	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
137		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
138			nmp = TAILQ_NEXT(mp, mnt_list);
139			continue;
140		}
141		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
142		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
143			save = curthread_pflags_set(TDP_SYNCIO);
144			vfs_msync(mp, MNT_NOWAIT);
145			VFS_SYNC(mp, MNT_NOWAIT);
146			curthread_pflags_restore(save);
147			vn_finished_write(mp);
148		}
149		mtx_lock(&mountlist_mtx);
150		nmp = TAILQ_NEXT(mp, mnt_list);
151		vfs_unbusy(mp);
152	}
153	mtx_unlock(&mountlist_mtx);
154	return (0);
155}
156
157/*
158 * Change filesystem quotas.
159 */
160#ifndef _SYS_SYSPROTO_H_
161struct quotactl_args {
162	char *path;
163	int cmd;
164	int uid;
165	caddr_t arg;
166};
167#endif
168int
169sys_quotactl(td, uap)
170	struct thread *td;
171	register struct quotactl_args /* {
172		char *path;
173		int cmd;
174		int uid;
175		caddr_t arg;
176	} */ *uap;
177{
178	struct mount *mp;
179	struct nameidata nd;
180	int error;
181
182	AUDIT_ARG_CMD(uap->cmd);
183	AUDIT_ARG_UID(uap->uid);
184	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
185		return (EPERM);
186	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
187	    uap->path, td);
188	if ((error = namei(&nd)) != 0)
189		return (error);
190	NDFREE(&nd, NDF_ONLY_PNBUF);
191	mp = nd.ni_vp->v_mount;
192	vfs_ref(mp);
193	vput(nd.ni_vp);
194	error = vfs_busy(mp, 0);
195	vfs_rel(mp);
196	if (error != 0)
197		return (error);
198	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
199
200	/*
201	 * Since quota on operation typically needs to open quota
202	 * file, the Q_QUOTAON handler needs to unbusy the mount point
203	 * before calling into namei.  Otherwise, unmount might be
204	 * started between two vfs_busy() invocations (first is our,
205	 * second is from mount point cross-walk code in lookup()),
206	 * causing deadlock.
207	 *
208	 * Require that Q_QUOTAON handles the vfs_busy() reference on
209	 * its own, always returning with ubusied mount point.
210	 */
211	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
212		vfs_unbusy(mp);
213	return (error);
214}
215
216/*
217 * Used by statfs conversion routines to scale the block size up if
218 * necessary so that all of the block counts are <= 'max_size'.  Note
219 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
220 * value of 'n'.
221 */
222void
223statfs_scale_blocks(struct statfs *sf, long max_size)
224{
225	uint64_t count;
226	int shift;
227
228	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
229
230	/*
231	 * Attempt to scale the block counts to give a more accurate
232	 * overview to userland of the ratio of free space to used
233	 * space.  To do this, find the largest block count and compute
234	 * a divisor that lets it fit into a signed integer <= max_size.
235	 */
236	if (sf->f_bavail < 0)
237		count = -sf->f_bavail;
238	else
239		count = sf->f_bavail;
240	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
241	if (count <= max_size)
242		return;
243
244	count >>= flsl(max_size);
245	shift = 0;
246	while (count > 0) {
247		shift++;
248		count >>=1;
249	}
250
251	sf->f_bsize <<= shift;
252	sf->f_blocks >>= shift;
253	sf->f_bfree >>= shift;
254	sf->f_bavail >>= shift;
255}
256
257/*
258 * Get filesystem statistics.
259 */
260#ifndef _SYS_SYSPROTO_H_
261struct statfs_args {
262	char *path;
263	struct statfs *buf;
264};
265#endif
266int
267sys_statfs(td, uap)
268	struct thread *td;
269	register struct statfs_args /* {
270		char *path;
271		struct statfs *buf;
272	} */ *uap;
273{
274	struct statfs sf;
275	int error;
276
277	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
278	if (error == 0)
279		error = copyout(&sf, uap->buf, sizeof(sf));
280	return (error);
281}
282
283int
284kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
285    struct statfs *buf)
286{
287	struct mount *mp;
288	struct statfs *sp, sb;
289	struct nameidata nd;
290	int error;
291
292	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
293	    pathseg, path, td);
294	error = namei(&nd);
295	if (error != 0)
296		return (error);
297	mp = nd.ni_vp->v_mount;
298	vfs_ref(mp);
299	NDFREE(&nd, NDF_ONLY_PNBUF);
300	vput(nd.ni_vp);
301	error = vfs_busy(mp, 0);
302	vfs_rel(mp);
303	if (error != 0)
304		return (error);
305#ifdef MAC
306	error = mac_mount_check_stat(td->td_ucred, mp);
307	if (error != 0)
308		goto out;
309#endif
310	/*
311	 * Set these in case the underlying filesystem fails to do so.
312	 */
313	sp = &mp->mnt_stat;
314	sp->f_version = STATFS_VERSION;
315	sp->f_namemax = NAME_MAX;
316	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
317	error = VFS_STATFS(mp, sp);
318	if (error != 0)
319		goto out;
320	if (priv_check(td, PRIV_VFS_GENERATION)) {
321		bcopy(sp, &sb, sizeof(sb));
322		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
323		prison_enforce_statfs(td->td_ucred, mp, &sb);
324		sp = &sb;
325	}
326	*buf = *sp;
327out:
328	vfs_unbusy(mp);
329	return (error);
330}
331
332/*
333 * Get filesystem statistics.
334 */
335#ifndef _SYS_SYSPROTO_H_
336struct fstatfs_args {
337	int fd;
338	struct statfs *buf;
339};
340#endif
341int
342sys_fstatfs(td, uap)
343	struct thread *td;
344	register struct fstatfs_args /* {
345		int fd;
346		struct statfs *buf;
347	} */ *uap;
348{
349	struct statfs sf;
350	int error;
351
352	error = kern_fstatfs(td, uap->fd, &sf);
353	if (error == 0)
354		error = copyout(&sf, uap->buf, sizeof(sf));
355	return (error);
356}
357
358int
359kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
360{
361	struct file *fp;
362	struct mount *mp;
363	struct statfs *sp, sb;
364	struct vnode *vp;
365	cap_rights_t rights;
366	int error;
367
368	AUDIT_ARG_FD(fd);
369	error = getvnode(td->td_proc->p_fd, fd,
370	    cap_rights_init(&rights, CAP_FSTATFS), &fp);
371	if (error != 0)
372		return (error);
373	vp = fp->f_vnode;
374	vn_lock(vp, LK_SHARED | LK_RETRY);
375#ifdef AUDIT
376	AUDIT_ARG_VNODE1(vp);
377#endif
378	mp = vp->v_mount;
379	if (mp)
380		vfs_ref(mp);
381	VOP_UNLOCK(vp, 0);
382	fdrop(fp, td);
383	if (mp == NULL) {
384		error = EBADF;
385		goto out;
386	}
387	error = vfs_busy(mp, 0);
388	vfs_rel(mp);
389	if (error != 0)
390		return (error);
391#ifdef MAC
392	error = mac_mount_check_stat(td->td_ucred, mp);
393	if (error != 0)
394		goto out;
395#endif
396	/*
397	 * Set these in case the underlying filesystem fails to do so.
398	 */
399	sp = &mp->mnt_stat;
400	sp->f_version = STATFS_VERSION;
401	sp->f_namemax = NAME_MAX;
402	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
403	error = VFS_STATFS(mp, sp);
404	if (error != 0)
405		goto out;
406	if (priv_check(td, PRIV_VFS_GENERATION)) {
407		bcopy(sp, &sb, sizeof(sb));
408		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
409		prison_enforce_statfs(td->td_ucred, mp, &sb);
410		sp = &sb;
411	}
412	*buf = *sp;
413out:
414	if (mp)
415		vfs_unbusy(mp);
416	return (error);
417}
418
419/*
420 * Get statistics on all filesystems.
421 */
422#ifndef _SYS_SYSPROTO_H_
423struct getfsstat_args {
424	struct statfs *buf;
425	long bufsize;
426	int flags;
427};
428#endif
429int
430sys_getfsstat(td, uap)
431	struct thread *td;
432	register struct getfsstat_args /* {
433		struct statfs *buf;
434		long bufsize;
435		int flags;
436	} */ *uap;
437{
438
439	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
440	    uap->flags));
441}
442
443/*
444 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
445 *	The caller is responsible for freeing memory which will be allocated
446 *	in '*buf'.
447 */
448int
449kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
450    enum uio_seg bufseg, int flags)
451{
452	struct mount *mp, *nmp;
453	struct statfs *sfsp, *sp, sb;
454	size_t count, maxcount;
455	int error;
456
457	maxcount = bufsize / sizeof(struct statfs);
458	if (bufsize == 0)
459		sfsp = NULL;
460	else if (bufseg == UIO_USERSPACE)
461		sfsp = *buf;
462	else /* if (bufseg == UIO_SYSSPACE) */ {
463		count = 0;
464		mtx_lock(&mountlist_mtx);
465		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
466			count++;
467		}
468		mtx_unlock(&mountlist_mtx);
469		if (maxcount > count)
470			maxcount = count;
471		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
472		    M_WAITOK);
473	}
474	count = 0;
475	mtx_lock(&mountlist_mtx);
476	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
477		if (prison_canseemount(td->td_ucred, mp) != 0) {
478			nmp = TAILQ_NEXT(mp, mnt_list);
479			continue;
480		}
481#ifdef MAC
482		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
483			nmp = TAILQ_NEXT(mp, mnt_list);
484			continue;
485		}
486#endif
487		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
488			nmp = TAILQ_NEXT(mp, mnt_list);
489			continue;
490		}
491		if (sfsp && count < maxcount) {
492			sp = &mp->mnt_stat;
493			/*
494			 * Set these in case the underlying filesystem
495			 * fails to do so.
496			 */
497			sp->f_version = STATFS_VERSION;
498			sp->f_namemax = NAME_MAX;
499			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
500			/*
501			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
502			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
503			 * overrides MNT_WAIT.
504			 */
505			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
506			    (flags & MNT_WAIT)) &&
507			    (error = VFS_STATFS(mp, sp))) {
508				mtx_lock(&mountlist_mtx);
509				nmp = TAILQ_NEXT(mp, mnt_list);
510				vfs_unbusy(mp);
511				continue;
512			}
513			if (priv_check(td, PRIV_VFS_GENERATION)) {
514				bcopy(sp, &sb, sizeof(sb));
515				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
516				prison_enforce_statfs(td->td_ucred, mp, &sb);
517				sp = &sb;
518			}
519			if (bufseg == UIO_SYSSPACE)
520				bcopy(sp, sfsp, sizeof(*sp));
521			else /* if (bufseg == UIO_USERSPACE) */ {
522				error = copyout(sp, sfsp, sizeof(*sp));
523				if (error != 0) {
524					vfs_unbusy(mp);
525					return (error);
526				}
527			}
528			sfsp++;
529		}
530		count++;
531		mtx_lock(&mountlist_mtx);
532		nmp = TAILQ_NEXT(mp, mnt_list);
533		vfs_unbusy(mp);
534	}
535	mtx_unlock(&mountlist_mtx);
536	if (sfsp && count > maxcount)
537		td->td_retval[0] = maxcount;
538	else
539		td->td_retval[0] = count;
540	return (0);
541}
542
543#ifdef COMPAT_FREEBSD4
544/*
545 * Get old format filesystem statistics.
546 */
547static void cvtstatfs(struct statfs *, struct ostatfs *);
548
549#ifndef _SYS_SYSPROTO_H_
550struct freebsd4_statfs_args {
551	char *path;
552	struct ostatfs *buf;
553};
554#endif
555int
556freebsd4_statfs(td, uap)
557	struct thread *td;
558	struct freebsd4_statfs_args /* {
559		char *path;
560		struct ostatfs *buf;
561	} */ *uap;
562{
563	struct ostatfs osb;
564	struct statfs sf;
565	int error;
566
567	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
568	if (error != 0)
569		return (error);
570	cvtstatfs(&sf, &osb);
571	return (copyout(&osb, uap->buf, sizeof(osb)));
572}
573
574/*
575 * Get filesystem statistics.
576 */
577#ifndef _SYS_SYSPROTO_H_
578struct freebsd4_fstatfs_args {
579	int fd;
580	struct ostatfs *buf;
581};
582#endif
583int
584freebsd4_fstatfs(td, uap)
585	struct thread *td;
586	struct freebsd4_fstatfs_args /* {
587		int fd;
588		struct ostatfs *buf;
589	} */ *uap;
590{
591	struct ostatfs osb;
592	struct statfs sf;
593	int error;
594
595	error = kern_fstatfs(td, uap->fd, &sf);
596	if (error != 0)
597		return (error);
598	cvtstatfs(&sf, &osb);
599	return (copyout(&osb, uap->buf, sizeof(osb)));
600}
601
602/*
603 * Get statistics on all filesystems.
604 */
605#ifndef _SYS_SYSPROTO_H_
606struct freebsd4_getfsstat_args {
607	struct ostatfs *buf;
608	long bufsize;
609	int flags;
610};
611#endif
612int
613freebsd4_getfsstat(td, uap)
614	struct thread *td;
615	register struct freebsd4_getfsstat_args /* {
616		struct ostatfs *buf;
617		long bufsize;
618		int flags;
619	} */ *uap;
620{
621	struct statfs *buf, *sp;
622	struct ostatfs osb;
623	size_t count, size;
624	int error;
625
626	count = uap->bufsize / sizeof(struct ostatfs);
627	size = count * sizeof(struct statfs);
628	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
629	if (size > 0) {
630		count = td->td_retval[0];
631		sp = buf;
632		while (count > 0 && error == 0) {
633			cvtstatfs(sp, &osb);
634			error = copyout(&osb, uap->buf, sizeof(osb));
635			sp++;
636			uap->buf++;
637			count--;
638		}
639		free(buf, M_TEMP);
640	}
641	return (error);
642}
643
644/*
645 * Implement fstatfs() for (NFS) file handles.
646 */
647#ifndef _SYS_SYSPROTO_H_
648struct freebsd4_fhstatfs_args {
649	struct fhandle *u_fhp;
650	struct ostatfs *buf;
651};
652#endif
653int
654freebsd4_fhstatfs(td, uap)
655	struct thread *td;
656	struct freebsd4_fhstatfs_args /* {
657		struct fhandle *u_fhp;
658		struct ostatfs *buf;
659	} */ *uap;
660{
661	struct ostatfs osb;
662	struct statfs sf;
663	fhandle_t fh;
664	int error;
665
666	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
667	if (error != 0)
668		return (error);
669	error = kern_fhstatfs(td, fh, &sf);
670	if (error != 0)
671		return (error);
672	cvtstatfs(&sf, &osb);
673	return (copyout(&osb, uap->buf, sizeof(osb)));
674}
675
676/*
677 * Convert a new format statfs structure to an old format statfs structure.
678 */
679static void
680cvtstatfs(nsp, osp)
681	struct statfs *nsp;
682	struct ostatfs *osp;
683{
684
685	statfs_scale_blocks(nsp, LONG_MAX);
686	bzero(osp, sizeof(*osp));
687	osp->f_bsize = nsp->f_bsize;
688	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
689	osp->f_blocks = nsp->f_blocks;
690	osp->f_bfree = nsp->f_bfree;
691	osp->f_bavail = nsp->f_bavail;
692	osp->f_files = MIN(nsp->f_files, LONG_MAX);
693	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
694	osp->f_owner = nsp->f_owner;
695	osp->f_type = nsp->f_type;
696	osp->f_flags = nsp->f_flags;
697	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
698	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
699	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
700	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
701	strlcpy(osp->f_fstypename, nsp->f_fstypename,
702	    MIN(MFSNAMELEN, OMFSNAMELEN));
703	strlcpy(osp->f_mntonname, nsp->f_mntonname,
704	    MIN(MNAMELEN, OMNAMELEN));
705	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
706	    MIN(MNAMELEN, OMNAMELEN));
707	osp->f_fsid = nsp->f_fsid;
708}
709#endif /* COMPAT_FREEBSD4 */
710
711/*
712 * Change current working directory to a given file descriptor.
713 */
714#ifndef _SYS_SYSPROTO_H_
715struct fchdir_args {
716	int	fd;
717};
718#endif
719int
720sys_fchdir(td, uap)
721	struct thread *td;
722	struct fchdir_args /* {
723		int fd;
724	} */ *uap;
725{
726	register struct filedesc *fdp = td->td_proc->p_fd;
727	struct vnode *vp, *tdp, *vpold;
728	struct mount *mp;
729	struct file *fp;
730	cap_rights_t rights;
731	int error;
732
733	AUDIT_ARG_FD(uap->fd);
734	error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
735	    &fp);
736	if (error != 0)
737		return (error);
738	vp = fp->f_vnode;
739	VREF(vp);
740	fdrop(fp, td);
741	vn_lock(vp, LK_SHARED | LK_RETRY);
742	AUDIT_ARG_VNODE1(vp);
743	error = change_dir(vp, td);
744	while (!error && (mp = vp->v_mountedhere) != NULL) {
745		if (vfs_busy(mp, 0))
746			continue;
747		error = VFS_ROOT(mp, LK_SHARED, &tdp);
748		vfs_unbusy(mp);
749		if (error != 0)
750			break;
751		vput(vp);
752		vp = tdp;
753	}
754	if (error != 0) {
755		vput(vp);
756		return (error);
757	}
758	VOP_UNLOCK(vp, 0);
759	FILEDESC_XLOCK(fdp);
760	vpold = fdp->fd_cdir;
761	fdp->fd_cdir = vp;
762	FILEDESC_XUNLOCK(fdp);
763	vrele(vpold);
764	return (0);
765}
766
767/*
768 * Change current working directory (``.'').
769 */
770#ifndef _SYS_SYSPROTO_H_
771struct chdir_args {
772	char	*path;
773};
774#endif
775int
776sys_chdir(td, uap)
777	struct thread *td;
778	struct chdir_args /* {
779		char *path;
780	} */ *uap;
781{
782
783	return (kern_chdir(td, uap->path, UIO_USERSPACE));
784}
785
786int
787kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
788{
789	register struct filedesc *fdp = td->td_proc->p_fd;
790	struct nameidata nd;
791	struct vnode *vp;
792	int error;
793
794	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
795	    pathseg, path, td);
796	if ((error = namei(&nd)) != 0)
797		return (error);
798	if ((error = change_dir(nd.ni_vp, td)) != 0) {
799		vput(nd.ni_vp);
800		NDFREE(&nd, NDF_ONLY_PNBUF);
801		return (error);
802	}
803	VOP_UNLOCK(nd.ni_vp, 0);
804	NDFREE(&nd, NDF_ONLY_PNBUF);
805	FILEDESC_XLOCK(fdp);
806	vp = fdp->fd_cdir;
807	fdp->fd_cdir = nd.ni_vp;
808	FILEDESC_XUNLOCK(fdp);
809	vrele(vp);
810	return (0);
811}
812
813/*
814 * Helper function for raised chroot(2) security function:  Refuse if
815 * any filedescriptors are open directories.
816 */
817static int
818chroot_refuse_vdir_fds(fdp)
819	struct filedesc *fdp;
820{
821	struct vnode *vp;
822	struct file *fp;
823	int fd;
824
825	FILEDESC_LOCK_ASSERT(fdp);
826
827	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
828		fp = fget_locked(fdp, fd);
829		if (fp == NULL)
830			continue;
831		if (fp->f_type == DTYPE_VNODE) {
832			vp = fp->f_vnode;
833			if (vp->v_type == VDIR)
834				return (EPERM);
835		}
836	}
837	return (0);
838}
839
840/*
841 * This sysctl determines if we will allow a process to chroot(2) if it
842 * has a directory open:
843 *	0: disallowed for all processes.
844 *	1: allowed for processes that were not already chroot(2)'ed.
845 *	2: allowed for all processes.
846 */
847
/* Current policy value; initialized to 1. */
848static int chroot_allow_open_directories = 1;
849
/* Registered read-write (CTLFLAG_RW) as an integer under the _kern node. */
850SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
851     &chroot_allow_open_directories, 0,
852     "Allow a process to chroot(2) if it has a directory open");
853
854/*
855 * Change notion of root (``/'') directory.
856 */
857#ifndef _SYS_SYSPROTO_H_
858struct chroot_args {
859	char	*path;
860};
861#endif
862int
863sys_chroot(td, uap)
864	struct thread *td;
865	struct chroot_args /* {
866		char *path;
867	} */ *uap;
868{
869	struct nameidata nd;
870	int error;
871
872	error = priv_check(td, PRIV_VFS_CHROOT);
873	if (error != 0)
874		return (error);
875	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
876	    UIO_USERSPACE, uap->path, td);
877	error = namei(&nd);
878	if (error != 0)
879		goto error;
880	error = change_dir(nd.ni_vp, td);
881	if (error != 0)
882		goto e_vunlock;
883#ifdef MAC
884	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
885	if (error != 0)
886		goto e_vunlock;
887#endif
888	VOP_UNLOCK(nd.ni_vp, 0);
889	error = change_root(nd.ni_vp, td);
890	vrele(nd.ni_vp);
891	NDFREE(&nd, NDF_ONLY_PNBUF);
892	return (error);
893e_vunlock:
894	vput(nd.ni_vp);
895error:
896	NDFREE(&nd, NDF_ONLY_PNBUF);
897	return (error);
898}
899
900/*
901 * Common routine for chroot and chdir.  Callers must provide a locked vnode
902 * instance.
903 */
904int
905change_dir(vp, td)
906	struct vnode *vp;
907	struct thread *td;
908{
909#ifdef MAC
910	int error;
911#endif
912
913	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
914	if (vp->v_type != VDIR)
915		return (ENOTDIR);
916#ifdef MAC
917	error = mac_vnode_check_chdir(td->td_ucred, vp);
918	if (error != 0)
919		return (error);
920#endif
921	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
922}
923
924/*
925 * Common routine for kern_chroot() and jail_attach().  The caller is
926 * responsible for invoking priv_check() and mac_vnode_check_chroot() to
927 * authorize this operation.
928 */
929int
930change_root(vp, td)
931	struct vnode *vp;
932	struct thread *td;
933{
934	struct filedesc *fdp;
935	struct vnode *oldvp;
936	int error;
937
938	fdp = td->td_proc->p_fd;
939	FILEDESC_XLOCK(fdp);
940	if (chroot_allow_open_directories == 0 ||
941	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
942		error = chroot_refuse_vdir_fds(fdp);
943		if (error != 0) {
944			FILEDESC_XUNLOCK(fdp);
945			return (error);
946		}
947	}
948	oldvp = fdp->fd_rdir;
949	fdp->fd_rdir = vp;
950	VREF(fdp->fd_rdir);
951	if (!fdp->fd_jdir) {
952		fdp->fd_jdir = vp;
953		VREF(fdp->fd_jdir);
954	}
955	FILEDESC_XUNLOCK(fdp);
956	vrele(oldvp);
957	return (0);
958}
959
960static __inline void
961flags_to_rights(int flags, cap_rights_t *rightsp)
962{
963
964	if (flags & O_EXEC) {
965		cap_rights_set(rightsp, CAP_FEXECVE);
966	} else {
967		switch ((flags & O_ACCMODE)) {
968		case O_RDONLY:
969			cap_rights_set(rightsp, CAP_READ);
970			break;
971		case O_RDWR:
972			cap_rights_set(rightsp, CAP_READ);
973			/* FALLTHROUGH */
974		case O_WRONLY:
975			cap_rights_set(rightsp, CAP_WRITE);
976			if (!(flags & (O_APPEND | O_TRUNC)))
977				cap_rights_set(rightsp, CAP_SEEK);
978			break;
979		}
980	}
981
982	if (flags & O_CREAT)
983		cap_rights_set(rightsp, CAP_CREATE);
984
985	if (flags & O_TRUNC)
986		cap_rights_set(rightsp, CAP_FTRUNCATE);
987
988	if (flags & (O_SYNC | O_FSYNC))
989		cap_rights_set(rightsp, CAP_FSYNC);
990
991	if (flags & (O_EXLOCK | O_SHLOCK))
992		cap_rights_set(rightsp, CAP_FLOCK);
993}
994
995/*
996 * Check permissions, allocate an open file structure, and call the device
997 * open routine if any.
998 */
999#ifndef _SYS_SYSPROTO_H_
1000struct open_args {
1001	char	*path;
1002	int	flags;
1003	int	mode;
1004};
1005#endif
1006int
1007sys_open(td, uap)
1008	struct thread *td;
1009	register struct open_args /* {
1010		char *path;
1011		int flags;
1012		int mode;
1013	} */ *uap;
1014{
1015
1016	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1017}
1018
1019#ifndef _SYS_SYSPROTO_H_
1020struct openat_args {
1021	int	fd;
1022	char	*path;
1023	int	flag;
1024	int	mode;
1025};
1026#endif
1027int
1028sys_openat(struct thread *td, struct openat_args *uap)
1029{
1030
1031	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1032	    uap->mode));
1033}
1034
1035int
1036kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1037    int mode)
1038{
1039
1040	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1041}
1042
1043int
1044kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1045    int flags, int mode)
1046{
1047	struct proc *p = td->td_proc;
1048	struct filedesc *fdp = p->p_fd;
1049	struct file *fp;
1050	struct vnode *vp;
1051	struct nameidata nd;
1052	cap_rights_t rights;
1053	int cmode, error, indx;
1054
1055	indx = -1;
1056
1057	AUDIT_ARG_FFLAGS(flags);
1058	AUDIT_ARG_MODE(mode);
1059	/* XXX: audit dirfd */
1060	cap_rights_init(&rights, CAP_LOOKUP);
1061	flags_to_rights(flags, &rights);
1062	/*
1063	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1064	 * may be specified.
1065	 */
1066	if (flags & O_EXEC) {
1067		if (flags & O_ACCMODE)
1068			return (EINVAL);
1069	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
1070		return (EINVAL);
1071	} else {
1072		flags = FFLAGS(flags);
1073	}
1074
1075	/*
1076	 * Allocate the file descriptor, but don't install a descriptor yet.
1077	 */
1078	error = falloc_noinstall(td, &fp);
1079	if (error != 0)
1080		return (error);
1081	/*
1082	 * An extra reference on `fp' has been held for us by
1083	 * falloc_noinstall().
1084	 */
1085	/* Set the flags early so the finit in devfs can pick them up. */
1086	fp->f_flag = flags & FMASK;
1087	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1088	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
1089	    &rights, td);
1090	td->td_dupfd = -1;		/* XXX check for fdopen */
1091	error = vn_open(&nd, &flags, cmode, fp);
1092	if (error != 0) {
1093		/*
1094		 * If the vn_open replaced the method vector, something
1095		 * wonderous happened deep below and we just pass it up
1096		 * pretending we know what we do.
1097		 */
1098		if (error == ENXIO && fp->f_ops != &badfileops)
1099			goto success;
1100
1101		/*
1102		 * Handle special fdopen() case. bleh.
1103		 *
1104		 * Don't do this for relative (capability) lookups; we don't
1105		 * understand exactly what would happen, and we don't think
1106		 * that it ever should.
1107		 */
1108		if (nd.ni_strictrelative == 0 &&
1109		    (error == ENODEV || error == ENXIO) &&
1110		    td->td_dupfd >= 0) {
1111			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
1112			    &indx);
1113			if (error == 0)
1114				goto success;
1115		}
1116
1117		goto bad;
1118	}
1119	td->td_dupfd = 0;
1120	NDFREE(&nd, NDF_ONLY_PNBUF);
1121	vp = nd.ni_vp;
1122
1123	/*
1124	 * Store the vnode, for any f_type. Typically, the vnode use
1125	 * count is decremented by direct call to vn_closefile() for
1126	 * files that switched type in the cdevsw fdopen() method.
1127	 */
1128	fp->f_vnode = vp;
1129	/*
1130	 * If the file wasn't claimed by devfs bind it to the normal
1131	 * vnode operations here.
1132	 */
1133	if (fp->f_ops == &badfileops) {
1134		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1135		fp->f_seqcount = 1;
1136		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
1137		    DTYPE_VNODE, vp, &vnops);
1138	}
1139
1140	VOP_UNLOCK(vp, 0);
1141	if (flags & O_TRUNC) {
1142		error = fo_truncate(fp, 0, td->td_ucred, td);
1143		if (error != 0)
1144			goto bad;
1145	}
1146success:
1147	/*
1148	 * If we haven't already installed the FD (for dupfdopen), do so now.
1149	 */
1150	if (indx == -1) {
1151		struct filecaps *fcaps;
1152
1153#ifdef CAPABILITIES
1154		if (nd.ni_strictrelative == 1)
1155			fcaps = &nd.ni_filecaps;
1156		else
1157#endif
1158			fcaps = NULL;
1159		error = finstall(td, fp, &indx, flags, fcaps);
1160		/* On success finstall() consumes fcaps. */
1161		if (error != 0) {
1162			filecaps_free(&nd.ni_filecaps);
1163			goto bad;
1164		}
1165	} else {
1166		filecaps_free(&nd.ni_filecaps);
1167	}
1168
1169	/*
1170	 * Release our private reference, leaving the one associated with
1171	 * the descriptor table intact.
1172	 */
1173	fdrop(fp, td);
1174	td->td_retval[0] = indx;
1175	return (0);
1176bad:
1177	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1178	fdrop(fp, td);
1179	return (error);
1180}
1181
1182#ifdef COMPAT_43
1183/*
1184 * Create a file.
1185 */
1186#ifndef _SYS_SYSPROTO_H_
1187struct ocreat_args {
1188	char	*path;
1189	int	mode;
1190};
1191#endif
1192int
1193ocreat(td, uap)
1194	struct thread *td;
1195	register struct ocreat_args /* {
1196		char *path;
1197		int mode;
1198	} */ *uap;
1199{
1200
1201	return (kern_open(td, uap->path, UIO_USERSPACE,
1202	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1203}
1204#endif /* COMPAT_43 */
1205
1206/*
1207 * Create a special file.
1208 */
1209#ifndef _SYS_SYSPROTO_H_
1210struct mknod_args {
1211	char	*path;
1212	int	mode;
1213	int	dev;
1214};
1215#endif
1216int
1217sys_mknod(td, uap)
1218	struct thread *td;
1219	register struct mknod_args /* {
1220		char *path;
1221		int mode;
1222		int dev;
1223	} */ *uap;
1224{
1225
1226	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1227}
1228
1229#ifndef _SYS_SYSPROTO_H_
1230struct mknodat_args {
1231	int	fd;
1232	char	*path;
1233	mode_t	mode;
1234	dev_t	dev;
1235};
1236#endif
1237int
1238sys_mknodat(struct thread *td, struct mknodat_args *uap)
1239{
1240
1241	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1242	    uap->dev));
1243}
1244
1245int
1246kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1247    int dev)
1248{
1249
1250	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1251}
1252
1253int
1254kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1255    int mode, int dev)
1256{
1257	struct vnode *vp;
1258	struct mount *mp;
1259	struct vattr vattr;
1260	struct nameidata nd;
1261	cap_rights_t rights;
1262	int error, whiteout = 0;
1263
1264	AUDIT_ARG_MODE(mode);
1265	AUDIT_ARG_DEV(dev);
1266	switch (mode & S_IFMT) {
1267	case S_IFCHR:
1268	case S_IFBLK:
1269		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1270		break;
1271	case S_IFMT:
1272		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1273		break;
1274	case S_IFWHT:
1275		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1276		break;
1277	case S_IFIFO:
1278		if (dev == 0)
1279			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1280		/* FALLTHROUGH */
1281	default:
1282		error = EINVAL;
1283		break;
1284	}
1285	if (error != 0)
1286		return (error);
1287restart:
1288	bwillwrite();
1289	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1290	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
1291	    td);
1292	if ((error = namei(&nd)) != 0)
1293		return (error);
1294	vp = nd.ni_vp;
1295	if (vp != NULL) {
1296		NDFREE(&nd, NDF_ONLY_PNBUF);
1297		if (vp == nd.ni_dvp)
1298			vrele(nd.ni_dvp);
1299		else
1300			vput(nd.ni_dvp);
1301		vrele(vp);
1302		return (EEXIST);
1303	} else {
1304		VATTR_NULL(&vattr);
1305		vattr.va_mode = (mode & ALLPERMS) &
1306		    ~td->td_proc->p_fd->fd_cmask;
1307		vattr.va_rdev = dev;
1308		whiteout = 0;
1309
1310		switch (mode & S_IFMT) {
1311		case S_IFMT:	/* used by badsect to flag bad sectors */
1312			vattr.va_type = VBAD;
1313			break;
1314		case S_IFCHR:
1315			vattr.va_type = VCHR;
1316			break;
1317		case S_IFBLK:
1318			vattr.va_type = VBLK;
1319			break;
1320		case S_IFWHT:
1321			whiteout = 1;
1322			break;
1323		default:
1324			panic("kern_mknod: invalid mode");
1325		}
1326	}
1327	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1328		NDFREE(&nd, NDF_ONLY_PNBUF);
1329		vput(nd.ni_dvp);
1330		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1331			return (error);
1332		goto restart;
1333	}
1334#ifdef MAC
1335	if (error == 0 && !whiteout)
1336		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1337		    &nd.ni_cnd, &vattr);
1338#endif
1339	if (error == 0) {
1340		if (whiteout)
1341			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1342		else {
1343			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1344						&nd.ni_cnd, &vattr);
1345			if (error == 0)
1346				vput(nd.ni_vp);
1347		}
1348	}
1349	NDFREE(&nd, NDF_ONLY_PNBUF);
1350	vput(nd.ni_dvp);
1351	vn_finished_write(mp);
1352	return (error);
1353}
1354
1355/*
1356 * Create a named pipe.
1357 */
1358#ifndef _SYS_SYSPROTO_H_
1359struct mkfifo_args {
1360	char	*path;
1361	int	mode;
1362};
1363#endif
1364int
1365sys_mkfifo(td, uap)
1366	struct thread *td;
1367	register struct mkfifo_args /* {
1368		char *path;
1369		int mode;
1370	} */ *uap;
1371{
1372
1373	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1374}
1375
1376#ifndef _SYS_SYSPROTO_H_
1377struct mkfifoat_args {
1378	int	fd;
1379	char	*path;
1380	mode_t	mode;
1381};
1382#endif
1383int
1384sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1385{
1386
1387	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1388	    uap->mode));
1389}
1390
1391int
1392kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1393{
1394
1395	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1396}
1397
1398int
1399kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1400    int mode)
1401{
1402	struct mount *mp;
1403	struct vattr vattr;
1404	struct nameidata nd;
1405	cap_rights_t rights;
1406	int error;
1407
1408	AUDIT_ARG_MODE(mode);
1409restart:
1410	bwillwrite();
1411	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1412	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
1413	    td);
1414	if ((error = namei(&nd)) != 0)
1415		return (error);
1416	if (nd.ni_vp != NULL) {
1417		NDFREE(&nd, NDF_ONLY_PNBUF);
1418		if (nd.ni_vp == nd.ni_dvp)
1419			vrele(nd.ni_dvp);
1420		else
1421			vput(nd.ni_dvp);
1422		vrele(nd.ni_vp);
1423		return (EEXIST);
1424	}
1425	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1426		NDFREE(&nd, NDF_ONLY_PNBUF);
1427		vput(nd.ni_dvp);
1428		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1429			return (error);
1430		goto restart;
1431	}
1432	VATTR_NULL(&vattr);
1433	vattr.va_type = VFIFO;
1434	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1435#ifdef MAC
1436	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1437	    &vattr);
1438	if (error != 0)
1439		goto out;
1440#endif
1441	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1442	if (error == 0)
1443		vput(nd.ni_vp);
1444#ifdef MAC
1445out:
1446#endif
1447	vput(nd.ni_dvp);
1448	vn_finished_write(mp);
1449	NDFREE(&nd, NDF_ONLY_PNBUF);
1450	return (error);
1451}
1452
1453/*
1454 * Make a hard file link.
1455 */
1456#ifndef _SYS_SYSPROTO_H_
1457struct link_args {
1458	char	*path;
1459	char	*link;
1460};
1461#endif
1462int
1463sys_link(td, uap)
1464	struct thread *td;
1465	register struct link_args /* {
1466		char *path;
1467		char *link;
1468	} */ *uap;
1469{
1470
1471	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1472}
1473
1474#ifndef _SYS_SYSPROTO_H_
1475struct linkat_args {
1476	int	fd1;
1477	char	*path1;
1478	int	fd2;
1479	char	*path2;
1480	int	flag;
1481};
1482#endif
1483int
1484sys_linkat(struct thread *td, struct linkat_args *uap)
1485{
1486	int flag;
1487
1488	flag = uap->flag;
1489	if (flag & ~AT_SYMLINK_FOLLOW)
1490		return (EINVAL);
1491
1492	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1493	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1494}
1495
1496int hardlink_check_uid = 0;
1497SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1498    &hardlink_check_uid, 0,
1499    "Unprivileged processes cannot create hard links to files owned by other "
1500    "users");
1501static int hardlink_check_gid = 0;
1502SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1503    &hardlink_check_gid, 0,
1504    "Unprivileged processes cannot create hard links to files owned by other "
1505    "groups");
1506
1507static int
1508can_hardlink(struct vnode *vp, struct ucred *cred)
1509{
1510	struct vattr va;
1511	int error;
1512
1513	if (!hardlink_check_uid && !hardlink_check_gid)
1514		return (0);
1515
1516	error = VOP_GETATTR(vp, &va, cred);
1517	if (error != 0)
1518		return (error);
1519
1520	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1521		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1522		if (error != 0)
1523			return (error);
1524	}
1525
1526	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1527		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1528		if (error != 0)
1529			return (error);
1530	}
1531
1532	return (0);
1533}
1534
1535int
1536kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1537{
1538
1539	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1540}
1541
1542int
1543kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1544    enum uio_seg segflg, int follow)
1545{
1546	struct vnode *vp;
1547	struct mount *mp;
1548	struct nameidata nd;
1549	cap_rights_t rights;
1550	int error;
1551
1552again:
1553	bwillwrite();
1554	NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
1555
1556	if ((error = namei(&nd)) != 0)
1557		return (error);
1558	NDFREE(&nd, NDF_ONLY_PNBUF);
1559	vp = nd.ni_vp;
1560	if (vp->v_type == VDIR) {
1561		vrele(vp);
1562		return (EPERM);		/* POSIX */
1563	}
1564	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 |
1565	    NOCACHE, segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT),
1566	    td);
1567	if ((error = namei(&nd)) == 0) {
1568		if (nd.ni_vp != NULL) {
1569			NDFREE(&nd, NDF_ONLY_PNBUF);
1570			if (nd.ni_dvp == nd.ni_vp)
1571				vrele(nd.ni_dvp);
1572			else
1573				vput(nd.ni_dvp);
1574			vrele(nd.ni_vp);
1575			vrele(vp);
1576			return (EEXIST);
1577		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
1578			/*
1579			 * Cross-device link.  No need to recheck
1580			 * vp->v_type, since it cannot change, except
1581			 * to VBAD.
1582			 */
1583			NDFREE(&nd, NDF_ONLY_PNBUF);
1584			vput(nd.ni_dvp);
1585			vrele(vp);
1586			return (EXDEV);
1587		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
1588			error = can_hardlink(vp, td->td_ucred);
1589#ifdef MAC
1590			if (error == 0)
1591				error = mac_vnode_check_link(td->td_ucred,
1592				    nd.ni_dvp, vp, &nd.ni_cnd);
1593#endif
1594			if (error != 0) {
1595				vput(vp);
1596				vput(nd.ni_dvp);
1597				NDFREE(&nd, NDF_ONLY_PNBUF);
1598				return (error);
1599			}
1600			error = vn_start_write(vp, &mp, V_NOWAIT);
1601			if (error != 0) {
1602				vput(vp);
1603				vput(nd.ni_dvp);
1604				NDFREE(&nd, NDF_ONLY_PNBUF);
1605				error = vn_start_write(NULL, &mp,
1606				    V_XSLEEP | PCATCH);
1607				if (error != 0)
1608					return (error);
1609				goto again;
1610			}
1611			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1612			VOP_UNLOCK(vp, 0);
1613			vput(nd.ni_dvp);
1614			vn_finished_write(mp);
1615			NDFREE(&nd, NDF_ONLY_PNBUF);
1616		} else {
1617			vput(nd.ni_dvp);
1618			NDFREE(&nd, NDF_ONLY_PNBUF);
1619			vrele(vp);
1620			goto again;
1621		}
1622	}
1623	vrele(vp);
1624	return (error);
1625}
1626
1627/*
1628 * Make a symbolic link.
1629 */
1630#ifndef _SYS_SYSPROTO_H_
1631struct symlink_args {
1632	char	*path;
1633	char	*link;
1634};
1635#endif
1636int
1637sys_symlink(td, uap)
1638	struct thread *td;
1639	register struct symlink_args /* {
1640		char *path;
1641		char *link;
1642	} */ *uap;
1643{
1644
1645	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1646}
1647
1648#ifndef _SYS_SYSPROTO_H_
1649struct symlinkat_args {
1650	char	*path;
1651	int	fd;
1652	char	*path2;
1653};
1654#endif
1655int
1656sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1657{
1658
1659	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1660	    UIO_USERSPACE));
1661}
1662
1663int
1664kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1665{
1666
1667	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1668}
1669
1670int
1671kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1672    enum uio_seg segflg)
1673{
1674	struct mount *mp;
1675	struct vattr vattr;
1676	char *syspath;
1677	struct nameidata nd;
1678	int error;
1679	cap_rights_t rights;
1680
1681	if (segflg == UIO_SYSSPACE) {
1682		syspath = path1;
1683	} else {
1684		syspath = uma_zalloc(namei_zone, M_WAITOK);
1685		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1686			goto out;
1687	}
1688	AUDIT_ARG_TEXT(syspath);
1689restart:
1690	bwillwrite();
1691	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1692	    NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
1693	    td);
1694	if ((error = namei(&nd)) != 0)
1695		goto out;
1696	if (nd.ni_vp) {
1697		NDFREE(&nd, NDF_ONLY_PNBUF);
1698		if (nd.ni_vp == nd.ni_dvp)
1699			vrele(nd.ni_dvp);
1700		else
1701			vput(nd.ni_dvp);
1702		vrele(nd.ni_vp);
1703		error = EEXIST;
1704		goto out;
1705	}
1706	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1707		NDFREE(&nd, NDF_ONLY_PNBUF);
1708		vput(nd.ni_dvp);
1709		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1710			goto out;
1711		goto restart;
1712	}
1713	VATTR_NULL(&vattr);
1714	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1715#ifdef MAC
1716	vattr.va_type = VLNK;
1717	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1718	    &vattr);
1719	if (error != 0)
1720		goto out2;
1721#endif
1722	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1723	if (error == 0)
1724		vput(nd.ni_vp);
1725#ifdef MAC
1726out2:
1727#endif
1728	NDFREE(&nd, NDF_ONLY_PNBUF);
1729	vput(nd.ni_dvp);
1730	vn_finished_write(mp);
1731out:
1732	if (segflg != UIO_SYSSPACE)
1733		uma_zfree(namei_zone, syspath);
1734	return (error);
1735}
1736
1737/*
1738 * Delete a whiteout from the filesystem.
1739 */
1740int
1741sys_undelete(td, uap)
1742	struct thread *td;
1743	register struct undelete_args /* {
1744		char *path;
1745	} */ *uap;
1746{
1747	struct mount *mp;
1748	struct nameidata nd;
1749	int error;
1750
1751restart:
1752	bwillwrite();
1753	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1754	    UIO_USERSPACE, uap->path, td);
1755	error = namei(&nd);
1756	if (error != 0)
1757		return (error);
1758
1759	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1760		NDFREE(&nd, NDF_ONLY_PNBUF);
1761		if (nd.ni_vp == nd.ni_dvp)
1762			vrele(nd.ni_dvp);
1763		else
1764			vput(nd.ni_dvp);
1765		if (nd.ni_vp)
1766			vrele(nd.ni_vp);
1767		return (EEXIST);
1768	}
1769	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1770		NDFREE(&nd, NDF_ONLY_PNBUF);
1771		vput(nd.ni_dvp);
1772		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1773			return (error);
1774		goto restart;
1775	}
1776	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1777	NDFREE(&nd, NDF_ONLY_PNBUF);
1778	vput(nd.ni_dvp);
1779	vn_finished_write(mp);
1780	return (error);
1781}
1782
1783/*
1784 * Delete a name from the filesystem.
1785 */
1786#ifndef _SYS_SYSPROTO_H_
1787struct unlink_args {
1788	char	*path;
1789};
1790#endif
1791int
1792sys_unlink(td, uap)
1793	struct thread *td;
1794	struct unlink_args /* {
1795		char *path;
1796	} */ *uap;
1797{
1798
1799	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1800}
1801
1802#ifndef _SYS_SYSPROTO_H_
1803struct unlinkat_args {
1804	int	fd;
1805	char	*path;
1806	int	flag;
1807};
1808#endif
1809int
1810sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1811{
1812	int flag = uap->flag;
1813	int fd = uap->fd;
1814	char *path = uap->path;
1815
1816	if (flag & ~AT_REMOVEDIR)
1817		return (EINVAL);
1818
1819	if (flag & AT_REMOVEDIR)
1820		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1821	else
1822		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1823}
1824
1825int
1826kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1827{
1828
1829	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1830}
1831
1832int
1833kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1834    ino_t oldinum)
1835{
1836	struct mount *mp;
1837	struct vnode *vp;
1838	struct nameidata nd;
1839	struct stat sb;
1840	cap_rights_t rights;
1841	int error;
1842
1843restart:
1844	bwillwrite();
1845	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1846	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1847	if ((error = namei(&nd)) != 0)
1848		return (error == EINVAL ? EPERM : error);
1849	vp = nd.ni_vp;
1850	if (vp->v_type == VDIR && oldinum == 0) {
1851		error = EPERM;		/* POSIX */
1852	} else if (oldinum != 0 &&
1853		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1854		  sb.st_ino != oldinum) {
1855			error = EIDRM;	/* Identifier removed */
1856	} else {
1857		/*
1858		 * The root of a mounted filesystem cannot be deleted.
1859		 *
1860		 * XXX: can this only be a VDIR case?
1861		 */
1862		if (vp->v_vflag & VV_ROOT)
1863			error = EBUSY;
1864	}
1865	if (error == 0) {
1866		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1867			NDFREE(&nd, NDF_ONLY_PNBUF);
1868			vput(nd.ni_dvp);
1869			if (vp == nd.ni_dvp)
1870				vrele(vp);
1871			else
1872				vput(vp);
1873			if ((error = vn_start_write(NULL, &mp,
1874			    V_XSLEEP | PCATCH)) != 0)
1875				return (error);
1876			goto restart;
1877		}
1878#ifdef MAC
1879		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1880		    &nd.ni_cnd);
1881		if (error != 0)
1882			goto out;
1883#endif
1884		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1885		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1886#ifdef MAC
1887out:
1888#endif
1889		vn_finished_write(mp);
1890	}
1891	NDFREE(&nd, NDF_ONLY_PNBUF);
1892	vput(nd.ni_dvp);
1893	if (vp == nd.ni_dvp)
1894		vrele(vp);
1895	else
1896		vput(vp);
1897	return (error);
1898}
1899
1900/*
1901 * Reposition read/write file offset.
1902 */
1903#ifndef _SYS_SYSPROTO_H_
1904struct lseek_args {
1905	int	fd;
1906	int	pad;
1907	off_t	offset;
1908	int	whence;
1909};
1910#endif
1911int
1912sys_lseek(td, uap)
1913	struct thread *td;
1914	register struct lseek_args /* {
1915		int fd;
1916		int pad;
1917		off_t offset;
1918		int whence;
1919	} */ *uap;
1920{
1921	struct file *fp;
1922	cap_rights_t rights;
1923	int error;
1924
1925	AUDIT_ARG_FD(uap->fd);
1926	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1927	if (error != 0)
1928		return (error);
1929	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1930	    fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
1931	fdrop(fp, td);
1932	return (error);
1933}
1934
1935#if defined(COMPAT_43)
1936/*
1937 * Reposition read/write file offset.
1938 */
1939#ifndef _SYS_SYSPROTO_H_
1940struct olseek_args {
1941	int	fd;
1942	long	offset;
1943	int	whence;
1944};
1945#endif
1946int
1947olseek(td, uap)
1948	struct thread *td;
1949	register struct olseek_args /* {
1950		int fd;
1951		long offset;
1952		int whence;
1953	} */ *uap;
1954{
1955	struct lseek_args /* {
1956		int fd;
1957		int pad;
1958		off_t offset;
1959		int whence;
1960	} */ nuap;
1961
1962	nuap.fd = uap->fd;
1963	nuap.offset = uap->offset;
1964	nuap.whence = uap->whence;
1965	return (sys_lseek(td, &nuap));
1966}
1967#endif /* COMPAT_43 */
1968
1969/* Version with the 'pad' argument */
1970int
1971freebsd6_lseek(td, uap)
1972	struct thread *td;
1973	register struct freebsd6_lseek_args *uap;
1974{
1975	struct lseek_args ouap;
1976
1977	ouap.fd = uap->fd;
1978	ouap.offset = uap->offset;
1979	ouap.whence = uap->whence;
1980	return (sys_lseek(td, &ouap));
1981}
1982
1983/*
1984 * Check access permissions using passed credentials.
1985 */
1986static int
1987vn_access(vp, user_flags, cred, td)
1988	struct vnode	*vp;
1989	int		user_flags;
1990	struct ucred	*cred;
1991	struct thread	*td;
1992{
1993	accmode_t accmode;
1994	int error;
1995
1996	/* Flags == 0 means only check for existence. */
1997	error = 0;
1998	if (user_flags) {
1999		accmode = 0;
2000		if (user_flags & R_OK)
2001			accmode |= VREAD;
2002		if (user_flags & W_OK)
2003			accmode |= VWRITE;
2004		if (user_flags & X_OK)
2005			accmode |= VEXEC;
2006#ifdef MAC
2007		error = mac_vnode_check_access(cred, vp, accmode);
2008		if (error != 0)
2009			return (error);
2010#endif
2011		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2012			error = VOP_ACCESS(vp, accmode, cred, td);
2013	}
2014	return (error);
2015}
2016
2017/*
2018 * Check access permissions using "real" credentials.
2019 */
2020#ifndef _SYS_SYSPROTO_H_
2021struct access_args {
2022	char	*path;
2023	int	amode;
2024};
2025#endif
2026int
2027sys_access(td, uap)
2028	struct thread *td;
2029	register struct access_args /* {
2030		char *path;
2031		int amode;
2032	} */ *uap;
2033{
2034
2035	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2036}
2037
2038#ifndef _SYS_SYSPROTO_H_
2039struct faccessat_args {
2040	int	dirfd;
2041	char	*path;
2042	int	amode;
2043	int	flag;
2044}
2045#endif
2046int
2047sys_faccessat(struct thread *td, struct faccessat_args *uap)
2048{
2049
2050	if (uap->flag & ~AT_EACCESS)
2051		return (EINVAL);
2052	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2053	    uap->amode));
2054}
2055
2056int
2057kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2058{
2059
2060	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2061}
2062
2063int
2064kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2065    int flag, int amode)
2066{
2067	struct ucred *cred, *tmpcred;
2068	struct vnode *vp;
2069	struct nameidata nd;
2070	cap_rights_t rights;
2071	int error;
2072
2073	/*
2074	 * Create and modify a temporary credential instead of one that
2075	 * is potentially shared.
2076	 */
2077	if (!(flag & AT_EACCESS)) {
2078		cred = td->td_ucred;
2079		tmpcred = crdup(cred);
2080		tmpcred->cr_uid = cred->cr_ruid;
2081		tmpcred->cr_groups[0] = cred->cr_rgid;
2082		td->td_ucred = tmpcred;
2083	} else
2084		cred = tmpcred = td->td_ucred;
2085	AUDIT_ARG_VALUE(amode);
2086	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
2087	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
2088	    td);
2089	if ((error = namei(&nd)) != 0)
2090		goto out1;
2091	vp = nd.ni_vp;
2092
2093	error = vn_access(vp, amode, tmpcred, td);
2094	NDFREE(&nd, NDF_ONLY_PNBUF);
2095	vput(vp);
2096out1:
2097	if (!(flag & AT_EACCESS)) {
2098		td->td_ucred = cred;
2099		crfree(tmpcred);
2100	}
2101	return (error);
2102}
2103
2104/*
2105 * Check access permissions using "effective" credentials.
2106 */
2107#ifndef _SYS_SYSPROTO_H_
2108struct eaccess_args {
2109	char	*path;
2110	int	amode;
2111};
2112#endif
2113int
2114sys_eaccess(td, uap)
2115	struct thread *td;
2116	register struct eaccess_args /* {
2117		char *path;
2118		int amode;
2119	} */ *uap;
2120{
2121
2122	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2123}
2124
2125int
2126kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2127{
2128
2129	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2130}
2131
2132#if defined(COMPAT_43)
2133/*
2134 * Get file status; this version follows links.
2135 */
2136#ifndef _SYS_SYSPROTO_H_
2137struct ostat_args {
2138	char	*path;
2139	struct ostat *ub;
2140};
2141#endif
2142int
2143ostat(td, uap)
2144	struct thread *td;
2145	register struct ostat_args /* {
2146		char *path;
2147		struct ostat *ub;
2148	} */ *uap;
2149{
2150	struct stat sb;
2151	struct ostat osb;
2152	int error;
2153
2154	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2155	if (error != 0)
2156		return (error);
2157	cvtstat(&sb, &osb);
2158	return (copyout(&osb, uap->ub, sizeof (osb)));
2159}
2160
2161/*
2162 * Get file status; this version does not follow links.
2163 */
2164#ifndef _SYS_SYSPROTO_H_
2165struct olstat_args {
2166	char	*path;
2167	struct ostat *ub;
2168};
2169#endif
2170int
2171olstat(td, uap)
2172	struct thread *td;
2173	register struct olstat_args /* {
2174		char *path;
2175		struct ostat *ub;
2176	} */ *uap;
2177{
2178	struct stat sb;
2179	struct ostat osb;
2180	int error;
2181
2182	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2183	if (error != 0)
2184		return (error);
2185	cvtstat(&sb, &osb);
2186	return (copyout(&osb, uap->ub, sizeof (osb)));
2187}
2188
2189/*
2190 * Convert from an old to a new stat structure.
2191 */
2192void
2193cvtstat(st, ost)
2194	struct stat *st;
2195	struct ostat *ost;
2196{
2197
2198	ost->st_dev = st->st_dev;
2199	ost->st_ino = st->st_ino;
2200	ost->st_mode = st->st_mode;
2201	ost->st_nlink = st->st_nlink;
2202	ost->st_uid = st->st_uid;
2203	ost->st_gid = st->st_gid;
2204	ost->st_rdev = st->st_rdev;
2205	if (st->st_size < (quad_t)1 << 32)
2206		ost->st_size = st->st_size;
2207	else
2208		ost->st_size = -2;
2209	ost->st_atim = st->st_atim;
2210	ost->st_mtim = st->st_mtim;
2211	ost->st_ctim = st->st_ctim;
2212	ost->st_blksize = st->st_blksize;
2213	ost->st_blocks = st->st_blocks;
2214	ost->st_flags = st->st_flags;
2215	ost->st_gen = st->st_gen;
2216}
2217#endif /* COMPAT_43 */
2218
2219/*
2220 * Get file status; this version follows links.
2221 */
2222#ifndef _SYS_SYSPROTO_H_
2223struct stat_args {
2224	char	*path;
2225	struct stat *ub;
2226};
2227#endif
2228int
2229sys_stat(td, uap)
2230	struct thread *td;
2231	register struct stat_args /* {
2232		char *path;
2233		struct stat *ub;
2234	} */ *uap;
2235{
2236	struct stat sb;
2237	int error;
2238
2239	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2240	if (error == 0)
2241		error = copyout(&sb, uap->ub, sizeof (sb));
2242	return (error);
2243}
2244
2245#ifndef _SYS_SYSPROTO_H_
2246struct fstatat_args {
2247	int	fd;
2248	char	*path;
2249	struct stat	*buf;
2250	int	flag;
2251}
2252#endif
2253int
2254sys_fstatat(struct thread *td, struct fstatat_args *uap)
2255{
2256	struct stat sb;
2257	int error;
2258
2259	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2260	    UIO_USERSPACE, &sb);
2261	if (error == 0)
2262		error = copyout(&sb, uap->buf, sizeof (sb));
2263	return (error);
2264}
2265
2266int
2267kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2268{
2269
2270	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2271}
2272
2273int
2274kern_statat(struct thread *td, int flag, int fd, char *path,
2275    enum uio_seg pathseg, struct stat *sbp)
2276{
2277
2278	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2279}
2280
2281int
2282kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2283    enum uio_seg pathseg, struct stat *sbp,
2284    void (*hook)(struct vnode *vp, struct stat *sbp))
2285{
2286	struct nameidata nd;
2287	struct stat sb;
2288	cap_rights_t rights;
2289	int error;
2290
2291	if (flag & ~AT_SYMLINK_NOFOLLOW)
2292		return (EINVAL);
2293
2294	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2295	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2296	    cap_rights_init(&rights, CAP_FSTAT), td);
2297
2298	if ((error = namei(&nd)) != 0)
2299		return (error);
2300	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2301	if (error == 0) {
2302		SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
2303		if (S_ISREG(sb.st_mode))
2304			SDT_PROBE2(vfs, , stat, reg, path, pathseg);
2305		if (__predict_false(hook != NULL))
2306			hook(nd.ni_vp, &sb);
2307	}
2308	NDFREE(&nd, NDF_ONLY_PNBUF);
2309	vput(nd.ni_vp);
2310	if (error != 0)
2311		return (error);
2312	*sbp = sb;
2313#ifdef KTRACE
2314	if (KTRPOINT(td, KTR_STRUCT))
2315		ktrstat(&sb);
2316#endif
2317	return (0);
2318}
2319
2320/*
2321 * Get file status; this version does not follow links.
2322 */
2323#ifndef _SYS_SYSPROTO_H_
2324struct lstat_args {
2325	char	*path;
2326	struct stat *ub;
2327};
2328#endif
2329int
2330sys_lstat(td, uap)
2331	struct thread *td;
2332	register struct lstat_args /* {
2333		char *path;
2334		struct stat *ub;
2335	} */ *uap;
2336{
2337	struct stat sb;
2338	int error;
2339
2340	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2341	if (error == 0)
2342		error = copyout(&sb, uap->ub, sizeof (sb));
2343	return (error);
2344}
2345
2346int
2347kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2348{
2349
2350	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2351	    sbp));
2352}
2353
2354/*
2355 * Implementation of the NetBSD [l]stat() functions.
2356 */
2357void
2358cvtnstat(sb, nsb)
2359	struct stat *sb;
2360	struct nstat *nsb;
2361{
2362
2363	bzero(nsb, sizeof *nsb);
2364	nsb->st_dev = sb->st_dev;
2365	nsb->st_ino = sb->st_ino;
2366	nsb->st_mode = sb->st_mode;
2367	nsb->st_nlink = sb->st_nlink;
2368	nsb->st_uid = sb->st_uid;
2369	nsb->st_gid = sb->st_gid;
2370	nsb->st_rdev = sb->st_rdev;
2371	nsb->st_atim = sb->st_atim;
2372	nsb->st_mtim = sb->st_mtim;
2373	nsb->st_ctim = sb->st_ctim;
2374	nsb->st_size = sb->st_size;
2375	nsb->st_blocks = sb->st_blocks;
2376	nsb->st_blksize = sb->st_blksize;
2377	nsb->st_flags = sb->st_flags;
2378	nsb->st_gen = sb->st_gen;
2379	nsb->st_birthtim = sb->st_birthtim;
2380}
2381
2382#ifndef _SYS_SYSPROTO_H_
2383struct nstat_args {
2384	char	*path;
2385	struct nstat *ub;
2386};
2387#endif
2388int
2389sys_nstat(td, uap)
2390	struct thread *td;
2391	register struct nstat_args /* {
2392		char *path;
2393		struct nstat *ub;
2394	} */ *uap;
2395{
2396	struct stat sb;
2397	struct nstat nsb;
2398	int error;
2399
2400	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2401	if (error != 0)
2402		return (error);
2403	cvtnstat(&sb, &nsb);
2404	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2405}
2406
2407/*
2408 * NetBSD lstat.  Get file status; this version does not follow links.
2409 */
2410#ifndef _SYS_SYSPROTO_H_
2411struct lstat_args {
2412	char	*path;
2413	struct stat *ub;
2414};
2415#endif
2416int
2417sys_nlstat(td, uap)
2418	struct thread *td;
2419	register struct nlstat_args /* {
2420		char *path;
2421		struct nstat *ub;
2422	} */ *uap;
2423{
2424	struct stat sb;
2425	struct nstat nsb;
2426	int error;
2427
2428	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2429	if (error != 0)
2430		return (error);
2431	cvtnstat(&sb, &nsb);
2432	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2433}
2434
2435/*
2436 * Get configurable pathname variables.
2437 */
2438#ifndef _SYS_SYSPROTO_H_
2439struct pathconf_args {
2440	char	*path;
2441	int	name;
2442};
2443#endif
2444int
2445sys_pathconf(td, uap)
2446	struct thread *td;
2447	register struct pathconf_args /* {
2448		char *path;
2449		int name;
2450	} */ *uap;
2451{
2452
2453	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2454}
2455
2456#ifndef _SYS_SYSPROTO_H_
2457struct lpathconf_args {
2458	char	*path;
2459	int	name;
2460};
2461#endif
2462int
2463sys_lpathconf(td, uap)
2464	struct thread *td;
2465	register struct lpathconf_args /* {
2466		char *path;
2467		int name;
2468	} */ *uap;
2469{
2470
2471	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2472	    NOFOLLOW));
2473}
2474
2475int
2476kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2477    u_long flags)
2478{
2479	struct nameidata nd;
2480	int error;
2481
2482	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2483	    pathseg, path, td);
2484	if ((error = namei(&nd)) != 0)
2485		return (error);
2486	NDFREE(&nd, NDF_ONLY_PNBUF);
2487
2488	/* If asynchronous I/O is available, it works for all files. */
2489	if (name == _PC_ASYNC_IO)
2490		td->td_retval[0] = async_io_version;
2491	else
2492		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2493	vput(nd.ni_vp);
2494	return (error);
2495}
2496
2497/*
2498 * Return target name of a symbolic link.
2499 */
2500#ifndef _SYS_SYSPROTO_H_
2501struct readlink_args {
2502	char	*path;
2503	char	*buf;
2504	size_t	count;
2505};
2506#endif
2507int
2508sys_readlink(td, uap)
2509	struct thread *td;
2510	register struct readlink_args /* {
2511		char *path;
2512		char *buf;
2513		size_t count;
2514	} */ *uap;
2515{
2516
2517	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2518	    UIO_USERSPACE, uap->count));
2519}
2520#ifndef _SYS_SYSPROTO_H_
2521struct readlinkat_args {
2522	int	fd;
2523	char	*path;
2524	char	*buf;
2525	size_t	bufsize;
2526};
2527#endif
2528int
2529sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2530{
2531
2532	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2533	    uap->buf, UIO_USERSPACE, uap->bufsize));
2534}
2535
2536int
2537kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2538    enum uio_seg bufseg, size_t count)
2539{
2540
2541	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2542	    count));
2543}
2544
2545int
2546kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2547    char *buf, enum uio_seg bufseg, size_t count)
2548{
2549	struct vnode *vp;
2550	struct iovec aiov;
2551	struct uio auio;
2552	struct nameidata nd;
2553	int error;
2554
2555	if (count > IOSIZE_MAX)
2556		return (EINVAL);
2557
2558	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2559	    pathseg, path, fd, td);
2560
2561	if ((error = namei(&nd)) != 0)
2562		return (error);
2563	NDFREE(&nd, NDF_ONLY_PNBUF);
2564	vp = nd.ni_vp;
2565#ifdef MAC
2566	error = mac_vnode_check_readlink(td->td_ucred, vp);
2567	if (error != 0) {
2568		vput(vp);
2569		return (error);
2570	}
2571#endif
2572	if (vp->v_type != VLNK)
2573		error = EINVAL;
2574	else {
2575		aiov.iov_base = buf;
2576		aiov.iov_len = count;
2577		auio.uio_iov = &aiov;
2578		auio.uio_iovcnt = 1;
2579		auio.uio_offset = 0;
2580		auio.uio_rw = UIO_READ;
2581		auio.uio_segflg = bufseg;
2582		auio.uio_td = td;
2583		auio.uio_resid = count;
2584		error = VOP_READLINK(vp, &auio, td->td_ucred);
2585		td->td_retval[0] = count - auio.uio_resid;
2586	}
2587	vput(vp);
2588	return (error);
2589}
2590
2591/*
2592 * Common implementation code for chflags() and fchflags().
2593 */
2594static int
2595setfflags(td, vp, flags)
2596	struct thread *td;
2597	struct vnode *vp;
2598	u_long flags;
2599{
2600	struct mount *mp;
2601	struct vattr vattr;
2602	int error;
2603
2604	/* We can't support the value matching VNOVAL. */
2605	if (flags == VNOVAL)
2606		return (EOPNOTSUPP);
2607
2608	/*
2609	 * Prevent non-root users from setting flags on devices.  When
2610	 * a device is reused, users can retain ownership of the device
2611	 * if they are allowed to set flags and programs assume that
2612	 * chown can't fail when done as root.
2613	 */
2614	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2615		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2616		if (error != 0)
2617			return (error);
2618	}
2619
2620	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2621		return (error);
2622	VATTR_NULL(&vattr);
2623	vattr.va_flags = flags;
2624	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2625#ifdef MAC
2626	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2627	if (error == 0)
2628#endif
2629		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2630	VOP_UNLOCK(vp, 0);
2631	vn_finished_write(mp);
2632	return (error);
2633}
2634
2635/*
2636 * Change flags of a file given a path name.
2637 */
2638#ifndef _SYS_SYSPROTO_H_
2639struct chflags_args {
2640	const char *path;
2641	u_long	flags;
2642};
2643#endif
2644int
2645sys_chflags(td, uap)
2646	struct thread *td;
2647	register struct chflags_args /* {
2648		const char *path;
2649		u_long flags;
2650	} */ *uap;
2651{
2652
2653	return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
2654}
2655
2656#ifndef _SYS_SYSPROTO_H_
2657struct chflagsat_args {
2658	int	fd;
2659	const char *path;
2660	u_long	flags;
2661	int	atflag;
2662}
2663#endif
2664int
2665sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2666{
2667	int fd = uap->fd;
2668	const char *path = uap->path;
2669	u_long flags = uap->flags;
2670	int atflag = uap->atflag;
2671
2672	if (atflag & ~AT_SYMLINK_NOFOLLOW)
2673		return (EINVAL);
2674
2675	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2676}
2677
2678static int
2679kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
2680    u_long flags)
2681{
2682
2683	return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
2684}
2685
2686/*
2687 * Same as chflags() but doesn't follow symlinks.
2688 */
2689int
2690sys_lchflags(td, uap)
2691	struct thread *td;
2692	register struct lchflags_args /* {
2693		const char *path;
2694		u_long flags;
2695	} */ *uap;
2696{
2697
2698	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2699	    uap->flags, AT_SYMLINK_NOFOLLOW));
2700}
2701
2702static int
2703kern_chflagsat(struct thread *td, int fd, const char *path,
2704    enum uio_seg pathseg, u_long flags, int atflag)
2705{
2706	struct nameidata nd;
2707	cap_rights_t rights;
2708	int error, follow;
2709
2710	AUDIT_ARG_FFLAGS(flags);
2711	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2712	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2713	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
2714	if ((error = namei(&nd)) != 0)
2715		return (error);
2716	NDFREE(&nd, NDF_ONLY_PNBUF);
2717	error = setfflags(td, nd.ni_vp, flags);
2718	vrele(nd.ni_vp);
2719	return (error);
2720}
2721
2722/*
2723 * Change flags of a file given a file descriptor.
2724 */
2725#ifndef _SYS_SYSPROTO_H_
2726struct fchflags_args {
2727	int	fd;
2728	u_long	flags;
2729};
2730#endif
2731int
2732sys_fchflags(td, uap)
2733	struct thread *td;
2734	register struct fchflags_args /* {
2735		int fd;
2736		u_long flags;
2737	} */ *uap;
2738{
2739	struct file *fp;
2740	cap_rights_t rights;
2741	int error;
2742
2743	AUDIT_ARG_FD(uap->fd);
2744	AUDIT_ARG_FFLAGS(uap->flags);
2745	error = getvnode(td->td_proc->p_fd, uap->fd,
2746	    cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
2747	if (error != 0)
2748		return (error);
2749#ifdef AUDIT
2750	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2751	AUDIT_ARG_VNODE1(fp->f_vnode);
2752	VOP_UNLOCK(fp->f_vnode, 0);
2753#endif
2754	error = setfflags(td, fp->f_vnode, uap->flags);
2755	fdrop(fp, td);
2756	return (error);
2757}
2758
2759/*
2760 * Common implementation code for chmod(), lchmod() and fchmod().
2761 */
2762int
2763setfmode(td, cred, vp, mode)
2764	struct thread *td;
2765	struct ucred *cred;
2766	struct vnode *vp;
2767	int mode;
2768{
2769	struct mount *mp;
2770	struct vattr vattr;
2771	int error;
2772
2773	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2774		return (error);
2775	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2776	VATTR_NULL(&vattr);
2777	vattr.va_mode = mode & ALLPERMS;
2778#ifdef MAC
2779	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2780	if (error == 0)
2781#endif
2782		error = VOP_SETATTR(vp, &vattr, cred);
2783	VOP_UNLOCK(vp, 0);
2784	vn_finished_write(mp);
2785	return (error);
2786}
2787
2788/*
2789 * Change mode of a file given path name.
2790 */
2791#ifndef _SYS_SYSPROTO_H_
2792struct chmod_args {
2793	char	*path;
2794	int	mode;
2795};
2796#endif
2797int
2798sys_chmod(td, uap)
2799	struct thread *td;
2800	register struct chmod_args /* {
2801		char *path;
2802		int mode;
2803	} */ *uap;
2804{
2805
2806	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2807}
2808
2809#ifndef _SYS_SYSPROTO_H_
2810struct fchmodat_args {
2811	int	dirfd;
2812	char	*path;
2813	mode_t	mode;
2814	int	flag;
2815}
2816#endif
2817int
2818sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2819{
2820	int flag = uap->flag;
2821	int fd = uap->fd;
2822	char *path = uap->path;
2823	mode_t mode = uap->mode;
2824
2825	if (flag & ~AT_SYMLINK_NOFOLLOW)
2826		return (EINVAL);
2827
2828	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2829}
2830
2831int
2832kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2833{
2834
2835	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2836}
2837
2838/*
2839 * Change mode of a file given path name (don't follow links.)
2840 */
2841#ifndef _SYS_SYSPROTO_H_
2842struct lchmod_args {
2843	char	*path;
2844	int	mode;
2845};
2846#endif
2847int
2848sys_lchmod(td, uap)
2849	struct thread *td;
2850	register struct lchmod_args /* {
2851		char *path;
2852		int mode;
2853	} */ *uap;
2854{
2855
2856	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2857	    uap->mode, AT_SYMLINK_NOFOLLOW));
2858}
2859
2860int
2861kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2862    mode_t mode, int flag)
2863{
2864	struct nameidata nd;
2865	cap_rights_t rights;
2866	int error, follow;
2867
2868	AUDIT_ARG_MODE(mode);
2869	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2870	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2871	    cap_rights_init(&rights, CAP_FCHMOD), td);
2872	if ((error = namei(&nd)) != 0)
2873		return (error);
2874	NDFREE(&nd, NDF_ONLY_PNBUF);
2875	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2876	vrele(nd.ni_vp);
2877	return (error);
2878}
2879
2880/*
2881 * Change mode of a file given a file descriptor.
2882 */
2883#ifndef _SYS_SYSPROTO_H_
2884struct fchmod_args {
2885	int	fd;
2886	int	mode;
2887};
2888#endif
2889int
2890sys_fchmod(struct thread *td, struct fchmod_args *uap)
2891{
2892	struct file *fp;
2893	cap_rights_t rights;
2894	int error;
2895
2896	AUDIT_ARG_FD(uap->fd);
2897	AUDIT_ARG_MODE(uap->mode);
2898
2899	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2900	if (error != 0)
2901		return (error);
2902	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2903	fdrop(fp, td);
2904	return (error);
2905}
2906
2907/*
2908 * Common implementation for chown(), lchown(), and fchown()
2909 */
2910int
2911setfown(td, cred, vp, uid, gid)
2912	struct thread *td;
2913	struct ucred *cred;
2914	struct vnode *vp;
2915	uid_t uid;
2916	gid_t gid;
2917{
2918	struct mount *mp;
2919	struct vattr vattr;
2920	int error;
2921
2922	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2923		return (error);
2924	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2925	VATTR_NULL(&vattr);
2926	vattr.va_uid = uid;
2927	vattr.va_gid = gid;
2928#ifdef MAC
2929	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2930	    vattr.va_gid);
2931	if (error == 0)
2932#endif
2933		error = VOP_SETATTR(vp, &vattr, cred);
2934	VOP_UNLOCK(vp, 0);
2935	vn_finished_write(mp);
2936	return (error);
2937}
2938
2939/*
2940 * Set ownership given a path name.
2941 */
2942#ifndef _SYS_SYSPROTO_H_
2943struct chown_args {
2944	char	*path;
2945	int	uid;
2946	int	gid;
2947};
2948#endif
2949int
2950sys_chown(td, uap)
2951	struct thread *td;
2952	register struct chown_args /* {
2953		char *path;
2954		int uid;
2955		int gid;
2956	} */ *uap;
2957{
2958
2959	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
2960}
2961
2962#ifndef _SYS_SYSPROTO_H_
2963struct fchownat_args {
2964	int fd;
2965	const char * path;
2966	uid_t uid;
2967	gid_t gid;
2968	int flag;
2969};
2970#endif
2971int
2972sys_fchownat(struct thread *td, struct fchownat_args *uap)
2973{
2974	int flag;
2975
2976	flag = uap->flag;
2977	if (flag & ~AT_SYMLINK_NOFOLLOW)
2978		return (EINVAL);
2979
2980	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2981	    uap->gid, uap->flag));
2982}
2983
2984int
2985kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
2986    int gid)
2987{
2988
2989	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
2990}
2991
2992int
2993kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2994    int uid, int gid, int flag)
2995{
2996	struct nameidata nd;
2997	cap_rights_t rights;
2998	int error, follow;
2999
3000	AUDIT_ARG_OWNER(uid, gid);
3001	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3002	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
3003	    cap_rights_init(&rights, CAP_FCHOWN), td);
3004
3005	if ((error = namei(&nd)) != 0)
3006		return (error);
3007	NDFREE(&nd, NDF_ONLY_PNBUF);
3008	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3009	vrele(nd.ni_vp);
3010	return (error);
3011}
3012
3013/*
3014 * Set ownership given a path name, do not cross symlinks.
3015 */
3016#ifndef _SYS_SYSPROTO_H_
3017struct lchown_args {
3018	char	*path;
3019	int	uid;
3020	int	gid;
3021};
3022#endif
3023int
3024sys_lchown(td, uap)
3025	struct thread *td;
3026	register struct lchown_args /* {
3027		char *path;
3028		int uid;
3029		int gid;
3030	} */ *uap;
3031{
3032
3033	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3034}
3035
3036int
3037kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3038    int gid)
3039{
3040
3041	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3042	    AT_SYMLINK_NOFOLLOW));
3043}
3044
3045/*
3046 * Set ownership given a file descriptor.
3047 */
3048#ifndef _SYS_SYSPROTO_H_
3049struct fchown_args {
3050	int	fd;
3051	int	uid;
3052	int	gid;
3053};
3054#endif
3055int
3056sys_fchown(td, uap)
3057	struct thread *td;
3058	register struct fchown_args /* {
3059		int fd;
3060		int uid;
3061		int gid;
3062	} */ *uap;
3063{
3064	struct file *fp;
3065	cap_rights_t rights;
3066	int error;
3067
3068	AUDIT_ARG_FD(uap->fd);
3069	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3070	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
3071	if (error != 0)
3072		return (error);
3073	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3074	fdrop(fp, td);
3075	return (error);
3076}
3077
3078/*
3079 * Common implementation code for utimes(), lutimes(), and futimes().
3080 */
3081static int
3082getutimes(usrtvp, tvpseg, tsp)
3083	const struct timeval *usrtvp;
3084	enum uio_seg tvpseg;
3085	struct timespec *tsp;
3086{
3087	struct timeval tv[2];
3088	const struct timeval *tvp;
3089	int error;
3090
3091	if (usrtvp == NULL) {
3092		vfs_timestamp(&tsp[0]);
3093		tsp[1] = tsp[0];
3094	} else {
3095		if (tvpseg == UIO_SYSSPACE) {
3096			tvp = usrtvp;
3097		} else {
3098			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3099				return (error);
3100			tvp = tv;
3101		}
3102
3103		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3104		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3105			return (EINVAL);
3106		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3107		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3108	}
3109	return (0);
3110}
3111
3112/*
3113 * Common implementation code for utimes(), lutimes(), and futimes().
3114 */
3115static int
3116setutimes(td, vp, ts, numtimes, nullflag)
3117	struct thread *td;
3118	struct vnode *vp;
3119	const struct timespec *ts;
3120	int numtimes;
3121	int nullflag;
3122{
3123	struct mount *mp;
3124	struct vattr vattr;
3125	int error, setbirthtime;
3126
3127	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3128		return (error);
3129	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3130	setbirthtime = 0;
3131	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3132	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3133		setbirthtime = 1;
3134	VATTR_NULL(&vattr);
3135	vattr.va_atime = ts[0];
3136	vattr.va_mtime = ts[1];
3137	if (setbirthtime)
3138		vattr.va_birthtime = ts[1];
3139	if (numtimes > 2)
3140		vattr.va_birthtime = ts[2];
3141	if (nullflag)
3142		vattr.va_vaflags |= VA_UTIMES_NULL;
3143#ifdef MAC
3144	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3145	    vattr.va_mtime);
3146#endif
3147	if (error == 0)
3148		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3149	VOP_UNLOCK(vp, 0);
3150	vn_finished_write(mp);
3151	return (error);
3152}
3153
3154/*
3155 * Set the access and modification times of a file.
3156 */
3157#ifndef _SYS_SYSPROTO_H_
3158struct utimes_args {
3159	char	*path;
3160	struct	timeval *tptr;
3161};
3162#endif
3163int
3164sys_utimes(td, uap)
3165	struct thread *td;
3166	register struct utimes_args /* {
3167		char *path;
3168		struct timeval *tptr;
3169	} */ *uap;
3170{
3171
3172	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3173	    UIO_USERSPACE));
3174}
3175
3176#ifndef _SYS_SYSPROTO_H_
3177struct futimesat_args {
3178	int fd;
3179	const char * path;
3180	const struct timeval * times;
3181};
3182#endif
3183int
3184sys_futimesat(struct thread *td, struct futimesat_args *uap)
3185{
3186
3187	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3188	    uap->times, UIO_USERSPACE));
3189}
3190
3191int
3192kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3193    struct timeval *tptr, enum uio_seg tptrseg)
3194{
3195
3196	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3197}
3198
3199int
3200kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3201    struct timeval *tptr, enum uio_seg tptrseg)
3202{
3203	struct nameidata nd;
3204	struct timespec ts[2];
3205	cap_rights_t rights;
3206	int error;
3207
3208	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3209		return (error);
3210	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3211	    cap_rights_init(&rights, CAP_FUTIMES), td);
3212
3213	if ((error = namei(&nd)) != 0)
3214		return (error);
3215	NDFREE(&nd, NDF_ONLY_PNBUF);
3216	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3217	vrele(nd.ni_vp);
3218	return (error);
3219}
3220
3221/*
3222 * Set the access and modification times of a file.
3223 */
3224#ifndef _SYS_SYSPROTO_H_
3225struct lutimes_args {
3226	char	*path;
3227	struct	timeval *tptr;
3228};
3229#endif
3230int
3231sys_lutimes(td, uap)
3232	struct thread *td;
3233	register struct lutimes_args /* {
3234		char *path;
3235		struct timeval *tptr;
3236	} */ *uap;
3237{
3238
3239	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3240	    UIO_USERSPACE));
3241}
3242
3243int
3244kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3245    struct timeval *tptr, enum uio_seg tptrseg)
3246{
3247	struct timespec ts[2];
3248	struct nameidata nd;
3249	int error;
3250
3251	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3252		return (error);
3253	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3254	if ((error = namei(&nd)) != 0)
3255		return (error);
3256	NDFREE(&nd, NDF_ONLY_PNBUF);
3257	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3258	vrele(nd.ni_vp);
3259	return (error);
3260}
3261
3262/*
3263 * Set the access and modification times of a file.
3264 */
3265#ifndef _SYS_SYSPROTO_H_
3266struct futimes_args {
3267	int	fd;
3268	struct	timeval *tptr;
3269};
3270#endif
3271int
3272sys_futimes(td, uap)
3273	struct thread *td;
3274	register struct futimes_args /* {
3275		int  fd;
3276		struct timeval *tptr;
3277	} */ *uap;
3278{
3279
3280	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3281}
3282
3283int
3284kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3285    enum uio_seg tptrseg)
3286{
3287	struct timespec ts[2];
3288	struct file *fp;
3289	cap_rights_t rights;
3290	int error;
3291
3292	AUDIT_ARG_FD(fd);
3293	error = getutimes(tptr, tptrseg, ts);
3294	if (error != 0)
3295		return (error);
3296	error = getvnode(td->td_proc->p_fd, fd,
3297	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
3298	if (error != 0)
3299		return (error);
3300#ifdef AUDIT
3301	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3302	AUDIT_ARG_VNODE1(fp->f_vnode);
3303	VOP_UNLOCK(fp->f_vnode, 0);
3304#endif
3305	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3306	fdrop(fp, td);
3307	return (error);
3308}
3309
3310/*
3311 * Truncate a file given its path name.
3312 */
3313#ifndef _SYS_SYSPROTO_H_
3314struct truncate_args {
3315	char	*path;
3316	int	pad;
3317	off_t	length;
3318};
3319#endif
3320int
3321sys_truncate(td, uap)
3322	struct thread *td;
3323	register struct truncate_args /* {
3324		char *path;
3325		int pad;
3326		off_t length;
3327	} */ *uap;
3328{
3329
3330	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3331}
3332
3333int
3334kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3335{
3336	struct mount *mp;
3337	struct vnode *vp;
3338	void *rl_cookie;
3339	struct vattr vattr;
3340	struct nameidata nd;
3341	int error;
3342
3343	if (length < 0)
3344		return(EINVAL);
3345	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3346	if ((error = namei(&nd)) != 0)
3347		return (error);
3348	vp = nd.ni_vp;
3349	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3350	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3351		vn_rangelock_unlock(vp, rl_cookie);
3352		vrele(vp);
3353		return (error);
3354	}
3355	NDFREE(&nd, NDF_ONLY_PNBUF);
3356	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3357	if (vp->v_type == VDIR)
3358		error = EISDIR;
3359#ifdef MAC
3360	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3361	}
3362#endif
3363	else if ((error = vn_writechk(vp)) == 0 &&
3364	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3365		VATTR_NULL(&vattr);
3366		vattr.va_size = length;
3367		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3368	}
3369	VOP_UNLOCK(vp, 0);
3370	vn_finished_write(mp);
3371	vn_rangelock_unlock(vp, rl_cookie);
3372	vrele(vp);
3373	return (error);
3374}
3375
3376#if defined(COMPAT_43)
3377/*
3378 * Truncate a file given its path name.
3379 */
3380#ifndef _SYS_SYSPROTO_H_
3381struct otruncate_args {
3382	char	*path;
3383	long	length;
3384};
3385#endif
3386int
3387otruncate(td, uap)
3388	struct thread *td;
3389	register struct otruncate_args /* {
3390		char *path;
3391		long length;
3392	} */ *uap;
3393{
3394	struct truncate_args /* {
3395		char *path;
3396		int pad;
3397		off_t length;
3398	} */ nuap;
3399
3400	nuap.path = uap->path;
3401	nuap.length = uap->length;
3402	return (sys_truncate(td, &nuap));
3403}
3404#endif /* COMPAT_43 */
3405
3406/* Versions with the pad argument */
3407int
3408freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3409{
3410	struct truncate_args ouap;
3411
3412	ouap.path = uap->path;
3413	ouap.length = uap->length;
3414	return (sys_truncate(td, &ouap));
3415}
3416
3417int
3418freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3419{
3420	struct ftruncate_args ouap;
3421
3422	ouap.fd = uap->fd;
3423	ouap.length = uap->length;
3424	return (sys_ftruncate(td, &ouap));
3425}
3426
3427/*
3428 * Sync an open file.
3429 */
3430#ifndef _SYS_SYSPROTO_H_
3431struct fsync_args {
3432	int	fd;
3433};
3434#endif
3435int
3436sys_fsync(td, uap)
3437	struct thread *td;
3438	struct fsync_args /* {
3439		int fd;
3440	} */ *uap;
3441{
3442	struct vnode *vp;
3443	struct mount *mp;
3444	struct file *fp;
3445	cap_rights_t rights;
3446	int error, lock_flags;
3447
3448	AUDIT_ARG_FD(uap->fd);
3449	error = getvnode(td->td_proc->p_fd, uap->fd,
3450	    cap_rights_init(&rights, CAP_FSYNC), &fp);
3451	if (error != 0)
3452		return (error);
3453	vp = fp->f_vnode;
3454	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3455	if (error != 0)
3456		goto drop;
3457	if (MNT_SHARED_WRITES(mp) ||
3458	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3459		lock_flags = LK_SHARED;
3460	} else {
3461		lock_flags = LK_EXCLUSIVE;
3462	}
3463	vn_lock(vp, lock_flags | LK_RETRY);
3464	AUDIT_ARG_VNODE1(vp);
3465	if (vp->v_object != NULL) {
3466		VM_OBJECT_WLOCK(vp->v_object);
3467		vm_object_page_clean(vp->v_object, 0, 0, 0);
3468		VM_OBJECT_WUNLOCK(vp->v_object);
3469	}
3470	error = VOP_FSYNC(vp, MNT_WAIT, td);
3471
3472	VOP_UNLOCK(vp, 0);
3473	vn_finished_write(mp);
3474drop:
3475	fdrop(fp, td);
3476	return (error);
3477}
3478
3479/*
3480 * Rename files.  Source and destination must either both be directories, or
3481 * both not be directories.  If target is a directory, it must be empty.
3482 */
3483#ifndef _SYS_SYSPROTO_H_
3484struct rename_args {
3485	char	*from;
3486	char	*to;
3487};
3488#endif
3489int
3490sys_rename(td, uap)
3491	struct thread *td;
3492	register struct rename_args /* {
3493		char *from;
3494		char *to;
3495	} */ *uap;
3496{
3497
3498	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3499}
3500
3501#ifndef _SYS_SYSPROTO_H_
3502struct renameat_args {
3503	int	oldfd;
3504	char	*old;
3505	int	newfd;
3506	char	*new;
3507};
3508#endif
3509int
3510sys_renameat(struct thread *td, struct renameat_args *uap)
3511{
3512
3513	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3514	    UIO_USERSPACE));
3515}
3516
3517int
3518kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3519{
3520
3521	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3522}
3523
3524int
3525kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3526    enum uio_seg pathseg)
3527{
3528	struct mount *mp = NULL;
3529	struct vnode *tvp, *fvp, *tdvp;
3530	struct nameidata fromnd, tond;
3531	cap_rights_t rights;
3532	int error;
3533
3534again:
3535	bwillwrite();
3536#ifdef MAC
3537	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3538	    AUDITVNODE1, pathseg, old, oldfd,
3539	    cap_rights_init(&rights, CAP_RENAMEAT), td);
3540#else
3541	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3542	    pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
3543#endif
3544
3545	if ((error = namei(&fromnd)) != 0)
3546		return (error);
3547#ifdef MAC
3548	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3549	    fromnd.ni_vp, &fromnd.ni_cnd);
3550	VOP_UNLOCK(fromnd.ni_dvp, 0);
3551	if (fromnd.ni_dvp != fromnd.ni_vp)
3552		VOP_UNLOCK(fromnd.ni_vp, 0);
3553#endif
3554	fvp = fromnd.ni_vp;
3555	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3556	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3557	    cap_rights_init(&rights, CAP_LINKAT), td);
3558	if (fromnd.ni_vp->v_type == VDIR)
3559		tond.ni_cnd.cn_flags |= WILLBEDIR;
3560	if ((error = namei(&tond)) != 0) {
3561		/* Translate error code for rename("dir1", "dir2/."). */
3562		if (error == EISDIR && fvp->v_type == VDIR)
3563			error = EINVAL;
3564		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3565		vrele(fromnd.ni_dvp);
3566		vrele(fvp);
3567		goto out1;
3568	}
3569	tdvp = tond.ni_dvp;
3570	tvp = tond.ni_vp;
3571	error = vn_start_write(fvp, &mp, V_NOWAIT);
3572	if (error != 0) {
3573		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3574		NDFREE(&tond, NDF_ONLY_PNBUF);
3575		if (tvp != NULL)
3576			vput(tvp);
3577		if (tdvp == tvp)
3578			vrele(tdvp);
3579		else
3580			vput(tdvp);
3581		vrele(fromnd.ni_dvp);
3582		vrele(fvp);
3583		vrele(tond.ni_startdir);
3584		if (fromnd.ni_startdir != NULL)
3585			vrele(fromnd.ni_startdir);
3586		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
3587		if (error != 0)
3588			return (error);
3589		goto again;
3590	}
3591	if (tvp != NULL) {
3592		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3593			error = ENOTDIR;
3594			goto out;
3595		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3596			error = EISDIR;
3597			goto out;
3598		}
3599#ifdef CAPABILITIES
3600		if (newfd != AT_FDCWD) {
3601			/*
3602			 * If the target already exists we require CAP_UNLINKAT
3603			 * from 'newfd'.
3604			 */
3605			error = cap_check(&tond.ni_filecaps.fc_rights,
3606			    cap_rights_init(&rights, CAP_UNLINKAT));
3607			if (error != 0)
3608				goto out;
3609		}
3610#endif
3611	}
3612	if (fvp == tdvp) {
3613		error = EINVAL;
3614		goto out;
3615	}
3616	/*
3617	 * If the source is the same as the destination (that is, if they
3618	 * are links to the same vnode), then there is nothing to do.
3619	 */
3620	if (fvp == tvp)
3621		error = -1;
3622#ifdef MAC
3623	else
3624		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3625		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3626#endif
3627out:
3628	if (error == 0) {
3629		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3630		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3631		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3632		NDFREE(&tond, NDF_ONLY_PNBUF);
3633	} else {
3634		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3635		NDFREE(&tond, NDF_ONLY_PNBUF);
3636		if (tvp != NULL)
3637			vput(tvp);
3638		if (tdvp == tvp)
3639			vrele(tdvp);
3640		else
3641			vput(tdvp);
3642		vrele(fromnd.ni_dvp);
3643		vrele(fvp);
3644	}
3645	vrele(tond.ni_startdir);
3646	vn_finished_write(mp);
3647out1:
3648	if (fromnd.ni_startdir)
3649		vrele(fromnd.ni_startdir);
3650	if (error == -1)
3651		return (0);
3652	return (error);
3653}
3654
3655/*
3656 * Make a directory file.
3657 */
3658#ifndef _SYS_SYSPROTO_H_
3659struct mkdir_args {
3660	char	*path;
3661	int	mode;
3662};
3663#endif
3664int
3665sys_mkdir(td, uap)
3666	struct thread *td;
3667	register struct mkdir_args /* {
3668		char *path;
3669		int mode;
3670	} */ *uap;
3671{
3672
3673	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3674}
3675
3676#ifndef _SYS_SYSPROTO_H_
3677struct mkdirat_args {
3678	int	fd;
3679	char	*path;
3680	mode_t	mode;
3681};
3682#endif
3683int
3684sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3685{
3686
3687	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3688}
3689
3690int
3691kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3692{
3693
3694	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3695}
3696
3697int
3698kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3699    int mode)
3700{
3701	struct mount *mp;
3702	struct vnode *vp;
3703	struct vattr vattr;
3704	struct nameidata nd;
3705	cap_rights_t rights;
3706	int error;
3707
3708	AUDIT_ARG_MODE(mode);
3709restart:
3710	bwillwrite();
3711	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
3712	    NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
3713	    td);
3714	nd.ni_cnd.cn_flags |= WILLBEDIR;
3715	if ((error = namei(&nd)) != 0)
3716		return (error);
3717	vp = nd.ni_vp;
3718	if (vp != NULL) {
3719		NDFREE(&nd, NDF_ONLY_PNBUF);
3720		/*
3721		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
3722		 * the strange behaviour of leaving the vnode unlocked
3723		 * if the target is the same vnode as the parent.
3724		 */
3725		if (vp == nd.ni_dvp)
3726			vrele(nd.ni_dvp);
3727		else
3728			vput(nd.ni_dvp);
3729		vrele(vp);
3730		return (EEXIST);
3731	}
3732	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3733		NDFREE(&nd, NDF_ONLY_PNBUF);
3734		vput(nd.ni_dvp);
3735		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3736			return (error);
3737		goto restart;
3738	}
3739	VATTR_NULL(&vattr);
3740	vattr.va_type = VDIR;
3741	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3742#ifdef MAC
3743	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3744	    &vattr);
3745	if (error != 0)
3746		goto out;
3747#endif
3748	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3749#ifdef MAC
3750out:
3751#endif
3752	NDFREE(&nd, NDF_ONLY_PNBUF);
3753	vput(nd.ni_dvp);
3754	if (error == 0)
3755		vput(nd.ni_vp);
3756	vn_finished_write(mp);
3757	return (error);
3758}
3759
3760/*
3761 * Remove a directory file.
3762 */
3763#ifndef _SYS_SYSPROTO_H_
3764struct rmdir_args {
3765	char	*path;
3766};
3767#endif
3768int
3769sys_rmdir(td, uap)
3770	struct thread *td;
3771	struct rmdir_args /* {
3772		char *path;
3773	} */ *uap;
3774{
3775
3776	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3777}
3778
3779int
3780kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3781{
3782
3783	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3784}
3785
3786int
3787kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3788{
3789	struct mount *mp;
3790	struct vnode *vp;
3791	struct nameidata nd;
3792	cap_rights_t rights;
3793	int error;
3794
3795restart:
3796	bwillwrite();
3797	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3798	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
3799	if ((error = namei(&nd)) != 0)
3800		return (error);
3801	vp = nd.ni_vp;
3802	if (vp->v_type != VDIR) {
3803		error = ENOTDIR;
3804		goto out;
3805	}
3806	/*
3807	 * No rmdir "." please.
3808	 */
3809	if (nd.ni_dvp == vp) {
3810		error = EINVAL;
3811		goto out;
3812	}
3813	/*
3814	 * The root of a mounted filesystem cannot be deleted.
3815	 */
3816	if (vp->v_vflag & VV_ROOT) {
3817		error = EBUSY;
3818		goto out;
3819	}
3820#ifdef MAC
3821	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3822	    &nd.ni_cnd);
3823	if (error != 0)
3824		goto out;
3825#endif
3826	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3827		NDFREE(&nd, NDF_ONLY_PNBUF);
3828		vput(vp);
3829		if (nd.ni_dvp == vp)
3830			vrele(nd.ni_dvp);
3831		else
3832			vput(nd.ni_dvp);
3833		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3834			return (error);
3835		goto restart;
3836	}
3837	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
3838	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3839	vn_finished_write(mp);
3840out:
3841	NDFREE(&nd, NDF_ONLY_PNBUF);
3842	vput(vp);
3843	if (nd.ni_dvp == vp)
3844		vrele(nd.ni_dvp);
3845	else
3846		vput(nd.ni_dvp);
3847	return (error);
3848}
3849
3850#ifdef COMPAT_43
3851/*
3852 * Read a block of directory entries in a filesystem independent format.
3853 */
3854#ifndef _SYS_SYSPROTO_H_
3855struct ogetdirentries_args {
3856	int	fd;
3857	char	*buf;
3858	u_int	count;
3859	long	*basep;
3860};
3861#endif
3862int
3863ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3864{
3865	long loff;
3866	int error;
3867
3868	error = kern_ogetdirentries(td, uap, &loff);
3869	if (error == 0)
3870		error = copyout(&loff, uap->basep, sizeof(long));
3871	return (error);
3872}
3873
3874int
3875kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3876    long *ploff)
3877{
3878	struct vnode *vp;
3879	struct file *fp;
3880	struct uio auio, kuio;
3881	struct iovec aiov, kiov;
3882	struct dirent *dp, *edp;
3883	cap_rights_t rights;
3884	caddr_t dirbuf;
3885	int error, eofflag, readcnt;
3886	long loff;
3887	off_t foffset;
3888
3889	/* XXX arbitrary sanity limit on `count'. */
3890	if (uap->count > 64 * 1024)
3891		return (EINVAL);
3892	error = getvnode(td->td_proc->p_fd, uap->fd,
3893	    cap_rights_init(&rights, CAP_READ), &fp);
3894	if (error != 0)
3895		return (error);
3896	if ((fp->f_flag & FREAD) == 0) {
3897		fdrop(fp, td);
3898		return (EBADF);
3899	}
3900	vp = fp->f_vnode;
3901	foffset = foffset_lock(fp, 0);
3902unionread:
3903	if (vp->v_type != VDIR) {
3904		foffset_unlock(fp, foffset, 0);
3905		fdrop(fp, td);
3906		return (EINVAL);
3907	}
3908	aiov.iov_base = uap->buf;
3909	aiov.iov_len = uap->count;
3910	auio.uio_iov = &aiov;
3911	auio.uio_iovcnt = 1;
3912	auio.uio_rw = UIO_READ;
3913	auio.uio_segflg = UIO_USERSPACE;
3914	auio.uio_td = td;
3915	auio.uio_resid = uap->count;
3916	vn_lock(vp, LK_SHARED | LK_RETRY);
3917	loff = auio.uio_offset = foffset;
3918#ifdef MAC
3919	error = mac_vnode_check_readdir(td->td_ucred, vp);
3920	if (error != 0) {
3921		VOP_UNLOCK(vp, 0);
3922		foffset_unlock(fp, foffset, FOF_NOUPDATE);
3923		fdrop(fp, td);
3924		return (error);
3925	}
3926#endif
3927#	if (BYTE_ORDER != LITTLE_ENDIAN)
3928		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
3929			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
3930			    NULL, NULL);
3931			foffset = auio.uio_offset;
3932		} else
3933#	endif
3934	{
3935		kuio = auio;
3936		kuio.uio_iov = &kiov;
3937		kuio.uio_segflg = UIO_SYSSPACE;
3938		kiov.iov_len = uap->count;
3939		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
3940		kiov.iov_base = dirbuf;
3941		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
3942			    NULL, NULL);
3943		foffset = kuio.uio_offset;
3944		if (error == 0) {
3945			readcnt = uap->count - kuio.uio_resid;
3946			edp = (struct dirent *)&dirbuf[readcnt];
3947			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
3948#				if (BYTE_ORDER == LITTLE_ENDIAN)
3949					/*
3950					 * The expected low byte of
3951					 * dp->d_namlen is our dp->d_type.
3952					 * The high MBZ byte of dp->d_namlen
3953					 * is our dp->d_namlen.
3954					 */
3955					dp->d_type = dp->d_namlen;
3956					dp->d_namlen = 0;
3957#				else
3958					/*
3959					 * The dp->d_type is the high byte
3960					 * of the expected dp->d_namlen,
3961					 * so must be zero'ed.
3962					 */
3963					dp->d_type = 0;
3964#				endif
3965				if (dp->d_reclen > 0) {
3966					dp = (struct dirent *)
3967					    ((char *)dp + dp->d_reclen);
3968				} else {
3969					error = EIO;
3970					break;
3971				}
3972			}
3973			if (dp >= edp)
3974				error = uiomove(dirbuf, readcnt, &auio);
3975		}
3976		free(dirbuf, M_TEMP);
3977	}
3978	if (error != 0) {
3979		VOP_UNLOCK(vp, 0);
3980		foffset_unlock(fp, foffset, 0);
3981		fdrop(fp, td);
3982		return (error);
3983	}
3984	if (uap->count == auio.uio_resid &&
3985	    (vp->v_vflag & VV_ROOT) &&
3986	    (vp->v_mount->mnt_flag & MNT_UNION)) {
3987		struct vnode *tvp = vp;
3988		vp = vp->v_mount->mnt_vnodecovered;
3989		VREF(vp);
3990		fp->f_vnode = vp;
3991		fp->f_data = vp;
3992		foffset = 0;
3993		vput(tvp);
3994		goto unionread;
3995	}
3996	VOP_UNLOCK(vp, 0);
3997	foffset_unlock(fp, foffset, 0);
3998	fdrop(fp, td);
3999	td->td_retval[0] = uap->count - auio.uio_resid;
4000	if (error == 0)
4001		*ploff = loff;
4002	return (error);
4003}
4004#endif /* COMPAT_43 */
4005
4006/*
4007 * Read a block of directory entries in a filesystem independent format.
4008 */
4009#ifndef _SYS_SYSPROTO_H_
4010struct getdirentries_args {
4011	int	fd;
4012	char	*buf;
4013	u_int	count;
4014	long	*basep;
4015};
4016#endif
4017int
4018sys_getdirentries(td, uap)
4019	struct thread *td;
4020	register struct getdirentries_args /* {
4021		int fd;
4022		char *buf;
4023		u_int count;
4024		long *basep;
4025	} */ *uap;
4026{
4027	long base;
4028	int error;
4029
4030	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4031	    NULL, UIO_USERSPACE);
4032	if (error != 0)
4033		return (error);
4034	if (uap->basep != NULL)
4035		error = copyout(&base, uap->basep, sizeof(long));
4036	return (error);
4037}
4038
4039int
4040kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4041    long *basep, ssize_t *residp, enum uio_seg bufseg)
4042{
4043	struct vnode *vp;
4044	struct file *fp;
4045	struct uio auio;
4046	struct iovec aiov;
4047	cap_rights_t rights;
4048	long loff;
4049	int error, eofflag;
4050	off_t foffset;
4051
4052	AUDIT_ARG_FD(fd);
4053	if (count > IOSIZE_MAX)
4054		return (EINVAL);
4055	auio.uio_resid = count;
4056	error = getvnode(td->td_proc->p_fd, fd,
4057	    cap_rights_init(&rights, CAP_READ), &fp);
4058	if (error != 0)
4059		return (error);
4060	if ((fp->f_flag & FREAD) == 0) {
4061		fdrop(fp, td);
4062		return (EBADF);
4063	}
4064	vp = fp->f_vnode;
4065	foffset = foffset_lock(fp, 0);
4066unionread:
4067	if (vp->v_type != VDIR) {
4068		error = EINVAL;
4069		goto fail;
4070	}
4071	aiov.iov_base = buf;
4072	aiov.iov_len = count;
4073	auio.uio_iov = &aiov;
4074	auio.uio_iovcnt = 1;
4075	auio.uio_rw = UIO_READ;
4076	auio.uio_segflg = bufseg;
4077	auio.uio_td = td;
4078	vn_lock(vp, LK_SHARED | LK_RETRY);
4079	AUDIT_ARG_VNODE1(vp);
4080	loff = auio.uio_offset = foffset;
4081#ifdef MAC
4082	error = mac_vnode_check_readdir(td->td_ucred, vp);
4083	if (error == 0)
4084#endif
4085		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4086		    NULL);
4087	foffset = auio.uio_offset;
4088	if (error != 0) {
4089		VOP_UNLOCK(vp, 0);
4090		goto fail;
4091	}
4092	if (count == auio.uio_resid &&
4093	    (vp->v_vflag & VV_ROOT) &&
4094	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4095		struct vnode *tvp = vp;
4096
4097		vp = vp->v_mount->mnt_vnodecovered;
4098		VREF(vp);
4099		fp->f_vnode = vp;
4100		fp->f_data = vp;
4101		foffset = 0;
4102		vput(tvp);
4103		goto unionread;
4104	}
4105	VOP_UNLOCK(vp, 0);
4106	*basep = loff;
4107	if (residp != NULL)
4108		*residp = auio.uio_resid;
4109	td->td_retval[0] = count - auio.uio_resid;
4110fail:
4111	foffset_unlock(fp, foffset, 0);
4112	fdrop(fp, td);
4113	return (error);
4114}
4115
4116#ifndef _SYS_SYSPROTO_H_
4117struct getdents_args {
4118	int fd;
4119	char *buf;
4120	size_t count;
4121};
4122#endif
4123int
4124sys_getdents(td, uap)
4125	struct thread *td;
4126	register struct getdents_args /* {
4127		int fd;
4128		char *buf;
4129		u_int count;
4130	} */ *uap;
4131{
4132	struct getdirentries_args ap;
4133
4134	ap.fd = uap->fd;
4135	ap.buf = uap->buf;
4136	ap.count = uap->count;
4137	ap.basep = NULL;
4138	return (sys_getdirentries(td, &ap));
4139}
4140
4141/*
4142 * Set the mode mask for creation of filesystem nodes.
4143 */
4144#ifndef _SYS_SYSPROTO_H_
4145struct umask_args {
4146	int	newmask;
4147};
4148#endif
4149int
4150sys_umask(td, uap)
4151	struct thread *td;
4152	struct umask_args /* {
4153		int newmask;
4154	} */ *uap;
4155{
4156	register struct filedesc *fdp;
4157
4158	FILEDESC_XLOCK(td->td_proc->p_fd);
4159	fdp = td->td_proc->p_fd;
4160	td->td_retval[0] = fdp->fd_cmask;
4161	fdp->fd_cmask = uap->newmask & ALLPERMS;
4162	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4163	return (0);
4164}
4165
4166/*
4167 * Void all references to file by ripping underlying filesystem away from
4168 * vnode.
4169 */
4170#ifndef _SYS_SYSPROTO_H_
4171struct revoke_args {
4172	char	*path;
4173};
4174#endif
4175int
4176sys_revoke(td, uap)
4177	struct thread *td;
4178	register struct revoke_args /* {
4179		char *path;
4180	} */ *uap;
4181{
4182	struct vnode *vp;
4183	struct vattr vattr;
4184	struct nameidata nd;
4185	int error;
4186
4187	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4188	    uap->path, td);
4189	if ((error = namei(&nd)) != 0)
4190		return (error);
4191	vp = nd.ni_vp;
4192	NDFREE(&nd, NDF_ONLY_PNBUF);
4193	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4194		error = EINVAL;
4195		goto out;
4196	}
4197#ifdef MAC
4198	error = mac_vnode_check_revoke(td->td_ucred, vp);
4199	if (error != 0)
4200		goto out;
4201#endif
4202	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4203	if (error != 0)
4204		goto out;
4205	if (td->td_ucred->cr_uid != vattr.va_uid) {
4206		error = priv_check(td, PRIV_VFS_ADMIN);
4207		if (error != 0)
4208			goto out;
4209	}
4210	if (vcount(vp) > 1)
4211		VOP_REVOKE(vp, REVOKEALL);
4212out:
4213	vput(vp);
4214	return (error);
4215}
4216
4217/*
4218 * Convert a user file descriptor to a kernel file entry and check that, if it
4219 * is a capability, the correct rights are present. A reference on the file
4220 * entry is held upon returning.
4221 */
4222int
4223getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
4224{
4225	struct file *fp;
4226	int error;
4227
4228	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
4229	if (error != 0)
4230		return (error);
4231
4232	/*
4233	 * The file could be not of the vnode type, or it may be not
4234	 * yet fully initialized, in which case the f_vnode pointer
4235	 * may be set, but f_ops is still badfileops.  E.g.,
4236	 * devfs_open() transiently create such situation to
4237	 * facilitate csw d_fdopen().
4238	 *
4239	 * Dupfdopen() handling in kern_openat() installs the
4240	 * half-baked file into the process descriptor table, allowing
4241	 * other thread to dereference it. Guard against the race by
4242	 * checking f_ops.
4243	 */
4244	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4245		fdrop(fp, curthread);
4246		return (EINVAL);
4247	}
4248	*fpp = fp;
4249	return (0);
4250}
4251
4252
4253/*
4254 * Get an (NFS) file handle.
4255 */
4256#ifndef _SYS_SYSPROTO_H_
4257struct lgetfh_args {
4258	char	*fname;
4259	fhandle_t *fhp;
4260};
4261#endif
4262int
4263sys_lgetfh(td, uap)
4264	struct thread *td;
4265	register struct lgetfh_args *uap;
4266{
4267	struct nameidata nd;
4268	fhandle_t fh;
4269	register struct vnode *vp;
4270	int error;
4271
4272	error = priv_check(td, PRIV_VFS_GETFH);
4273	if (error != 0)
4274		return (error);
4275	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4276	    uap->fname, td);
4277	error = namei(&nd);
4278	if (error != 0)
4279		return (error);
4280	NDFREE(&nd, NDF_ONLY_PNBUF);
4281	vp = nd.ni_vp;
4282	bzero(&fh, sizeof(fh));
4283	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4284	error = VOP_VPTOFH(vp, &fh.fh_fid);
4285	vput(vp);
4286	if (error == 0)
4287		error = copyout(&fh, uap->fhp, sizeof (fh));
4288	return (error);
4289}
4290
4291#ifndef _SYS_SYSPROTO_H_
4292struct getfh_args {
4293	char	*fname;
4294	fhandle_t *fhp;
4295};
4296#endif
4297int
4298sys_getfh(td, uap)
4299	struct thread *td;
4300	register struct getfh_args *uap;
4301{
4302	struct nameidata nd;
4303	fhandle_t fh;
4304	register struct vnode *vp;
4305	int error;
4306
4307	error = priv_check(td, PRIV_VFS_GETFH);
4308	if (error != 0)
4309		return (error);
4310	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4311	    uap->fname, td);
4312	error = namei(&nd);
4313	if (error != 0)
4314		return (error);
4315	NDFREE(&nd, NDF_ONLY_PNBUF);
4316	vp = nd.ni_vp;
4317	bzero(&fh, sizeof(fh));
4318	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4319	error = VOP_VPTOFH(vp, &fh.fh_fid);
4320	vput(vp);
4321	if (error == 0)
4322		error = copyout(&fh, uap->fhp, sizeof (fh));
4323	return (error);
4324}
4325
4326/*
4327 * syscall for the rpc.lockd to use to translate a NFS file handle into an
4328 * open descriptor.
4329 *
4330 * warning: do not remove the priv_check() call or this becomes one giant
4331 * security hole.
4332 */
4333#ifndef _SYS_SYSPROTO_H_
4334struct fhopen_args {
4335	const struct fhandle *u_fhp;
4336	int flags;
4337};
4338#endif
4339int
4340sys_fhopen(td, uap)
4341	struct thread *td;
4342	struct fhopen_args /* {
4343		const struct fhandle *u_fhp;
4344		int flags;
4345	} */ *uap;
4346{
4347	struct mount *mp;
4348	struct vnode *vp;
4349	struct fhandle fhp;
4350	struct file *fp;
4351	int fmode, error;
4352	int indx;
4353
4354	error = priv_check(td, PRIV_VFS_FHOPEN);
4355	if (error != 0)
4356		return (error);
4357	indx = -1;
4358	fmode = FFLAGS(uap->flags);
4359	/* why not allow a non-read/write open for our lockd? */
4360	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4361		return (EINVAL);
4362	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4363	if (error != 0)
4364		return(error);
4365	/* find the mount point */
4366	mp = vfs_busyfs(&fhp.fh_fsid);
4367	if (mp == NULL)
4368		return (ESTALE);
4369	/* now give me my vnode, it gets returned to me locked */
4370	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4371	vfs_unbusy(mp);
4372	if (error != 0)
4373		return (error);
4374
4375	error = falloc_noinstall(td, &fp);
4376	if (error != 0) {
4377		vput(vp);
4378		return (error);
4379	}
4380	/*
4381	 * An extra reference on `fp' has been held for us by
4382	 * falloc_noinstall().
4383	 */
4384
4385#ifdef INVARIANTS
4386	td->td_dupfd = -1;
4387#endif
4388	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4389	if (error != 0) {
4390		KASSERT(fp->f_ops == &badfileops,
4391		    ("VOP_OPEN in fhopen() set f_ops"));
4392		KASSERT(td->td_dupfd < 0,
4393		    ("fhopen() encountered fdopen()"));
4394
4395		vput(vp);
4396		goto bad;
4397	}
4398#ifdef INVARIANTS
4399	td->td_dupfd = 0;
4400#endif
4401	fp->f_vnode = vp;
4402	fp->f_seqcount = 1;
4403	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4404	    &vnops);
4405	VOP_UNLOCK(vp, 0);
4406	if ((fmode & O_TRUNC) != 0) {
4407		error = fo_truncate(fp, 0, td->td_ucred, td);
4408		if (error != 0)
4409			goto bad;
4410	}
4411
4412	error = finstall(td, fp, &indx, fmode, NULL);
4413bad:
4414	fdrop(fp, td);
4415	td->td_retval[0] = indx;
4416	return (error);
4417}
4418
4419/*
4420 * Stat an (NFS) file handle.
4421 */
4422#ifndef _SYS_SYSPROTO_H_
4423struct fhstat_args {
4424	struct fhandle *u_fhp;
4425	struct stat *sb;
4426};
4427#endif
4428int
4429sys_fhstat(td, uap)
4430	struct thread *td;
4431	register struct fhstat_args /* {
4432		struct fhandle *u_fhp;
4433		struct stat *sb;
4434	} */ *uap;
4435{
4436	struct stat sb;
4437	struct fhandle fh;
4438	int error;
4439
4440	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4441	if (error != 0)
4442		return (error);
4443	error = kern_fhstat(td, fh, &sb);
4444	if (error == 0)
4445		error = copyout(&sb, uap->sb, sizeof(sb));
4446	return (error);
4447}
4448
4449int
4450kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4451{
4452	struct mount *mp;
4453	struct vnode *vp;
4454	int error;
4455
4456	error = priv_check(td, PRIV_VFS_FHSTAT);
4457	if (error != 0)
4458		return (error);
4459	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4460		return (ESTALE);
4461	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4462	vfs_unbusy(mp);
4463	if (error != 0)
4464		return (error);
4465	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4466	vput(vp);
4467	return (error);
4468}
4469
4470/*
4471 * Implement fstatfs() for (NFS) file handles.
4472 */
4473#ifndef _SYS_SYSPROTO_H_
4474struct fhstatfs_args {
4475	struct fhandle *u_fhp;
4476	struct statfs *buf;
4477};
4478#endif
4479int
4480sys_fhstatfs(td, uap)
4481	struct thread *td;
4482	struct fhstatfs_args /* {
4483		struct fhandle *u_fhp;
4484		struct statfs *buf;
4485	} */ *uap;
4486{
4487	struct statfs sf;
4488	fhandle_t fh;
4489	int error;
4490
4491	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4492	if (error != 0)
4493		return (error);
4494	error = kern_fhstatfs(td, fh, &sf);
4495	if (error != 0)
4496		return (error);
4497	return (copyout(&sf, uap->buf, sizeof(sf)));
4498}
4499
4500int
4501kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4502{
4503	struct statfs *sp;
4504	struct mount *mp;
4505	struct vnode *vp;
4506	int error;
4507
4508	error = priv_check(td, PRIV_VFS_FHSTATFS);
4509	if (error != 0)
4510		return (error);
4511	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4512		return (ESTALE);
4513	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4514	if (error != 0) {
4515		vfs_unbusy(mp);
4516		return (error);
4517	}
4518	vput(vp);
4519	error = prison_canseemount(td->td_ucred, mp);
4520	if (error != 0)
4521		goto out;
4522#ifdef MAC
4523	error = mac_mount_check_stat(td->td_ucred, mp);
4524	if (error != 0)
4525		goto out;
4526#endif
4527	/*
4528	 * Set these in case the underlying filesystem fails to do so.
4529	 */
4530	sp = &mp->mnt_stat;
4531	sp->f_version = STATFS_VERSION;
4532	sp->f_namemax = NAME_MAX;
4533	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4534	error = VFS_STATFS(mp, sp);
4535	if (error == 0)
4536		*buf = *sp;
4537out:
4538	vfs_unbusy(mp);
4539	return (error);
4540}
4541
4542int
4543kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4544{
4545	struct file *fp;
4546	struct mount *mp;
4547	struct vnode *vp;
4548	cap_rights_t rights;
4549	off_t olen, ooffset;
4550	int error;
4551
4552	if (offset < 0 || len <= 0)
4553		return (EINVAL);
4554	/* Check for wrap. */
4555	if (offset > OFF_MAX - len)
4556		return (EFBIG);
4557	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4558	if (error != 0)
4559		return (error);
4560	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4561		error = ESPIPE;
4562		goto out;
4563	}
4564	if ((fp->f_flag & FWRITE) == 0) {
4565		error = EBADF;
4566		goto out;
4567	}
4568	if (fp->f_type != DTYPE_VNODE) {
4569		error = ENODEV;
4570		goto out;
4571	}
4572	vp = fp->f_vnode;
4573	if (vp->v_type != VREG) {
4574		error = ENODEV;
4575		goto out;
4576	}
4577
4578	/* Allocating blocks may take a long time, so iterate. */
4579	for (;;) {
4580		olen = len;
4581		ooffset = offset;
4582
4583		bwillwrite();
4584		mp = NULL;
4585		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4586		if (error != 0)
4587			break;
4588		error = vn_lock(vp, LK_EXCLUSIVE);
4589		if (error != 0) {
4590			vn_finished_write(mp);
4591			break;
4592		}
4593#ifdef MAC
4594		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4595		if (error == 0)
4596#endif
4597			error = VOP_ALLOCATE(vp, &offset, &len);
4598		VOP_UNLOCK(vp, 0);
4599		vn_finished_write(mp);
4600
4601		if (olen + ooffset != offset + len) {
4602			panic("offset + len changed from %jx/%jx to %jx/%jx",
4603			    ooffset, olen, offset, len);
4604		}
4605		if (error != 0 || len == 0)
4606			break;
4607		KASSERT(olen > len, ("Iteration did not make progress?"));
4608		maybe_yield();
4609	}
4610 out:
4611	fdrop(fp, td);
4612	return (error);
4613}
4614
4615int
4616sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4617{
4618
4619	td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
4620	    uap->len);
4621	return (0);
4622}
4623
4624/*
4625 * Unlike madvise(2), we do not make a best effort to remember every
4626 * possible caching hint.  Instead, we remember the last setting with
4627 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4628 * region of any current setting.
4629 */
4630int
4631kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4632    int advice)
4633{
4634	struct fadvise_info *fa, *new;
4635	struct file *fp;
4636	struct vnode *vp;
4637	cap_rights_t rights;
4638	off_t end;
4639	int error;
4640
4641	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4642		return (EINVAL);
4643	switch (advice) {
4644	case POSIX_FADV_SEQUENTIAL:
4645	case POSIX_FADV_RANDOM:
4646	case POSIX_FADV_NOREUSE:
4647		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4648		break;
4649	case POSIX_FADV_NORMAL:
4650	case POSIX_FADV_WILLNEED:
4651	case POSIX_FADV_DONTNEED:
4652		new = NULL;
4653		break;
4654	default:
4655		return (EINVAL);
4656	}
4657	/* XXX: CAP_POSIX_FADVISE? */
4658	error = fget(td, fd, cap_rights_init(&rights), &fp);
4659	if (error != 0)
4660		goto out;
4661	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4662		error = ESPIPE;
4663		goto out;
4664	}
4665	if (fp->f_type != DTYPE_VNODE) {
4666		error = ENODEV;
4667		goto out;
4668	}
4669	vp = fp->f_vnode;
4670	if (vp->v_type != VREG) {
4671		error = ENODEV;
4672		goto out;
4673	}
4674	if (len == 0)
4675		end = OFF_MAX;
4676	else
4677		end = offset + len - 1;
4678	switch (advice) {
4679	case POSIX_FADV_SEQUENTIAL:
4680	case POSIX_FADV_RANDOM:
4681	case POSIX_FADV_NOREUSE:
4682		/*
4683		 * Try to merge any existing non-standard region with
4684		 * this new region if possible, otherwise create a new
4685		 * non-standard region for this request.
4686		 */
4687		mtx_pool_lock(mtxpool_sleep, fp);
4688		fa = fp->f_advice;
4689		if (fa != NULL && fa->fa_advice == advice &&
4690		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4691		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4692		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4693			if (offset < fa->fa_start)
4694				fa->fa_start = offset;
4695			if (end > fa->fa_end)
4696				fa->fa_end = end;
4697		} else {
4698			new->fa_advice = advice;
4699			new->fa_start = offset;
4700			new->fa_end = end;
4701			new->fa_prevstart = 0;
4702			new->fa_prevend = 0;
4703			fp->f_advice = new;
4704			new = fa;
4705		}
4706		mtx_pool_unlock(mtxpool_sleep, fp);
4707		break;
4708	case POSIX_FADV_NORMAL:
4709		/*
4710		 * If a the "normal" region overlaps with an existing
4711		 * non-standard region, trim or remove the
4712		 * non-standard region.
4713		 */
4714		mtx_pool_lock(mtxpool_sleep, fp);
4715		fa = fp->f_advice;
4716		if (fa != NULL) {
4717			if (offset <= fa->fa_start && end >= fa->fa_end) {
4718				new = fa;
4719				fp->f_advice = NULL;
4720			} else if (offset <= fa->fa_start &&
4721			    end >= fa->fa_start)
4722				fa->fa_start = end + 1;
4723			else if (offset <= fa->fa_end && end >= fa->fa_end)
4724				fa->fa_end = offset - 1;
4725			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4726				/*
4727				 * If the "normal" region is a middle
4728				 * portion of the existing
4729				 * non-standard region, just remove
4730				 * the whole thing rather than picking
4731				 * one side or the other to
4732				 * preserve.
4733				 */
4734				new = fa;
4735				fp->f_advice = NULL;
4736			}
4737		}
4738		mtx_pool_unlock(mtxpool_sleep, fp);
4739		break;
4740	case POSIX_FADV_WILLNEED:
4741	case POSIX_FADV_DONTNEED:
4742		error = VOP_ADVISE(vp, offset, end, advice);
4743		break;
4744	}
4745out:
4746	if (fp != NULL)
4747		fdrop(fp, td);
4748	free(new, M_FADVISE);
4749	return (error);
4750}
4751
4752int
4753sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4754{
4755
4756	td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
4757	    uap->len, uap->advice);
4758	return (0);
4759}
4760