vfs_syscalls.c revision 301051
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: releng/10.2/sys/kern/vfs_syscalls.c 301051 2016-05-31 16:55:45Z glebius $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_kdtrace.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/capsicum.h>
50#include <sys/disk.h>
51#include <sys/sysent.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/mutex.h>
55#include <sys/sysproto.h>
56#include <sys/namei.h>
57#include <sys/filedesc.h>
58#include <sys/kernel.h>
59#include <sys/fcntl.h>
60#include <sys/file.h>
61#include <sys/filio.h>
62#include <sys/limits.h>
63#include <sys/linker.h>
64#include <sys/rwlock.h>
65#include <sys/sdt.h>
66#include <sys/stat.h>
67#include <sys/sx.h>
68#include <sys/unistd.h>
69#include <sys/vnode.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/dirent.h>
73#include <sys/jail.h>
74#include <sys/syscallsubr.h>
75#include <sys/sysctl.h>
76#ifdef KTRACE
77#include <sys/ktrace.h>
78#endif
79
80#include <machine/stdarg.h>
81
82#include <security/audit/audit.h>
83#include <security/mac/mac_framework.h>
84
85#include <vm/vm.h>
86#include <vm/vm_object.h>
87#include <vm/vm_page.h>
88#include <vm/uma.h>
89
90#include <ufs/ufs/quota.h>
91
92MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
93
94SDT_PROVIDER_DEFINE(vfs);
95SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
96SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
97
98static int chroot_refuse_vdir_fds(struct filedesc *fdp);
99static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
100static int kern_chflags(struct thread *td, const char *path,
101    enum uio_seg pathseg, u_long flags);
102static int kern_chflagsat(struct thread *td, int fd, const char *path,
103    enum uio_seg pathseg, u_long flags, int atflag);
104static int setfflags(struct thread *td, struct vnode *, u_long);
105static int setutimes(struct thread *td, struct vnode *,
106    const struct timespec *, int, int);
107static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
108    struct thread *td);
109
110/*
111 * The module initialization routine for POSIX asynchronous I/O will
112 * set this to the version of AIO that it implements.  (Zero means
113 * that it is not implemented.)  This value is used here by pathconf()
114 * and in kern_descrip.c by fpathconf().
115 */
116int async_io_version;
117
118/*
119 * Sync each mounted filesystem.
120 */
121#ifndef _SYS_SYSPROTO_H_
122struct sync_args {
123	int     dummy;
124};
125#endif
126/* ARGSUSED */
127int
128sys_sync(td, uap)
129	struct thread *td;
130	struct sync_args *uap;
131{
132	struct mount *mp, *nmp;
133	int save;
134
135	mtx_lock(&mountlist_mtx);
136	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
137		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
138			nmp = TAILQ_NEXT(mp, mnt_list);
139			continue;
140		}
141		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
142		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
143			save = curthread_pflags_set(TDP_SYNCIO);
144			vfs_msync(mp, MNT_NOWAIT);
145			VFS_SYNC(mp, MNT_NOWAIT);
146			curthread_pflags_restore(save);
147			vn_finished_write(mp);
148		}
149		mtx_lock(&mountlist_mtx);
150		nmp = TAILQ_NEXT(mp, mnt_list);
151		vfs_unbusy(mp);
152	}
153	mtx_unlock(&mountlist_mtx);
154	return (0);
155}
156
157/*
158 * Change filesystem quotas.
159 */
160#ifndef _SYS_SYSPROTO_H_
161struct quotactl_args {
162	char *path;
163	int cmd;
164	int uid;
165	caddr_t arg;
166};
167#endif
168int
169sys_quotactl(td, uap)
170	struct thread *td;
171	register struct quotactl_args /* {
172		char *path;
173		int cmd;
174		int uid;
175		caddr_t arg;
176	} */ *uap;
177{
178	struct mount *mp;
179	struct nameidata nd;
180	int error;
181
182	AUDIT_ARG_CMD(uap->cmd);
183	AUDIT_ARG_UID(uap->uid);
184	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
185		return (EPERM);
186	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
187	    uap->path, td);
188	if ((error = namei(&nd)) != 0)
189		return (error);
190	NDFREE(&nd, NDF_ONLY_PNBUF);
191	mp = nd.ni_vp->v_mount;
192	vfs_ref(mp);
193	vput(nd.ni_vp);
194	error = vfs_busy(mp, 0);
195	vfs_rel(mp);
196	if (error != 0)
197		return (error);
198	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
199
200	/*
201	 * Since quota on operation typically needs to open quota
202	 * file, the Q_QUOTAON handler needs to unbusy the mount point
203	 * before calling into namei.  Otherwise, unmount might be
204	 * started between two vfs_busy() invocations (first is our,
205	 * second is from mount point cross-walk code in lookup()),
206	 * causing deadlock.
207	 *
208	 * Require that Q_QUOTAON handles the vfs_busy() reference on
209	 * its own, always returning with ubusied mount point.
210	 */
211	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
212		vfs_unbusy(mp);
213	return (error);
214}
215
216/*
217 * Used by statfs conversion routines to scale the block size up if
218 * necessary so that all of the block counts are <= 'max_size'.  Note
219 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
220 * value of 'n'.
221 */
222void
223statfs_scale_blocks(struct statfs *sf, long max_size)
224{
225	uint64_t count;
226	int shift;
227
228	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
229
230	/*
231	 * Attempt to scale the block counts to give a more accurate
232	 * overview to userland of the ratio of free space to used
233	 * space.  To do this, find the largest block count and compute
234	 * a divisor that lets it fit into a signed integer <= max_size.
235	 */
236	if (sf->f_bavail < 0)
237		count = -sf->f_bavail;
238	else
239		count = sf->f_bavail;
240	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
241	if (count <= max_size)
242		return;
243
244	count >>= flsl(max_size);
245	shift = 0;
246	while (count > 0) {
247		shift++;
248		count >>=1;
249	}
250
251	sf->f_bsize <<= shift;
252	sf->f_blocks >>= shift;
253	sf->f_bfree >>= shift;
254	sf->f_bavail >>= shift;
255}
256
257/*
258 * Get filesystem statistics.
259 */
260#ifndef _SYS_SYSPROTO_H_
261struct statfs_args {
262	char *path;
263	struct statfs *buf;
264};
265#endif
266int
267sys_statfs(td, uap)
268	struct thread *td;
269	register struct statfs_args /* {
270		char *path;
271		struct statfs *buf;
272	} */ *uap;
273{
274	struct statfs sf;
275	int error;
276
277	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
278	if (error == 0)
279		error = copyout(&sf, uap->buf, sizeof(sf));
280	return (error);
281}
282
283int
284kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
285    struct statfs *buf)
286{
287	struct mount *mp;
288	struct statfs *sp, sb;
289	struct nameidata nd;
290	int error;
291
292	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
293	    pathseg, path, td);
294	error = namei(&nd);
295	if (error != 0)
296		return (error);
297	mp = nd.ni_vp->v_mount;
298	vfs_ref(mp);
299	NDFREE(&nd, NDF_ONLY_PNBUF);
300	vput(nd.ni_vp);
301	error = vfs_busy(mp, 0);
302	vfs_rel(mp);
303	if (error != 0)
304		return (error);
305#ifdef MAC
306	error = mac_mount_check_stat(td->td_ucred, mp);
307	if (error != 0)
308		goto out;
309#endif
310	/*
311	 * Set these in case the underlying filesystem fails to do so.
312	 */
313	sp = &mp->mnt_stat;
314	sp->f_version = STATFS_VERSION;
315	sp->f_namemax = NAME_MAX;
316	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
317	error = VFS_STATFS(mp, sp);
318	if (error != 0)
319		goto out;
320	if (priv_check(td, PRIV_VFS_GENERATION)) {
321		bcopy(sp, &sb, sizeof(sb));
322		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
323		prison_enforce_statfs(td->td_ucred, mp, &sb);
324		sp = &sb;
325	}
326	*buf = *sp;
327out:
328	vfs_unbusy(mp);
329	return (error);
330}
331
332/*
333 * Get filesystem statistics.
334 */
335#ifndef _SYS_SYSPROTO_H_
336struct fstatfs_args {
337	int fd;
338	struct statfs *buf;
339};
340#endif
341int
342sys_fstatfs(td, uap)
343	struct thread *td;
344	register struct fstatfs_args /* {
345		int fd;
346		struct statfs *buf;
347	} */ *uap;
348{
349	struct statfs sf;
350	int error;
351
352	error = kern_fstatfs(td, uap->fd, &sf);
353	if (error == 0)
354		error = copyout(&sf, uap->buf, sizeof(sf));
355	return (error);
356}
357
358int
359kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
360{
361	struct file *fp;
362	struct mount *mp;
363	struct statfs *sp, sb;
364	struct vnode *vp;
365	cap_rights_t rights;
366	int error;
367
368	AUDIT_ARG_FD(fd);
369	error = getvnode(td->td_proc->p_fd, fd,
370	    cap_rights_init(&rights, CAP_FSTATFS), &fp);
371	if (error != 0)
372		return (error);
373	vp = fp->f_vnode;
374	vn_lock(vp, LK_SHARED | LK_RETRY);
375#ifdef AUDIT
376	AUDIT_ARG_VNODE1(vp);
377#endif
378	mp = vp->v_mount;
379	if (mp)
380		vfs_ref(mp);
381	VOP_UNLOCK(vp, 0);
382	fdrop(fp, td);
383	if (mp == NULL) {
384		error = EBADF;
385		goto out;
386	}
387	error = vfs_busy(mp, 0);
388	vfs_rel(mp);
389	if (error != 0)
390		return (error);
391#ifdef MAC
392	error = mac_mount_check_stat(td->td_ucred, mp);
393	if (error != 0)
394		goto out;
395#endif
396	/*
397	 * Set these in case the underlying filesystem fails to do so.
398	 */
399	sp = &mp->mnt_stat;
400	sp->f_version = STATFS_VERSION;
401	sp->f_namemax = NAME_MAX;
402	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
403	error = VFS_STATFS(mp, sp);
404	if (error != 0)
405		goto out;
406	if (priv_check(td, PRIV_VFS_GENERATION)) {
407		bcopy(sp, &sb, sizeof(sb));
408		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
409		prison_enforce_statfs(td->td_ucred, mp, &sb);
410		sp = &sb;
411	}
412	*buf = *sp;
413out:
414	if (mp)
415		vfs_unbusy(mp);
416	return (error);
417}
418
419/*
420 * Get statistics on all filesystems.
421 */
422#ifndef _SYS_SYSPROTO_H_
423struct getfsstat_args {
424	struct statfs *buf;
425	long bufsize;
426	int flags;
427};
428#endif
429int
430sys_getfsstat(td, uap)
431	struct thread *td;
432	register struct getfsstat_args /* {
433		struct statfs *buf;
434		long bufsize;
435		int flags;
436	} */ *uap;
437{
438
439	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
440	    uap->flags));
441}
442
443/*
444 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
445 *	The caller is responsible for freeing memory which will be allocated
446 *	in '*buf'.
447 */
448int
449kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
450    enum uio_seg bufseg, int flags)
451{
452	struct mount *mp, *nmp;
453	struct statfs *sfsp, *sp, sb;
454	size_t count, maxcount;
455	int error;
456
457	maxcount = bufsize / sizeof(struct statfs);
458	if (bufsize == 0)
459		sfsp = NULL;
460	else if (bufseg == UIO_USERSPACE)
461		sfsp = *buf;
462	else /* if (bufseg == UIO_SYSSPACE) */ {
463		count = 0;
464		mtx_lock(&mountlist_mtx);
465		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
466			count++;
467		}
468		mtx_unlock(&mountlist_mtx);
469		if (maxcount > count)
470			maxcount = count;
471		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
472		    M_WAITOK);
473	}
474	count = 0;
475	mtx_lock(&mountlist_mtx);
476	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
477		if (prison_canseemount(td->td_ucred, mp) != 0) {
478			nmp = TAILQ_NEXT(mp, mnt_list);
479			continue;
480		}
481#ifdef MAC
482		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
483			nmp = TAILQ_NEXT(mp, mnt_list);
484			continue;
485		}
486#endif
487		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
488			nmp = TAILQ_NEXT(mp, mnt_list);
489			continue;
490		}
491		if (sfsp && count < maxcount) {
492			sp = &mp->mnt_stat;
493			/*
494			 * Set these in case the underlying filesystem
495			 * fails to do so.
496			 */
497			sp->f_version = STATFS_VERSION;
498			sp->f_namemax = NAME_MAX;
499			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
500			/*
501			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
502			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
503			 * overrides MNT_WAIT.
504			 */
505			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
506			    (flags & MNT_WAIT)) &&
507			    (error = VFS_STATFS(mp, sp))) {
508				mtx_lock(&mountlist_mtx);
509				nmp = TAILQ_NEXT(mp, mnt_list);
510				vfs_unbusy(mp);
511				continue;
512			}
513			if (priv_check(td, PRIV_VFS_GENERATION)) {
514				bcopy(sp, &sb, sizeof(sb));
515				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
516				prison_enforce_statfs(td->td_ucred, mp, &sb);
517				sp = &sb;
518			}
519			if (bufseg == UIO_SYSSPACE)
520				bcopy(sp, sfsp, sizeof(*sp));
521			else /* if (bufseg == UIO_USERSPACE) */ {
522				error = copyout(sp, sfsp, sizeof(*sp));
523				if (error != 0) {
524					vfs_unbusy(mp);
525					return (error);
526				}
527			}
528			sfsp++;
529		}
530		count++;
531		mtx_lock(&mountlist_mtx);
532		nmp = TAILQ_NEXT(mp, mnt_list);
533		vfs_unbusy(mp);
534	}
535	mtx_unlock(&mountlist_mtx);
536	if (sfsp && count > maxcount)
537		td->td_retval[0] = maxcount;
538	else
539		td->td_retval[0] = count;
540	return (0);
541}
542
543#ifdef COMPAT_FREEBSD4
544/*
545 * Get old format filesystem statistics.
546 */
547static void cvtstatfs(struct statfs *, struct ostatfs *);
548
549#ifndef _SYS_SYSPROTO_H_
550struct freebsd4_statfs_args {
551	char *path;
552	struct ostatfs *buf;
553};
554#endif
555int
556freebsd4_statfs(td, uap)
557	struct thread *td;
558	struct freebsd4_statfs_args /* {
559		char *path;
560		struct ostatfs *buf;
561	} */ *uap;
562{
563	struct ostatfs osb;
564	struct statfs sf;
565	int error;
566
567	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
568	if (error != 0)
569		return (error);
570	cvtstatfs(&sf, &osb);
571	return (copyout(&osb, uap->buf, sizeof(osb)));
572}
573
574/*
575 * Get filesystem statistics.
576 */
577#ifndef _SYS_SYSPROTO_H_
578struct freebsd4_fstatfs_args {
579	int fd;
580	struct ostatfs *buf;
581};
582#endif
583int
584freebsd4_fstatfs(td, uap)
585	struct thread *td;
586	struct freebsd4_fstatfs_args /* {
587		int fd;
588		struct ostatfs *buf;
589	} */ *uap;
590{
591	struct ostatfs osb;
592	struct statfs sf;
593	int error;
594
595	error = kern_fstatfs(td, uap->fd, &sf);
596	if (error != 0)
597		return (error);
598	cvtstatfs(&sf, &osb);
599	return (copyout(&osb, uap->buf, sizeof(osb)));
600}
601
602/*
603 * Get statistics on all filesystems.
604 */
605#ifndef _SYS_SYSPROTO_H_
606struct freebsd4_getfsstat_args {
607	struct ostatfs *buf;
608	long bufsize;
609	int flags;
610};
611#endif
612int
613freebsd4_getfsstat(td, uap)
614	struct thread *td;
615	register struct freebsd4_getfsstat_args /* {
616		struct ostatfs *buf;
617		long bufsize;
618		int flags;
619	} */ *uap;
620{
621	struct statfs *buf, *sp;
622	struct ostatfs osb;
623	size_t count, size;
624	int error;
625
626	count = uap->bufsize / sizeof(struct ostatfs);
627	size = count * sizeof(struct statfs);
628	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
629	if (size > 0) {
630		count = td->td_retval[0];
631		sp = buf;
632		while (count > 0 && error == 0) {
633			cvtstatfs(sp, &osb);
634			error = copyout(&osb, uap->buf, sizeof(osb));
635			sp++;
636			uap->buf++;
637			count--;
638		}
639		free(buf, M_TEMP);
640	}
641	return (error);
642}
643
644/*
645 * Implement fstatfs() for (NFS) file handles.
646 */
647#ifndef _SYS_SYSPROTO_H_
648struct freebsd4_fhstatfs_args {
649	struct fhandle *u_fhp;
650	struct ostatfs *buf;
651};
652#endif
653int
654freebsd4_fhstatfs(td, uap)
655	struct thread *td;
656	struct freebsd4_fhstatfs_args /* {
657		struct fhandle *u_fhp;
658		struct ostatfs *buf;
659	} */ *uap;
660{
661	struct ostatfs osb;
662	struct statfs sf;
663	fhandle_t fh;
664	int error;
665
666	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
667	if (error != 0)
668		return (error);
669	error = kern_fhstatfs(td, fh, &sf);
670	if (error != 0)
671		return (error);
672	cvtstatfs(&sf, &osb);
673	return (copyout(&osb, uap->buf, sizeof(osb)));
674}
675
676/*
677 * Convert a new format statfs structure to an old format statfs structure.
678 */
679static void
680cvtstatfs(nsp, osp)
681	struct statfs *nsp;
682	struct ostatfs *osp;
683{
684
685	statfs_scale_blocks(nsp, LONG_MAX);
686	bzero(osp, sizeof(*osp));
687	osp->f_bsize = nsp->f_bsize;
688	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
689	osp->f_blocks = nsp->f_blocks;
690	osp->f_bfree = nsp->f_bfree;
691	osp->f_bavail = nsp->f_bavail;
692	osp->f_files = MIN(nsp->f_files, LONG_MAX);
693	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
694	osp->f_owner = nsp->f_owner;
695	osp->f_type = nsp->f_type;
696	osp->f_flags = nsp->f_flags;
697	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
698	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
699	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
700	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
701	strlcpy(osp->f_fstypename, nsp->f_fstypename,
702	    MIN(MFSNAMELEN, OMFSNAMELEN));
703	strlcpy(osp->f_mntonname, nsp->f_mntonname,
704	    MIN(MNAMELEN, OMNAMELEN));
705	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
706	    MIN(MNAMELEN, OMNAMELEN));
707	osp->f_fsid = nsp->f_fsid;
708}
709#endif /* COMPAT_FREEBSD4 */
710
711/*
712 * Change current working directory to a given file descriptor.
713 */
714#ifndef _SYS_SYSPROTO_H_
715struct fchdir_args {
716	int	fd;
717};
718#endif
719int
720sys_fchdir(td, uap)
721	struct thread *td;
722	struct fchdir_args /* {
723		int fd;
724	} */ *uap;
725{
726	register struct filedesc *fdp = td->td_proc->p_fd;
727	struct vnode *vp, *tdp, *vpold;
728	struct mount *mp;
729	struct file *fp;
730	cap_rights_t rights;
731	int error;
732
733	AUDIT_ARG_FD(uap->fd);
734	error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
735	    &fp);
736	if (error != 0)
737		return (error);
738	vp = fp->f_vnode;
739	VREF(vp);
740	fdrop(fp, td);
741	vn_lock(vp, LK_SHARED | LK_RETRY);
742	AUDIT_ARG_VNODE1(vp);
743	error = change_dir(vp, td);
744	while (!error && (mp = vp->v_mountedhere) != NULL) {
745		if (vfs_busy(mp, 0))
746			continue;
747		error = VFS_ROOT(mp, LK_SHARED, &tdp);
748		vfs_unbusy(mp);
749		if (error != 0)
750			break;
751		vput(vp);
752		vp = tdp;
753	}
754	if (error != 0) {
755		vput(vp);
756		return (error);
757	}
758	VOP_UNLOCK(vp, 0);
759	FILEDESC_XLOCK(fdp);
760	vpold = fdp->fd_cdir;
761	fdp->fd_cdir = vp;
762	FILEDESC_XUNLOCK(fdp);
763	vrele(vpold);
764	return (0);
765}
766
767/*
768 * Change current working directory (``.'').
769 */
770#ifndef _SYS_SYSPROTO_H_
771struct chdir_args {
772	char	*path;
773};
774#endif
775int
776sys_chdir(td, uap)
777	struct thread *td;
778	struct chdir_args /* {
779		char *path;
780	} */ *uap;
781{
782
783	return (kern_chdir(td, uap->path, UIO_USERSPACE));
784}
785
786int
787kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
788{
789	register struct filedesc *fdp = td->td_proc->p_fd;
790	struct nameidata nd;
791	struct vnode *vp;
792	int error;
793
794	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
795	    pathseg, path, td);
796	if ((error = namei(&nd)) != 0)
797		return (error);
798	if ((error = change_dir(nd.ni_vp, td)) != 0) {
799		vput(nd.ni_vp);
800		NDFREE(&nd, NDF_ONLY_PNBUF);
801		return (error);
802	}
803	VOP_UNLOCK(nd.ni_vp, 0);
804	NDFREE(&nd, NDF_ONLY_PNBUF);
805	FILEDESC_XLOCK(fdp);
806	vp = fdp->fd_cdir;
807	fdp->fd_cdir = nd.ni_vp;
808	FILEDESC_XUNLOCK(fdp);
809	vrele(vp);
810	return (0);
811}
812
813/*
814 * Helper function for raised chroot(2) security function:  Refuse if
815 * any filedescriptors are open directories.
816 */
817static int
818chroot_refuse_vdir_fds(fdp)
819	struct filedesc *fdp;
820{
821	struct vnode *vp;
822	struct file *fp;
823	int fd;
824
825	FILEDESC_LOCK_ASSERT(fdp);
826
827	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
828		fp = fget_locked(fdp, fd);
829		if (fp == NULL)
830			continue;
831		if (fp->f_type == DTYPE_VNODE) {
832			vp = fp->f_vnode;
833			if (vp->v_type == VDIR)
834				return (EPERM);
835		}
836	}
837	return (0);
838}
839
840/*
841 * This sysctl determines if we will allow a process to chroot(2) if it
842 * has a directory open:
843 *	0: disallowed for all processes.
844 *	1: allowed for processes that were not already chroot(2)'ed.
845 *	2: allowed for all processes.
846 */
847
848static int chroot_allow_open_directories = 1;
849
850SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
851     &chroot_allow_open_directories, 0,
852     "Allow a process to chroot(2) if it has a directory open");
853
854/*
855 * Change notion of root (``/'') directory.
856 */
857#ifndef _SYS_SYSPROTO_H_
858struct chroot_args {
859	char	*path;
860};
861#endif
862int
863sys_chroot(td, uap)
864	struct thread *td;
865	struct chroot_args /* {
866		char *path;
867	} */ *uap;
868{
869	struct nameidata nd;
870	int error;
871
872	error = priv_check(td, PRIV_VFS_CHROOT);
873	if (error != 0)
874		return (error);
875	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
876	    UIO_USERSPACE, uap->path, td);
877	error = namei(&nd);
878	if (error != 0)
879		goto error;
880	error = change_dir(nd.ni_vp, td);
881	if (error != 0)
882		goto e_vunlock;
883#ifdef MAC
884	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
885	if (error != 0)
886		goto e_vunlock;
887#endif
888	VOP_UNLOCK(nd.ni_vp, 0);
889	error = change_root(nd.ni_vp, td);
890	vrele(nd.ni_vp);
891	NDFREE(&nd, NDF_ONLY_PNBUF);
892	return (error);
893e_vunlock:
894	vput(nd.ni_vp);
895error:
896	NDFREE(&nd, NDF_ONLY_PNBUF);
897	return (error);
898}
899
900/*
901 * Common routine for chroot and chdir.  Callers must provide a locked vnode
902 * instance.
903 */
904int
905change_dir(vp, td)
906	struct vnode *vp;
907	struct thread *td;
908{
909#ifdef MAC
910	int error;
911#endif
912
913	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
914	if (vp->v_type != VDIR)
915		return (ENOTDIR);
916#ifdef MAC
917	error = mac_vnode_check_chdir(td->td_ucred, vp);
918	if (error != 0)
919		return (error);
920#endif
921	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
922}
923
924/*
925 * Common routine for kern_chroot() and jail_attach().  The caller is
926 * responsible for invoking priv_check() and mac_vnode_check_chroot() to
927 * authorize this operation.
928 */
929int
930change_root(vp, td)
931	struct vnode *vp;
932	struct thread *td;
933{
934	struct filedesc *fdp;
935	struct vnode *oldvp;
936	int error;
937
938	fdp = td->td_proc->p_fd;
939	FILEDESC_XLOCK(fdp);
940	if (chroot_allow_open_directories == 0 ||
941	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
942		error = chroot_refuse_vdir_fds(fdp);
943		if (error != 0) {
944			FILEDESC_XUNLOCK(fdp);
945			return (error);
946		}
947	}
948	oldvp = fdp->fd_rdir;
949	fdp->fd_rdir = vp;
950	VREF(fdp->fd_rdir);
951	if (!fdp->fd_jdir) {
952		fdp->fd_jdir = vp;
953		VREF(fdp->fd_jdir);
954	}
955	FILEDESC_XUNLOCK(fdp);
956	vrele(oldvp);
957	return (0);
958}
959
960static __inline void
961flags_to_rights(int flags, cap_rights_t *rightsp)
962{
963
964	if (flags & O_EXEC) {
965		cap_rights_set(rightsp, CAP_FEXECVE);
966	} else {
967		switch ((flags & O_ACCMODE)) {
968		case O_RDONLY:
969			cap_rights_set(rightsp, CAP_READ);
970			break;
971		case O_RDWR:
972			cap_rights_set(rightsp, CAP_READ);
973			/* FALLTHROUGH */
974		case O_WRONLY:
975			cap_rights_set(rightsp, CAP_WRITE);
976			if (!(flags & (O_APPEND | O_TRUNC)))
977				cap_rights_set(rightsp, CAP_SEEK);
978			break;
979		}
980	}
981
982	if (flags & O_CREAT)
983		cap_rights_set(rightsp, CAP_CREATE);
984
985	if (flags & O_TRUNC)
986		cap_rights_set(rightsp, CAP_FTRUNCATE);
987
988	if (flags & (O_SYNC | O_FSYNC))
989		cap_rights_set(rightsp, CAP_FSYNC);
990
991	if (flags & (O_EXLOCK | O_SHLOCK))
992		cap_rights_set(rightsp, CAP_FLOCK);
993}
994
995/*
996 * Check permissions, allocate an open file structure, and call the device
997 * open routine if any.
998 */
999#ifndef _SYS_SYSPROTO_H_
1000struct open_args {
1001	char	*path;
1002	int	flags;
1003	int	mode;
1004};
1005#endif
1006int
1007sys_open(td, uap)
1008	struct thread *td;
1009	register struct open_args /* {
1010		char *path;
1011		int flags;
1012		int mode;
1013	} */ *uap;
1014{
1015
1016	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1017}
1018
1019#ifndef _SYS_SYSPROTO_H_
1020struct openat_args {
1021	int	fd;
1022	char	*path;
1023	int	flag;
1024	int	mode;
1025};
1026#endif
1027int
1028sys_openat(struct thread *td, struct openat_args *uap)
1029{
1030
1031	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1032	    uap->mode));
1033}
1034
1035int
1036kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1037    int mode)
1038{
1039
1040	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1041}
1042
1043int
1044kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1045    int flags, int mode)
1046{
1047	struct proc *p = td->td_proc;
1048	struct filedesc *fdp = p->p_fd;
1049	struct file *fp;
1050	struct vnode *vp;
1051	struct nameidata nd;
1052	cap_rights_t rights;
1053	int cmode, error, indx;
1054
1055	indx = -1;
1056
1057	AUDIT_ARG_FFLAGS(flags);
1058	AUDIT_ARG_MODE(mode);
1059	/* XXX: audit dirfd */
1060	cap_rights_init(&rights, CAP_LOOKUP);
1061	flags_to_rights(flags, &rights);
1062	/*
1063	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1064	 * may be specified.
1065	 */
1066	if (flags & O_EXEC) {
1067		if (flags & O_ACCMODE)
1068			return (EINVAL);
1069	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
1070		return (EINVAL);
1071	} else {
1072		flags = FFLAGS(flags);
1073	}
1074
1075	/*
1076	 * Allocate the file descriptor, but don't install a descriptor yet.
1077	 */
1078	error = falloc_noinstall(td, &fp);
1079	if (error != 0)
1080		return (error);
1081	/*
1082	 * An extra reference on `fp' has been held for us by
1083	 * falloc_noinstall().
1084	 */
1085	/* Set the flags early so the finit in devfs can pick them up. */
1086	fp->f_flag = flags & FMASK;
1087	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1088	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
1089	    &rights, td);
1090	td->td_dupfd = -1;		/* XXX check for fdopen */
1091	error = vn_open(&nd, &flags, cmode, fp);
1092	if (error != 0) {
1093		/*
1094		 * If the vn_open replaced the method vector, something
1095		 * wonderous happened deep below and we just pass it up
1096		 * pretending we know what we do.
1097		 */
1098		if (error == ENXIO && fp->f_ops != &badfileops)
1099			goto success;
1100
1101		/*
1102		 * Handle special fdopen() case. bleh.
1103		 *
1104		 * Don't do this for relative (capability) lookups; we don't
1105		 * understand exactly what would happen, and we don't think
1106		 * that it ever should.
1107		 */
1108		if (nd.ni_strictrelative == 0 &&
1109		    (error == ENODEV || error == ENXIO) &&
1110		    td->td_dupfd >= 0) {
1111			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
1112			    &indx);
1113			if (error == 0)
1114				goto success;
1115		}
1116
1117		goto bad;
1118	}
1119	td->td_dupfd = 0;
1120	NDFREE(&nd, NDF_ONLY_PNBUF);
1121	vp = nd.ni_vp;
1122
1123	/*
1124	 * Store the vnode, for any f_type. Typically, the vnode use
1125	 * count is decremented by direct call to vn_closefile() for
1126	 * files that switched type in the cdevsw fdopen() method.
1127	 */
1128	fp->f_vnode = vp;
1129	/*
1130	 * If the file wasn't claimed by devfs bind it to the normal
1131	 * vnode operations here.
1132	 */
1133	if (fp->f_ops == &badfileops) {
1134		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1135		fp->f_seqcount = 1;
1136		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
1137		    DTYPE_VNODE, vp, &vnops);
1138	}
1139
1140	VOP_UNLOCK(vp, 0);
1141	if (flags & O_TRUNC) {
1142		error = fo_truncate(fp, 0, td->td_ucred, td);
1143		if (error != 0)
1144			goto bad;
1145	}
1146success:
1147	/*
1148	 * If we haven't already installed the FD (for dupfdopen), do so now.
1149	 */
1150	if (indx == -1) {
1151		struct filecaps *fcaps;
1152
1153#ifdef CAPABILITIES
1154		if (nd.ni_strictrelative == 1)
1155			fcaps = &nd.ni_filecaps;
1156		else
1157#endif
1158			fcaps = NULL;
1159		error = finstall(td, fp, &indx, flags, fcaps);
1160		/* On success finstall() consumes fcaps. */
1161		if (error != 0) {
1162			filecaps_free(&nd.ni_filecaps);
1163			goto bad;
1164		}
1165	} else {
1166		filecaps_free(&nd.ni_filecaps);
1167	}
1168
1169	/*
1170	 * Release our private reference, leaving the one associated with
1171	 * the descriptor table intact.
1172	 */
1173	fdrop(fp, td);
1174	td->td_retval[0] = indx;
1175	return (0);
1176bad:
1177	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1178	fdrop(fp, td);
1179	return (error);
1180}
1181
1182#ifdef COMPAT_43
1183/*
1184 * Create a file.
1185 */
1186#ifndef _SYS_SYSPROTO_H_
1187struct ocreat_args {
1188	char	*path;
1189	int	mode;
1190};
1191#endif
1192int
1193ocreat(td, uap)
1194	struct thread *td;
1195	register struct ocreat_args /* {
1196		char *path;
1197		int mode;
1198	} */ *uap;
1199{
1200
1201	return (kern_open(td, uap->path, UIO_USERSPACE,
1202	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1203}
1204#endif /* COMPAT_43 */
1205
1206/*
1207 * Create a special file.
1208 */
1209#ifndef _SYS_SYSPROTO_H_
1210struct mknod_args {
1211	char	*path;
1212	int	mode;
1213	int	dev;
1214};
1215#endif
1216int
1217sys_mknod(td, uap)
1218	struct thread *td;
1219	register struct mknod_args /* {
1220		char *path;
1221		int mode;
1222		int dev;
1223	} */ *uap;
1224{
1225
1226	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1227}
1228
1229#ifndef _SYS_SYSPROTO_H_
1230struct mknodat_args {
1231	int	fd;
1232	char	*path;
1233	mode_t	mode;
1234	dev_t	dev;
1235};
1236#endif
1237int
1238sys_mknodat(struct thread *td, struct mknodat_args *uap)
1239{
1240
1241	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1242	    uap->dev));
1243}
1244
1245int
1246kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1247    int dev)
1248{
1249
1250	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1251}
1252
1253int
1254kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1255    int mode, int dev)
1256{
1257	struct vnode *vp;
1258	struct mount *mp;
1259	struct vattr vattr;
1260	struct nameidata nd;
1261	cap_rights_t rights;
1262	int error, whiteout = 0;
1263
1264	AUDIT_ARG_MODE(mode);
1265	AUDIT_ARG_DEV(dev);
1266	switch (mode & S_IFMT) {
1267	case S_IFCHR:
1268	case S_IFBLK:
1269		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1270		break;
1271	case S_IFMT:
1272		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1273		break;
1274	case S_IFWHT:
1275		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1276		break;
1277	case S_IFIFO:
1278		if (dev == 0)
1279			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1280		/* FALLTHROUGH */
1281	default:
1282		error = EINVAL;
1283		break;
1284	}
1285	if (error != 0)
1286		return (error);
1287restart:
1288	bwillwrite();
1289	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1290	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
1291	    td);
1292	if ((error = namei(&nd)) != 0)
1293		return (error);
1294	vp = nd.ni_vp;
1295	if (vp != NULL) {
1296		NDFREE(&nd, NDF_ONLY_PNBUF);
1297		if (vp == nd.ni_dvp)
1298			vrele(nd.ni_dvp);
1299		else
1300			vput(nd.ni_dvp);
1301		vrele(vp);
1302		return (EEXIST);
1303	} else {
1304		VATTR_NULL(&vattr);
1305		vattr.va_mode = (mode & ALLPERMS) &
1306		    ~td->td_proc->p_fd->fd_cmask;
1307		vattr.va_rdev = dev;
1308		whiteout = 0;
1309
1310		switch (mode & S_IFMT) {
1311		case S_IFMT:	/* used by badsect to flag bad sectors */
1312			vattr.va_type = VBAD;
1313			break;
1314		case S_IFCHR:
1315			vattr.va_type = VCHR;
1316			break;
1317		case S_IFBLK:
1318			vattr.va_type = VBLK;
1319			break;
1320		case S_IFWHT:
1321			whiteout = 1;
1322			break;
1323		default:
1324			panic("kern_mknod: invalid mode");
1325		}
1326	}
1327	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1328		NDFREE(&nd, NDF_ONLY_PNBUF);
1329		vput(nd.ni_dvp);
1330		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1331			return (error);
1332		goto restart;
1333	}
1334#ifdef MAC
1335	if (error == 0 && !whiteout)
1336		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1337		    &nd.ni_cnd, &vattr);
1338#endif
1339	if (error == 0) {
1340		if (whiteout)
1341			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1342		else {
1343			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1344						&nd.ni_cnd, &vattr);
1345			if (error == 0)
1346				vput(nd.ni_vp);
1347		}
1348	}
1349	NDFREE(&nd, NDF_ONLY_PNBUF);
1350	vput(nd.ni_dvp);
1351	vn_finished_write(mp);
1352	return (error);
1353}
1354
1355/*
1356 * Create a named pipe.
1357 */
1358#ifndef _SYS_SYSPROTO_H_
1359struct mkfifo_args {
1360	char	*path;
1361	int	mode;
1362};
1363#endif
1364int
1365sys_mkfifo(td, uap)
1366	struct thread *td;
1367	register struct mkfifo_args /* {
1368		char *path;
1369		int mode;
1370	} */ *uap;
1371{
1372
1373	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1374}
1375
1376#ifndef _SYS_SYSPROTO_H_
1377struct mkfifoat_args {
1378	int	fd;
1379	char	*path;
1380	mode_t	mode;
1381};
1382#endif
1383int
1384sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1385{
1386
1387	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1388	    uap->mode));
1389}
1390
1391int
1392kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1393{
1394
1395	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1396}
1397
1398int
1399kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1400    int mode)
1401{
1402	struct mount *mp;
1403	struct vattr vattr;
1404	struct nameidata nd;
1405	cap_rights_t rights;
1406	int error;
1407
1408	AUDIT_ARG_MODE(mode);
1409restart:
1410	bwillwrite();
1411	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1412	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
1413	    td);
1414	if ((error = namei(&nd)) != 0)
1415		return (error);
1416	if (nd.ni_vp != NULL) {
1417		NDFREE(&nd, NDF_ONLY_PNBUF);
1418		if (nd.ni_vp == nd.ni_dvp)
1419			vrele(nd.ni_dvp);
1420		else
1421			vput(nd.ni_dvp);
1422		vrele(nd.ni_vp);
1423		return (EEXIST);
1424	}
1425	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1426		NDFREE(&nd, NDF_ONLY_PNBUF);
1427		vput(nd.ni_dvp);
1428		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1429			return (error);
1430		goto restart;
1431	}
1432	VATTR_NULL(&vattr);
1433	vattr.va_type = VFIFO;
1434	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1435#ifdef MAC
1436	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1437	    &vattr);
1438	if (error != 0)
1439		goto out;
1440#endif
1441	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1442	if (error == 0)
1443		vput(nd.ni_vp);
1444#ifdef MAC
1445out:
1446#endif
1447	vput(nd.ni_dvp);
1448	vn_finished_write(mp);
1449	NDFREE(&nd, NDF_ONLY_PNBUF);
1450	return (error);
1451}
1452
1453/*
1454 * Make a hard file link.
1455 */
1456#ifndef _SYS_SYSPROTO_H_
1457struct link_args {
1458	char	*path;
1459	char	*link;
1460};
1461#endif
1462int
1463sys_link(td, uap)
1464	struct thread *td;
1465	register struct link_args /* {
1466		char *path;
1467		char *link;
1468	} */ *uap;
1469{
1470
1471	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1472}
1473
1474#ifndef _SYS_SYSPROTO_H_
1475struct linkat_args {
1476	int	fd1;
1477	char	*path1;
1478	int	fd2;
1479	char	*path2;
1480	int	flag;
1481};
1482#endif
1483int
1484sys_linkat(struct thread *td, struct linkat_args *uap)
1485{
1486	int flag;
1487
1488	flag = uap->flag;
1489	if (flag & ~AT_SYMLINK_FOLLOW)
1490		return (EINVAL);
1491
1492	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1493	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1494}
1495
1496int hardlink_check_uid = 0;
1497SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1498    &hardlink_check_uid, 0,
1499    "Unprivileged processes cannot create hard links to files owned by other "
1500    "users");
1501static int hardlink_check_gid = 0;
1502SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1503    &hardlink_check_gid, 0,
1504    "Unprivileged processes cannot create hard links to files owned by other "
1505    "groups");
1506
1507static int
1508can_hardlink(struct vnode *vp, struct ucred *cred)
1509{
1510	struct vattr va;
1511	int error;
1512
1513	if (!hardlink_check_uid && !hardlink_check_gid)
1514		return (0);
1515
1516	error = VOP_GETATTR(vp, &va, cred);
1517	if (error != 0)
1518		return (error);
1519
1520	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1521		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1522		if (error != 0)
1523			return (error);
1524	}
1525
1526	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1527		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1528		if (error != 0)
1529			return (error);
1530	}
1531
1532	return (0);
1533}
1534
1535int
1536kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1537{
1538
1539	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1540}
1541
1542int
1543kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1544    enum uio_seg segflg, int follow)
1545{
1546	struct vnode *vp;
1547	struct mount *mp;
1548	struct nameidata nd;
1549	cap_rights_t rights;
1550	int error;
1551
1552again:
1553	bwillwrite();
1554	NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
1555
1556	if ((error = namei(&nd)) != 0)
1557		return (error);
1558	NDFREE(&nd, NDF_ONLY_PNBUF);
1559	vp = nd.ni_vp;
1560	if (vp->v_type == VDIR) {
1561		vrele(vp);
1562		return (EPERM);		/* POSIX */
1563	}
1564	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 |
1565	    NOCACHE, segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT),
1566	    td);
1567	if ((error = namei(&nd)) == 0) {
1568		if (nd.ni_vp != NULL) {
1569			NDFREE(&nd, NDF_ONLY_PNBUF);
1570			if (nd.ni_dvp == nd.ni_vp)
1571				vrele(nd.ni_dvp);
1572			else
1573				vput(nd.ni_dvp);
1574			vrele(nd.ni_vp);
1575			vrele(vp);
1576			return (EEXIST);
1577		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
1578			/*
1579			 * Cross-device link.  No need to recheck
1580			 * vp->v_type, since it cannot change, except
1581			 * to VBAD.
1582			 */
1583			NDFREE(&nd, NDF_ONLY_PNBUF);
1584			vput(nd.ni_dvp);
1585			vrele(vp);
1586			return (EXDEV);
1587		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
1588			error = can_hardlink(vp, td->td_ucred);
1589#ifdef MAC
1590			if (error == 0)
1591				error = mac_vnode_check_link(td->td_ucred,
1592				    nd.ni_dvp, vp, &nd.ni_cnd);
1593#endif
1594			if (error != 0) {
1595				vput(vp);
1596				vput(nd.ni_dvp);
1597				NDFREE(&nd, NDF_ONLY_PNBUF);
1598				return (error);
1599			}
1600			error = vn_start_write(vp, &mp, V_NOWAIT);
1601			if (error != 0) {
1602				vput(vp);
1603				vput(nd.ni_dvp);
1604				NDFREE(&nd, NDF_ONLY_PNBUF);
1605				error = vn_start_write(NULL, &mp,
1606				    V_XSLEEP | PCATCH);
1607				if (error != 0)
1608					return (error);
1609				goto again;
1610			}
1611			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1612			VOP_UNLOCK(vp, 0);
1613			vput(nd.ni_dvp);
1614			vn_finished_write(mp);
1615			NDFREE(&nd, NDF_ONLY_PNBUF);
1616		} else {
1617			vput(nd.ni_dvp);
1618			NDFREE(&nd, NDF_ONLY_PNBUF);
1619			vrele(vp);
1620			goto again;
1621		}
1622	}
1623	vrele(vp);
1624	return (error);
1625}
1626
1627/*
1628 * Make a symbolic link.
1629 */
1630#ifndef _SYS_SYSPROTO_H_
1631struct symlink_args {
1632	char	*path;
1633	char	*link;
1634};
1635#endif
1636int
1637sys_symlink(td, uap)
1638	struct thread *td;
1639	register struct symlink_args /* {
1640		char *path;
1641		char *link;
1642	} */ *uap;
1643{
1644
1645	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1646}
1647
1648#ifndef _SYS_SYSPROTO_H_
1649struct symlinkat_args {
1650	char	*path;
1651	int	fd;
1652	char	*path2;
1653};
1654#endif
1655int
1656sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1657{
1658
1659	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1660	    UIO_USERSPACE));
1661}
1662
1663int
1664kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1665{
1666
1667	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1668}
1669
1670int
1671kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1672    enum uio_seg segflg)
1673{
1674	struct mount *mp;
1675	struct vattr vattr;
1676	char *syspath;
1677	struct nameidata nd;
1678	int error;
1679	cap_rights_t rights;
1680
1681	if (segflg == UIO_SYSSPACE) {
1682		syspath = path1;
1683	} else {
1684		syspath = uma_zalloc(namei_zone, M_WAITOK);
1685		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1686			goto out;
1687	}
1688	AUDIT_ARG_TEXT(syspath);
1689restart:
1690	bwillwrite();
1691	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1692	    NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
1693	    td);
1694	if ((error = namei(&nd)) != 0)
1695		goto out;
1696	if (nd.ni_vp) {
1697		NDFREE(&nd, NDF_ONLY_PNBUF);
1698		if (nd.ni_vp == nd.ni_dvp)
1699			vrele(nd.ni_dvp);
1700		else
1701			vput(nd.ni_dvp);
1702		vrele(nd.ni_vp);
1703		error = EEXIST;
1704		goto out;
1705	}
1706	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1707		NDFREE(&nd, NDF_ONLY_PNBUF);
1708		vput(nd.ni_dvp);
1709		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1710			goto out;
1711		goto restart;
1712	}
1713	VATTR_NULL(&vattr);
1714	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1715#ifdef MAC
1716	vattr.va_type = VLNK;
1717	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1718	    &vattr);
1719	if (error != 0)
1720		goto out2;
1721#endif
1722	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1723	if (error == 0)
1724		vput(nd.ni_vp);
1725#ifdef MAC
1726out2:
1727#endif
1728	NDFREE(&nd, NDF_ONLY_PNBUF);
1729	vput(nd.ni_dvp);
1730	vn_finished_write(mp);
1731out:
1732	if (segflg != UIO_SYSSPACE)
1733		uma_zfree(namei_zone, syspath);
1734	return (error);
1735}
1736
1737/*
1738 * Delete a whiteout from the filesystem.
1739 */
1740int
1741sys_undelete(td, uap)
1742	struct thread *td;
1743	register struct undelete_args /* {
1744		char *path;
1745	} */ *uap;
1746{
1747	struct mount *mp;
1748	struct nameidata nd;
1749	int error;
1750
1751restart:
1752	bwillwrite();
1753	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1754	    UIO_USERSPACE, uap->path, td);
1755	error = namei(&nd);
1756	if (error != 0)
1757		return (error);
1758
1759	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1760		NDFREE(&nd, NDF_ONLY_PNBUF);
1761		if (nd.ni_vp == nd.ni_dvp)
1762			vrele(nd.ni_dvp);
1763		else
1764			vput(nd.ni_dvp);
1765		if (nd.ni_vp)
1766			vrele(nd.ni_vp);
1767		return (EEXIST);
1768	}
1769	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1770		NDFREE(&nd, NDF_ONLY_PNBUF);
1771		vput(nd.ni_dvp);
1772		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1773			return (error);
1774		goto restart;
1775	}
1776	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1777	NDFREE(&nd, NDF_ONLY_PNBUF);
1778	vput(nd.ni_dvp);
1779	vn_finished_write(mp);
1780	return (error);
1781}
1782
1783/*
1784 * Delete a name from the filesystem.
1785 */
1786#ifndef _SYS_SYSPROTO_H_
1787struct unlink_args {
1788	char	*path;
1789};
1790#endif
1791int
1792sys_unlink(td, uap)
1793	struct thread *td;
1794	struct unlink_args /* {
1795		char *path;
1796	} */ *uap;
1797{
1798
1799	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1800}
1801
1802#ifndef _SYS_SYSPROTO_H_
1803struct unlinkat_args {
1804	int	fd;
1805	char	*path;
1806	int	flag;
1807};
1808#endif
1809int
1810sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1811{
1812	int flag = uap->flag;
1813	int fd = uap->fd;
1814	char *path = uap->path;
1815
1816	if (flag & ~AT_REMOVEDIR)
1817		return (EINVAL);
1818
1819	if (flag & AT_REMOVEDIR)
1820		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1821	else
1822		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1823}
1824
1825int
1826kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1827{
1828
1829	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1830}
1831
1832int
1833kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1834    ino_t oldinum)
1835{
1836	struct mount *mp;
1837	struct vnode *vp;
1838	struct nameidata nd;
1839	struct stat sb;
1840	cap_rights_t rights;
1841	int error;
1842
1843restart:
1844	bwillwrite();
1845	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1846	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1847	if ((error = namei(&nd)) != 0)
1848		return (error == EINVAL ? EPERM : error);
1849	vp = nd.ni_vp;
1850	if (vp->v_type == VDIR && oldinum == 0) {
1851		error = EPERM;		/* POSIX */
1852	} else if (oldinum != 0 &&
1853		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1854		  sb.st_ino != oldinum) {
1855			error = EIDRM;	/* Identifier removed */
1856	} else {
1857		/*
1858		 * The root of a mounted filesystem cannot be deleted.
1859		 *
1860		 * XXX: can this only be a VDIR case?
1861		 */
1862		if (vp->v_vflag & VV_ROOT)
1863			error = EBUSY;
1864	}
1865	if (error == 0) {
1866		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1867			NDFREE(&nd, NDF_ONLY_PNBUF);
1868			vput(nd.ni_dvp);
1869			if (vp == nd.ni_dvp)
1870				vrele(vp);
1871			else
1872				vput(vp);
1873			if ((error = vn_start_write(NULL, &mp,
1874			    V_XSLEEP | PCATCH)) != 0)
1875				return (error);
1876			goto restart;
1877		}
1878#ifdef MAC
1879		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1880		    &nd.ni_cnd);
1881		if (error != 0)
1882			goto out;
1883#endif
1884		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1885		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1886#ifdef MAC
1887out:
1888#endif
1889		vn_finished_write(mp);
1890	}
1891	NDFREE(&nd, NDF_ONLY_PNBUF);
1892	vput(nd.ni_dvp);
1893	if (vp == nd.ni_dvp)
1894		vrele(vp);
1895	else
1896		vput(vp);
1897	return (error);
1898}
1899
1900/*
1901 * Reposition read/write file offset.
1902 */
1903#ifndef _SYS_SYSPROTO_H_
1904struct lseek_args {
1905	int	fd;
1906	int	pad;
1907	off_t	offset;
1908	int	whence;
1909};
1910#endif
1911int
1912sys_lseek(td, uap)
1913	struct thread *td;
1914	register struct lseek_args /* {
1915		int fd;
1916		int pad;
1917		off_t offset;
1918		int whence;
1919	} */ *uap;
1920{
1921	struct file *fp;
1922	cap_rights_t rights;
1923	int error;
1924
1925	AUDIT_ARG_FD(uap->fd);
1926	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1927	if (error != 0)
1928		return (error);
1929	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1930	    fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
1931	fdrop(fp, td);
1932	return (error);
1933}
1934
1935#if defined(COMPAT_43)
1936/*
1937 * Reposition read/write file offset.
1938 */
1939#ifndef _SYS_SYSPROTO_H_
1940struct olseek_args {
1941	int	fd;
1942	long	offset;
1943	int	whence;
1944};
1945#endif
1946int
1947olseek(td, uap)
1948	struct thread *td;
1949	register struct olseek_args /* {
1950		int fd;
1951		long offset;
1952		int whence;
1953	} */ *uap;
1954{
1955	struct lseek_args /* {
1956		int fd;
1957		int pad;
1958		off_t offset;
1959		int whence;
1960	} */ nuap;
1961
1962	nuap.fd = uap->fd;
1963	nuap.offset = uap->offset;
1964	nuap.whence = uap->whence;
1965	return (sys_lseek(td, &nuap));
1966}
1967#endif /* COMPAT_43 */
1968
1969/* Version with the 'pad' argument */
1970int
1971freebsd6_lseek(td, uap)
1972	struct thread *td;
1973	register struct freebsd6_lseek_args *uap;
1974{
1975	struct lseek_args ouap;
1976
1977	ouap.fd = uap->fd;
1978	ouap.offset = uap->offset;
1979	ouap.whence = uap->whence;
1980	return (sys_lseek(td, &ouap));
1981}
1982
1983/*
1984 * Check access permissions using passed credentials.
1985 */
1986static int
1987vn_access(vp, user_flags, cred, td)
1988	struct vnode	*vp;
1989	int		user_flags;
1990	struct ucred	*cred;
1991	struct thread	*td;
1992{
1993	accmode_t accmode;
1994	int error;
1995
1996	/* Flags == 0 means only check for existence. */
1997	error = 0;
1998	if (user_flags) {
1999		accmode = 0;
2000		if (user_flags & R_OK)
2001			accmode |= VREAD;
2002		if (user_flags & W_OK)
2003			accmode |= VWRITE;
2004		if (user_flags & X_OK)
2005			accmode |= VEXEC;
2006#ifdef MAC
2007		error = mac_vnode_check_access(cred, vp, accmode);
2008		if (error != 0)
2009			return (error);
2010#endif
2011		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
2012			error = VOP_ACCESS(vp, accmode, cred, td);
2013	}
2014	return (error);
2015}
2016
2017/*
2018 * Check access permissions using "real" credentials.
2019 */
2020#ifndef _SYS_SYSPROTO_H_
2021struct access_args {
2022	char	*path;
2023	int	amode;
2024};
2025#endif
2026int
2027sys_access(td, uap)
2028	struct thread *td;
2029	register struct access_args /* {
2030		char *path;
2031		int amode;
2032	} */ *uap;
2033{
2034
2035	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2036}
2037
2038#ifndef _SYS_SYSPROTO_H_
2039struct faccessat_args {
2040	int	dirfd;
2041	char	*path;
2042	int	amode;
2043	int	flag;
2044}
2045#endif
2046int
2047sys_faccessat(struct thread *td, struct faccessat_args *uap)
2048{
2049
2050	if (uap->flag & ~AT_EACCESS)
2051		return (EINVAL);
2052	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2053	    uap->amode));
2054}
2055
2056int
2057kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2058{
2059
2060	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2061}
2062
2063int
2064kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2065    int flag, int amode)
2066{
2067	struct ucred *cred, *tmpcred;
2068	struct vnode *vp;
2069	struct nameidata nd;
2070	cap_rights_t rights;
2071	int error;
2072
2073	/*
2074	 * Create and modify a temporary credential instead of one that
2075	 * is potentially shared.
2076	 */
2077	if (!(flag & AT_EACCESS)) {
2078		cred = td->td_ucred;
2079		tmpcred = crdup(cred);
2080		tmpcred->cr_uid = cred->cr_ruid;
2081		tmpcred->cr_groups[0] = cred->cr_rgid;
2082		td->td_ucred = tmpcred;
2083	} else
2084		cred = tmpcred = td->td_ucred;
2085	AUDIT_ARG_VALUE(amode);
2086	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
2087	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
2088	    td);
2089	if ((error = namei(&nd)) != 0)
2090		goto out1;
2091	vp = nd.ni_vp;
2092
2093	error = vn_access(vp, amode, tmpcred, td);
2094	NDFREE(&nd, NDF_ONLY_PNBUF);
2095	vput(vp);
2096out1:
2097	if (!(flag & AT_EACCESS)) {
2098		td->td_ucred = cred;
2099		crfree(tmpcred);
2100	}
2101	return (error);
2102}
2103
2104/*
2105 * Check access permissions using "effective" credentials.
2106 */
2107#ifndef _SYS_SYSPROTO_H_
2108struct eaccess_args {
2109	char	*path;
2110	int	amode;
2111};
2112#endif
2113int
2114sys_eaccess(td, uap)
2115	struct thread *td;
2116	register struct eaccess_args /* {
2117		char *path;
2118		int amode;
2119	} */ *uap;
2120{
2121
2122	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2123}
2124
2125int
2126kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2127{
2128
2129	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2130}
2131
2132#if defined(COMPAT_43)
2133/*
2134 * Get file status; this version follows links.
2135 */
2136#ifndef _SYS_SYSPROTO_H_
2137struct ostat_args {
2138	char	*path;
2139	struct ostat *ub;
2140};
2141#endif
2142int
2143ostat(td, uap)
2144	struct thread *td;
2145	register struct ostat_args /* {
2146		char *path;
2147		struct ostat *ub;
2148	} */ *uap;
2149{
2150	struct stat sb;
2151	struct ostat osb;
2152	int error;
2153
2154	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2155	if (error != 0)
2156		return (error);
2157	cvtstat(&sb, &osb);
2158	return (copyout(&osb, uap->ub, sizeof (osb)));
2159}
2160
2161/*
2162 * Get file status; this version does not follow links.
2163 */
2164#ifndef _SYS_SYSPROTO_H_
2165struct olstat_args {
2166	char	*path;
2167	struct ostat *ub;
2168};
2169#endif
2170int
2171olstat(td, uap)
2172	struct thread *td;
2173	register struct olstat_args /* {
2174		char *path;
2175		struct ostat *ub;
2176	} */ *uap;
2177{
2178	struct stat sb;
2179	struct ostat osb;
2180	int error;
2181
2182	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2183	if (error != 0)
2184		return (error);
2185	cvtstat(&sb, &osb);
2186	return (copyout(&osb, uap->ub, sizeof (osb)));
2187}
2188
2189/*
2190 * Convert from an old to a new stat structure.
2191 */
2192void
2193cvtstat(st, ost)
2194	struct stat *st;
2195	struct ostat *ost;
2196{
2197
2198	bzero(ost, sizeof(*ost));
2199	ost->st_dev = st->st_dev;
2200	ost->st_ino = st->st_ino;
2201	ost->st_mode = st->st_mode;
2202	ost->st_nlink = st->st_nlink;
2203	ost->st_uid = st->st_uid;
2204	ost->st_gid = st->st_gid;
2205	ost->st_rdev = st->st_rdev;
2206	if (st->st_size < (quad_t)1 << 32)
2207		ost->st_size = st->st_size;
2208	else
2209		ost->st_size = -2;
2210	ost->st_atim = st->st_atim;
2211	ost->st_mtim = st->st_mtim;
2212	ost->st_ctim = st->st_ctim;
2213	ost->st_blksize = st->st_blksize;
2214	ost->st_blocks = st->st_blocks;
2215	ost->st_flags = st->st_flags;
2216	ost->st_gen = st->st_gen;
2217}
2218#endif /* COMPAT_43 */
2219
2220/*
2221 * Get file status; this version follows links.
2222 */
2223#ifndef _SYS_SYSPROTO_H_
2224struct stat_args {
2225	char	*path;
2226	struct stat *ub;
2227};
2228#endif
2229int
2230sys_stat(td, uap)
2231	struct thread *td;
2232	register struct stat_args /* {
2233		char *path;
2234		struct stat *ub;
2235	} */ *uap;
2236{
2237	struct stat sb;
2238	int error;
2239
2240	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2241	if (error == 0)
2242		error = copyout(&sb, uap->ub, sizeof (sb));
2243	return (error);
2244}
2245
2246#ifndef _SYS_SYSPROTO_H_
2247struct fstatat_args {
2248	int	fd;
2249	char	*path;
2250	struct stat	*buf;
2251	int	flag;
2252}
2253#endif
2254int
2255sys_fstatat(struct thread *td, struct fstatat_args *uap)
2256{
2257	struct stat sb;
2258	int error;
2259
2260	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2261	    UIO_USERSPACE, &sb);
2262	if (error == 0)
2263		error = copyout(&sb, uap->buf, sizeof (sb));
2264	return (error);
2265}
2266
2267int
2268kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2269{
2270
2271	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2272}
2273
2274int
2275kern_statat(struct thread *td, int flag, int fd, char *path,
2276    enum uio_seg pathseg, struct stat *sbp)
2277{
2278
2279	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2280}
2281
2282int
2283kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2284    enum uio_seg pathseg, struct stat *sbp,
2285    void (*hook)(struct vnode *vp, struct stat *sbp))
2286{
2287	struct nameidata nd;
2288	struct stat sb;
2289	cap_rights_t rights;
2290	int error;
2291
2292	if (flag & ~AT_SYMLINK_NOFOLLOW)
2293		return (EINVAL);
2294
2295	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2296	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2297	    cap_rights_init(&rights, CAP_FSTAT), td);
2298
2299	if ((error = namei(&nd)) != 0)
2300		return (error);
2301	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2302	if (error == 0) {
2303		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2304		if (S_ISREG(sb.st_mode))
2305			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2306		if (__predict_false(hook != NULL))
2307			hook(nd.ni_vp, &sb);
2308	}
2309	NDFREE(&nd, NDF_ONLY_PNBUF);
2310	vput(nd.ni_vp);
2311	if (error != 0)
2312		return (error);
2313	*sbp = sb;
2314#ifdef KTRACE
2315	if (KTRPOINT(td, KTR_STRUCT))
2316		ktrstat(&sb);
2317#endif
2318	return (0);
2319}
2320
2321/*
2322 * Get file status; this version does not follow links.
2323 */
2324#ifndef _SYS_SYSPROTO_H_
2325struct lstat_args {
2326	char	*path;
2327	struct stat *ub;
2328};
2329#endif
2330int
2331sys_lstat(td, uap)
2332	struct thread *td;
2333	register struct lstat_args /* {
2334		char *path;
2335		struct stat *ub;
2336	} */ *uap;
2337{
2338	struct stat sb;
2339	int error;
2340
2341	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2342	if (error == 0)
2343		error = copyout(&sb, uap->ub, sizeof (sb));
2344	return (error);
2345}
2346
2347int
2348kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2349{
2350
2351	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2352	    sbp));
2353}
2354
2355/*
2356 * Implementation of the NetBSD [l]stat() functions.
2357 */
2358void
2359cvtnstat(sb, nsb)
2360	struct stat *sb;
2361	struct nstat *nsb;
2362{
2363
2364	bzero(nsb, sizeof *nsb);
2365	nsb->st_dev = sb->st_dev;
2366	nsb->st_ino = sb->st_ino;
2367	nsb->st_mode = sb->st_mode;
2368	nsb->st_nlink = sb->st_nlink;
2369	nsb->st_uid = sb->st_uid;
2370	nsb->st_gid = sb->st_gid;
2371	nsb->st_rdev = sb->st_rdev;
2372	nsb->st_atim = sb->st_atim;
2373	nsb->st_mtim = sb->st_mtim;
2374	nsb->st_ctim = sb->st_ctim;
2375	nsb->st_size = sb->st_size;
2376	nsb->st_blocks = sb->st_blocks;
2377	nsb->st_blksize = sb->st_blksize;
2378	nsb->st_flags = sb->st_flags;
2379	nsb->st_gen = sb->st_gen;
2380	nsb->st_birthtim = sb->st_birthtim;
2381}
2382
2383#ifndef _SYS_SYSPROTO_H_
2384struct nstat_args {
2385	char	*path;
2386	struct nstat *ub;
2387};
2388#endif
2389int
2390sys_nstat(td, uap)
2391	struct thread *td;
2392	register struct nstat_args /* {
2393		char *path;
2394		struct nstat *ub;
2395	} */ *uap;
2396{
2397	struct stat sb;
2398	struct nstat nsb;
2399	int error;
2400
2401	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2402	if (error != 0)
2403		return (error);
2404	cvtnstat(&sb, &nsb);
2405	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2406}
2407
2408/*
2409 * NetBSD lstat.  Get file status; this version does not follow links.
2410 */
2411#ifndef _SYS_SYSPROTO_H_
2412struct lstat_args {
2413	char	*path;
2414	struct stat *ub;
2415};
2416#endif
2417int
2418sys_nlstat(td, uap)
2419	struct thread *td;
2420	register struct nlstat_args /* {
2421		char *path;
2422		struct nstat *ub;
2423	} */ *uap;
2424{
2425	struct stat sb;
2426	struct nstat nsb;
2427	int error;
2428
2429	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2430	if (error != 0)
2431		return (error);
2432	cvtnstat(&sb, &nsb);
2433	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2434}
2435
2436/*
2437 * Get configurable pathname variables.
2438 */
2439#ifndef _SYS_SYSPROTO_H_
2440struct pathconf_args {
2441	char	*path;
2442	int	name;
2443};
2444#endif
2445int
2446sys_pathconf(td, uap)
2447	struct thread *td;
2448	register struct pathconf_args /* {
2449		char *path;
2450		int name;
2451	} */ *uap;
2452{
2453
2454	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2455}
2456
2457#ifndef _SYS_SYSPROTO_H_
2458struct lpathconf_args {
2459	char	*path;
2460	int	name;
2461};
2462#endif
2463int
2464sys_lpathconf(td, uap)
2465	struct thread *td;
2466	register struct lpathconf_args /* {
2467		char *path;
2468		int name;
2469	} */ *uap;
2470{
2471
2472	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2473	    NOFOLLOW));
2474}
2475
2476int
2477kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2478    u_long flags)
2479{
2480	struct nameidata nd;
2481	int error;
2482
2483	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2484	    pathseg, path, td);
2485	if ((error = namei(&nd)) != 0)
2486		return (error);
2487	NDFREE(&nd, NDF_ONLY_PNBUF);
2488
2489	/* If asynchronous I/O is available, it works for all files. */
2490	if (name == _PC_ASYNC_IO)
2491		td->td_retval[0] = async_io_version;
2492	else
2493		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2494	vput(nd.ni_vp);
2495	return (error);
2496}
2497
2498/*
2499 * Return target name of a symbolic link.
2500 */
2501#ifndef _SYS_SYSPROTO_H_
2502struct readlink_args {
2503	char	*path;
2504	char	*buf;
2505	size_t	count;
2506};
2507#endif
2508int
2509sys_readlink(td, uap)
2510	struct thread *td;
2511	register struct readlink_args /* {
2512		char *path;
2513		char *buf;
2514		size_t count;
2515	} */ *uap;
2516{
2517
2518	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2519	    UIO_USERSPACE, uap->count));
2520}
2521#ifndef _SYS_SYSPROTO_H_
2522struct readlinkat_args {
2523	int	fd;
2524	char	*path;
2525	char	*buf;
2526	size_t	bufsize;
2527};
2528#endif
2529int
2530sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2531{
2532
2533	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2534	    uap->buf, UIO_USERSPACE, uap->bufsize));
2535}
2536
2537int
2538kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2539    enum uio_seg bufseg, size_t count)
2540{
2541
2542	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2543	    count));
2544}
2545
2546int
2547kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2548    char *buf, enum uio_seg bufseg, size_t count)
2549{
2550	struct vnode *vp;
2551	struct iovec aiov;
2552	struct uio auio;
2553	struct nameidata nd;
2554	int error;
2555
2556	if (count > IOSIZE_MAX)
2557		return (EINVAL);
2558
2559	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2560	    pathseg, path, fd, td);
2561
2562	if ((error = namei(&nd)) != 0)
2563		return (error);
2564	NDFREE(&nd, NDF_ONLY_PNBUF);
2565	vp = nd.ni_vp;
2566#ifdef MAC
2567	error = mac_vnode_check_readlink(td->td_ucred, vp);
2568	if (error != 0) {
2569		vput(vp);
2570		return (error);
2571	}
2572#endif
2573	if (vp->v_type != VLNK)
2574		error = EINVAL;
2575	else {
2576		aiov.iov_base = buf;
2577		aiov.iov_len = count;
2578		auio.uio_iov = &aiov;
2579		auio.uio_iovcnt = 1;
2580		auio.uio_offset = 0;
2581		auio.uio_rw = UIO_READ;
2582		auio.uio_segflg = bufseg;
2583		auio.uio_td = td;
2584		auio.uio_resid = count;
2585		error = VOP_READLINK(vp, &auio, td->td_ucred);
2586		td->td_retval[0] = count - auio.uio_resid;
2587	}
2588	vput(vp);
2589	return (error);
2590}
2591
2592/*
2593 * Common implementation code for chflags() and fchflags().
2594 */
2595static int
2596setfflags(td, vp, flags)
2597	struct thread *td;
2598	struct vnode *vp;
2599	u_long flags;
2600{
2601	struct mount *mp;
2602	struct vattr vattr;
2603	int error;
2604
2605	/* We can't support the value matching VNOVAL. */
2606	if (flags == VNOVAL)
2607		return (EOPNOTSUPP);
2608
2609	/*
2610	 * Prevent non-root users from setting flags on devices.  When
2611	 * a device is reused, users can retain ownership of the device
2612	 * if they are allowed to set flags and programs assume that
2613	 * chown can't fail when done as root.
2614	 */
2615	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2616		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2617		if (error != 0)
2618			return (error);
2619	}
2620
2621	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2622		return (error);
2623	VATTR_NULL(&vattr);
2624	vattr.va_flags = flags;
2625	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2626#ifdef MAC
2627	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2628	if (error == 0)
2629#endif
2630		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2631	VOP_UNLOCK(vp, 0);
2632	vn_finished_write(mp);
2633	return (error);
2634}
2635
2636/*
2637 * Change flags of a file given a path name.
2638 */
2639#ifndef _SYS_SYSPROTO_H_
2640struct chflags_args {
2641	const char *path;
2642	u_long	flags;
2643};
2644#endif
2645int
2646sys_chflags(td, uap)
2647	struct thread *td;
2648	register struct chflags_args /* {
2649		const char *path;
2650		u_long flags;
2651	} */ *uap;
2652{
2653
2654	return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
2655}
2656
2657#ifndef _SYS_SYSPROTO_H_
2658struct chflagsat_args {
2659	int	fd;
2660	const char *path;
2661	u_long	flags;
2662	int	atflag;
2663}
2664#endif
2665int
2666sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2667{
2668	int fd = uap->fd;
2669	const char *path = uap->path;
2670	u_long flags = uap->flags;
2671	int atflag = uap->atflag;
2672
2673	if (atflag & ~AT_SYMLINK_NOFOLLOW)
2674		return (EINVAL);
2675
2676	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2677}
2678
2679static int
2680kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
2681    u_long flags)
2682{
2683
2684	return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
2685}
2686
2687/*
2688 * Same as chflags() but doesn't follow symlinks.
2689 */
2690int
2691sys_lchflags(td, uap)
2692	struct thread *td;
2693	register struct lchflags_args /* {
2694		const char *path;
2695		u_long flags;
2696	} */ *uap;
2697{
2698
2699	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2700	    uap->flags, AT_SYMLINK_NOFOLLOW));
2701}
2702
2703static int
2704kern_chflagsat(struct thread *td, int fd, const char *path,
2705    enum uio_seg pathseg, u_long flags, int atflag)
2706{
2707	struct nameidata nd;
2708	cap_rights_t rights;
2709	int error, follow;
2710
2711	AUDIT_ARG_FFLAGS(flags);
2712	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2713	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2714	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
2715	if ((error = namei(&nd)) != 0)
2716		return (error);
2717	NDFREE(&nd, NDF_ONLY_PNBUF);
2718	error = setfflags(td, nd.ni_vp, flags);
2719	vrele(nd.ni_vp);
2720	return (error);
2721}
2722
2723/*
2724 * Change flags of a file given a file descriptor.
2725 */
2726#ifndef _SYS_SYSPROTO_H_
2727struct fchflags_args {
2728	int	fd;
2729	u_long	flags;
2730};
2731#endif
2732int
2733sys_fchflags(td, uap)
2734	struct thread *td;
2735	register struct fchflags_args /* {
2736		int fd;
2737		u_long flags;
2738	} */ *uap;
2739{
2740	struct file *fp;
2741	cap_rights_t rights;
2742	int error;
2743
2744	AUDIT_ARG_FD(uap->fd);
2745	AUDIT_ARG_FFLAGS(uap->flags);
2746	error = getvnode(td->td_proc->p_fd, uap->fd,
2747	    cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
2748	if (error != 0)
2749		return (error);
2750#ifdef AUDIT
2751	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2752	AUDIT_ARG_VNODE1(fp->f_vnode);
2753	VOP_UNLOCK(fp->f_vnode, 0);
2754#endif
2755	error = setfflags(td, fp->f_vnode, uap->flags);
2756	fdrop(fp, td);
2757	return (error);
2758}
2759
2760/*
2761 * Common implementation code for chmod(), lchmod() and fchmod().
2762 */
2763int
2764setfmode(td, cred, vp, mode)
2765	struct thread *td;
2766	struct ucred *cred;
2767	struct vnode *vp;
2768	int mode;
2769{
2770	struct mount *mp;
2771	struct vattr vattr;
2772	int error;
2773
2774	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2775		return (error);
2776	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2777	VATTR_NULL(&vattr);
2778	vattr.va_mode = mode & ALLPERMS;
2779#ifdef MAC
2780	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2781	if (error == 0)
2782#endif
2783		error = VOP_SETATTR(vp, &vattr, cred);
2784	VOP_UNLOCK(vp, 0);
2785	vn_finished_write(mp);
2786	return (error);
2787}
2788
2789/*
2790 * Change mode of a file given path name.
2791 */
2792#ifndef _SYS_SYSPROTO_H_
2793struct chmod_args {
2794	char	*path;
2795	int	mode;
2796};
2797#endif
2798int
2799sys_chmod(td, uap)
2800	struct thread *td;
2801	register struct chmod_args /* {
2802		char *path;
2803		int mode;
2804	} */ *uap;
2805{
2806
2807	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2808}
2809
2810#ifndef _SYS_SYSPROTO_H_
2811struct fchmodat_args {
2812	int	dirfd;
2813	char	*path;
2814	mode_t	mode;
2815	int	flag;
2816}
2817#endif
2818int
2819sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2820{
2821	int flag = uap->flag;
2822	int fd = uap->fd;
2823	char *path = uap->path;
2824	mode_t mode = uap->mode;
2825
2826	if (flag & ~AT_SYMLINK_NOFOLLOW)
2827		return (EINVAL);
2828
2829	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2830}
2831
2832int
2833kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2834{
2835
2836	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2837}
2838
2839/*
2840 * Change mode of a file given path name (don't follow links.)
2841 */
2842#ifndef _SYS_SYSPROTO_H_
2843struct lchmod_args {
2844	char	*path;
2845	int	mode;
2846};
2847#endif
2848int
2849sys_lchmod(td, uap)
2850	struct thread *td;
2851	register struct lchmod_args /* {
2852		char *path;
2853		int mode;
2854	} */ *uap;
2855{
2856
2857	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2858	    uap->mode, AT_SYMLINK_NOFOLLOW));
2859}
2860
2861int
2862kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2863    mode_t mode, int flag)
2864{
2865	struct nameidata nd;
2866	cap_rights_t rights;
2867	int error, follow;
2868
2869	AUDIT_ARG_MODE(mode);
2870	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2871	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2872	    cap_rights_init(&rights, CAP_FCHMOD), td);
2873	if ((error = namei(&nd)) != 0)
2874		return (error);
2875	NDFREE(&nd, NDF_ONLY_PNBUF);
2876	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2877	vrele(nd.ni_vp);
2878	return (error);
2879}
2880
2881/*
2882 * Change mode of a file given a file descriptor.
2883 */
2884#ifndef _SYS_SYSPROTO_H_
2885struct fchmod_args {
2886	int	fd;
2887	int	mode;
2888};
2889#endif
2890int
2891sys_fchmod(struct thread *td, struct fchmod_args *uap)
2892{
2893	struct file *fp;
2894	cap_rights_t rights;
2895	int error;
2896
2897	AUDIT_ARG_FD(uap->fd);
2898	AUDIT_ARG_MODE(uap->mode);
2899
2900	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2901	if (error != 0)
2902		return (error);
2903	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2904	fdrop(fp, td);
2905	return (error);
2906}
2907
2908/*
2909 * Common implementation for chown(), lchown(), and fchown()
2910 */
2911int
2912setfown(td, cred, vp, uid, gid)
2913	struct thread *td;
2914	struct ucred *cred;
2915	struct vnode *vp;
2916	uid_t uid;
2917	gid_t gid;
2918{
2919	struct mount *mp;
2920	struct vattr vattr;
2921	int error;
2922
2923	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2924		return (error);
2925	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2926	VATTR_NULL(&vattr);
2927	vattr.va_uid = uid;
2928	vattr.va_gid = gid;
2929#ifdef MAC
2930	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2931	    vattr.va_gid);
2932	if (error == 0)
2933#endif
2934		error = VOP_SETATTR(vp, &vattr, cred);
2935	VOP_UNLOCK(vp, 0);
2936	vn_finished_write(mp);
2937	return (error);
2938}
2939
2940/*
2941 * Set ownership given a path name.
2942 */
2943#ifndef _SYS_SYSPROTO_H_
2944struct chown_args {
2945	char	*path;
2946	int	uid;
2947	int	gid;
2948};
2949#endif
2950int
2951sys_chown(td, uap)
2952	struct thread *td;
2953	register struct chown_args /* {
2954		char *path;
2955		int uid;
2956		int gid;
2957	} */ *uap;
2958{
2959
2960	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
2961}
2962
2963#ifndef _SYS_SYSPROTO_H_
2964struct fchownat_args {
2965	int fd;
2966	const char * path;
2967	uid_t uid;
2968	gid_t gid;
2969	int flag;
2970};
2971#endif
2972int
2973sys_fchownat(struct thread *td, struct fchownat_args *uap)
2974{
2975	int flag;
2976
2977	flag = uap->flag;
2978	if (flag & ~AT_SYMLINK_NOFOLLOW)
2979		return (EINVAL);
2980
2981	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2982	    uap->gid, uap->flag));
2983}
2984
2985int
2986kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
2987    int gid)
2988{
2989
2990	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
2991}
2992
2993int
2994kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2995    int uid, int gid, int flag)
2996{
2997	struct nameidata nd;
2998	cap_rights_t rights;
2999	int error, follow;
3000
3001	AUDIT_ARG_OWNER(uid, gid);
3002	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
3003	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
3004	    cap_rights_init(&rights, CAP_FCHOWN), td);
3005
3006	if ((error = namei(&nd)) != 0)
3007		return (error);
3008	NDFREE(&nd, NDF_ONLY_PNBUF);
3009	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
3010	vrele(nd.ni_vp);
3011	return (error);
3012}
3013
3014/*
3015 * Set ownership given a path name, do not cross symlinks.
3016 */
3017#ifndef _SYS_SYSPROTO_H_
3018struct lchown_args {
3019	char	*path;
3020	int	uid;
3021	int	gid;
3022};
3023#endif
3024int
3025sys_lchown(td, uap)
3026	struct thread *td;
3027	register struct lchown_args /* {
3028		char *path;
3029		int uid;
3030		int gid;
3031	} */ *uap;
3032{
3033
3034	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3035}
3036
3037int
3038kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3039    int gid)
3040{
3041
3042	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3043	    AT_SYMLINK_NOFOLLOW));
3044}
3045
3046/*
3047 * Set ownership given a file descriptor.
3048 */
3049#ifndef _SYS_SYSPROTO_H_
3050struct fchown_args {
3051	int	fd;
3052	int	uid;
3053	int	gid;
3054};
3055#endif
3056int
3057sys_fchown(td, uap)
3058	struct thread *td;
3059	register struct fchown_args /* {
3060		int fd;
3061		int uid;
3062		int gid;
3063	} */ *uap;
3064{
3065	struct file *fp;
3066	cap_rights_t rights;
3067	int error;
3068
3069	AUDIT_ARG_FD(uap->fd);
3070	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3071	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
3072	if (error != 0)
3073		return (error);
3074	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3075	fdrop(fp, td);
3076	return (error);
3077}
3078
3079/*
3080 * Common implementation code for utimes(), lutimes(), and futimes().
3081 */
3082static int
3083getutimes(usrtvp, tvpseg, tsp)
3084	const struct timeval *usrtvp;
3085	enum uio_seg tvpseg;
3086	struct timespec *tsp;
3087{
3088	struct timeval tv[2];
3089	const struct timeval *tvp;
3090	int error;
3091
3092	if (usrtvp == NULL) {
3093		vfs_timestamp(&tsp[0]);
3094		tsp[1] = tsp[0];
3095	} else {
3096		if (tvpseg == UIO_SYSSPACE) {
3097			tvp = usrtvp;
3098		} else {
3099			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3100				return (error);
3101			tvp = tv;
3102		}
3103
3104		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3105		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3106			return (EINVAL);
3107		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3108		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3109	}
3110	return (0);
3111}
3112
3113/*
3114 * Common implementation code for utimes(), lutimes(), and futimes().
3115 */
3116static int
3117setutimes(td, vp, ts, numtimes, nullflag)
3118	struct thread *td;
3119	struct vnode *vp;
3120	const struct timespec *ts;
3121	int numtimes;
3122	int nullflag;
3123{
3124	struct mount *mp;
3125	struct vattr vattr;
3126	int error, setbirthtime;
3127
3128	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3129		return (error);
3130	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3131	setbirthtime = 0;
3132	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3133	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3134		setbirthtime = 1;
3135	VATTR_NULL(&vattr);
3136	vattr.va_atime = ts[0];
3137	vattr.va_mtime = ts[1];
3138	if (setbirthtime)
3139		vattr.va_birthtime = ts[1];
3140	if (numtimes > 2)
3141		vattr.va_birthtime = ts[2];
3142	if (nullflag)
3143		vattr.va_vaflags |= VA_UTIMES_NULL;
3144#ifdef MAC
3145	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3146	    vattr.va_mtime);
3147#endif
3148	if (error == 0)
3149		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3150	VOP_UNLOCK(vp, 0);
3151	vn_finished_write(mp);
3152	return (error);
3153}
3154
3155/*
3156 * Set the access and modification times of a file.
3157 */
3158#ifndef _SYS_SYSPROTO_H_
3159struct utimes_args {
3160	char	*path;
3161	struct	timeval *tptr;
3162};
3163#endif
3164int
3165sys_utimes(td, uap)
3166	struct thread *td;
3167	register struct utimes_args /* {
3168		char *path;
3169		struct timeval *tptr;
3170	} */ *uap;
3171{
3172
3173	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3174	    UIO_USERSPACE));
3175}
3176
3177#ifndef _SYS_SYSPROTO_H_
3178struct futimesat_args {
3179	int fd;
3180	const char * path;
3181	const struct timeval * times;
3182};
3183#endif
3184int
3185sys_futimesat(struct thread *td, struct futimesat_args *uap)
3186{
3187
3188	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3189	    uap->times, UIO_USERSPACE));
3190}
3191
3192int
3193kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3194    struct timeval *tptr, enum uio_seg tptrseg)
3195{
3196
3197	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3198}
3199
3200int
3201kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3202    struct timeval *tptr, enum uio_seg tptrseg)
3203{
3204	struct nameidata nd;
3205	struct timespec ts[2];
3206	cap_rights_t rights;
3207	int error;
3208
3209	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3210		return (error);
3211	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3212	    cap_rights_init(&rights, CAP_FUTIMES), td);
3213
3214	if ((error = namei(&nd)) != 0)
3215		return (error);
3216	NDFREE(&nd, NDF_ONLY_PNBUF);
3217	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3218	vrele(nd.ni_vp);
3219	return (error);
3220}
3221
3222/*
3223 * Set the access and modification times of a file.
3224 */
3225#ifndef _SYS_SYSPROTO_H_
3226struct lutimes_args {
3227	char	*path;
3228	struct	timeval *tptr;
3229};
3230#endif
3231int
3232sys_lutimes(td, uap)
3233	struct thread *td;
3234	register struct lutimes_args /* {
3235		char *path;
3236		struct timeval *tptr;
3237	} */ *uap;
3238{
3239
3240	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3241	    UIO_USERSPACE));
3242}
3243
3244int
3245kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3246    struct timeval *tptr, enum uio_seg tptrseg)
3247{
3248	struct timespec ts[2];
3249	struct nameidata nd;
3250	int error;
3251
3252	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3253		return (error);
3254	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3255	if ((error = namei(&nd)) != 0)
3256		return (error);
3257	NDFREE(&nd, NDF_ONLY_PNBUF);
3258	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3259	vrele(nd.ni_vp);
3260	return (error);
3261}
3262
3263/*
3264 * Set the access and modification times of a file.
3265 */
3266#ifndef _SYS_SYSPROTO_H_
3267struct futimes_args {
3268	int	fd;
3269	struct	timeval *tptr;
3270};
3271#endif
3272int
3273sys_futimes(td, uap)
3274	struct thread *td;
3275	register struct futimes_args /* {
3276		int  fd;
3277		struct timeval *tptr;
3278	} */ *uap;
3279{
3280
3281	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3282}
3283
3284int
3285kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3286    enum uio_seg tptrseg)
3287{
3288	struct timespec ts[2];
3289	struct file *fp;
3290	cap_rights_t rights;
3291	int error;
3292
3293	AUDIT_ARG_FD(fd);
3294	error = getutimes(tptr, tptrseg, ts);
3295	if (error != 0)
3296		return (error);
3297	error = getvnode(td->td_proc->p_fd, fd,
3298	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
3299	if (error != 0)
3300		return (error);
3301#ifdef AUDIT
3302	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3303	AUDIT_ARG_VNODE1(fp->f_vnode);
3304	VOP_UNLOCK(fp->f_vnode, 0);
3305#endif
3306	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3307	fdrop(fp, td);
3308	return (error);
3309}
3310
3311/*
3312 * Truncate a file given its path name.
3313 */
3314#ifndef _SYS_SYSPROTO_H_
3315struct truncate_args {
3316	char	*path;
3317	int	pad;
3318	off_t	length;
3319};
3320#endif
3321int
3322sys_truncate(td, uap)
3323	struct thread *td;
3324	register struct truncate_args /* {
3325		char *path;
3326		int pad;
3327		off_t length;
3328	} */ *uap;
3329{
3330
3331	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3332}
3333
3334int
3335kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3336{
3337	struct mount *mp;
3338	struct vnode *vp;
3339	void *rl_cookie;
3340	struct vattr vattr;
3341	struct nameidata nd;
3342	int error;
3343
3344	if (length < 0)
3345		return(EINVAL);
3346	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3347	if ((error = namei(&nd)) != 0)
3348		return (error);
3349	vp = nd.ni_vp;
3350	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3351	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3352		vn_rangelock_unlock(vp, rl_cookie);
3353		vrele(vp);
3354		return (error);
3355	}
3356	NDFREE(&nd, NDF_ONLY_PNBUF);
3357	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3358	if (vp->v_type == VDIR)
3359		error = EISDIR;
3360#ifdef MAC
3361	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3362	}
3363#endif
3364	else if ((error = vn_writechk(vp)) == 0 &&
3365	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3366		VATTR_NULL(&vattr);
3367		vattr.va_size = length;
3368		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3369	}
3370	VOP_UNLOCK(vp, 0);
3371	vn_finished_write(mp);
3372	vn_rangelock_unlock(vp, rl_cookie);
3373	vrele(vp);
3374	return (error);
3375}
3376
#if defined(COMPAT_43)
/*
 * Compatibility truncate taking a `long' length: repackages the arguments
 * into a truncate_args and calls sys_truncate().  The `pad' member of the
 * new argument structure is left unset.
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args /* {
		char *path;
		int pad;
		off_t length;
	} */ targs;

	targs.path = uap->path;
	targs.length = uap->length;
	return (sys_truncate(td, &targs));
}
#endif /* COMPAT_43 */
3406
3407/* Versions with the pad argument */
3408int
3409freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3410{
3411	struct truncate_args ouap;
3412
3413	ouap.path = uap->path;
3414	ouap.length = uap->length;
3415	return (sys_truncate(td, &ouap));
3416}
3417
3418int
3419freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3420{
3421	struct ftruncate_args ouap;
3422
3423	ouap.fd = uap->fd;
3424	ouap.length = uap->length;
3425	return (sys_ftruncate(td, &ouap));
3426}
3427
3428/*
3429 * Sync an open file.
3430 */
3431#ifndef _SYS_SYSPROTO_H_
3432struct fsync_args {
3433	int	fd;
3434};
3435#endif
3436int
3437sys_fsync(td, uap)
3438	struct thread *td;
3439	struct fsync_args /* {
3440		int fd;
3441	} */ *uap;
3442{
3443	struct vnode *vp;
3444	struct mount *mp;
3445	struct file *fp;
3446	cap_rights_t rights;
3447	int error, lock_flags;
3448
3449	AUDIT_ARG_FD(uap->fd);
3450	error = getvnode(td->td_proc->p_fd, uap->fd,
3451	    cap_rights_init(&rights, CAP_FSYNC), &fp);
3452	if (error != 0)
3453		return (error);
3454	vp = fp->f_vnode;
3455	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3456	if (error != 0)
3457		goto drop;
3458	if (MNT_SHARED_WRITES(mp) ||
3459	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3460		lock_flags = LK_SHARED;
3461	} else {
3462		lock_flags = LK_EXCLUSIVE;
3463	}
3464	vn_lock(vp, lock_flags | LK_RETRY);
3465	AUDIT_ARG_VNODE1(vp);
3466	if (vp->v_object != NULL) {
3467		VM_OBJECT_WLOCK(vp->v_object);
3468		vm_object_page_clean(vp->v_object, 0, 0, 0);
3469		VM_OBJECT_WUNLOCK(vp->v_object);
3470	}
3471	error = VOP_FSYNC(vp, MNT_WAIT, td);
3472
3473	VOP_UNLOCK(vp, 0);
3474	vn_finished_write(mp);
3475drop:
3476	fdrop(fp, td);
3477	return (error);
3478}
3479
3480/*
3481 * Rename files.  Source and destination must either both be directories, or
3482 * both not be directories.  If target is a directory, it must be empty.
3483 */
3484#ifndef _SYS_SYSPROTO_H_
3485struct rename_args {
3486	char	*from;
3487	char	*to;
3488};
3489#endif
3490int
3491sys_rename(td, uap)
3492	struct thread *td;
3493	register struct rename_args /* {
3494		char *from;
3495		char *to;
3496	} */ *uap;
3497{
3498
3499	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3500}
3501
3502#ifndef _SYS_SYSPROTO_H_
3503struct renameat_args {
3504	int	oldfd;
3505	char	*old;
3506	int	newfd;
3507	char	*new;
3508};
3509#endif
3510int
3511sys_renameat(struct thread *td, struct renameat_args *uap)
3512{
3513
3514	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3515	    UIO_USERSPACE));
3516}
3517
3518int
3519kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3520{
3521
3522	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3523}
3524
3525int
3526kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3527    enum uio_seg pathseg)
3528{
3529	struct mount *mp = NULL;
3530	struct vnode *tvp, *fvp, *tdvp;
3531	struct nameidata fromnd, tond;
3532	cap_rights_t rights;
3533	int error;
3534
3535again:
3536	bwillwrite();
3537#ifdef MAC
3538	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3539	    AUDITVNODE1, pathseg, old, oldfd,
3540	    cap_rights_init(&rights, CAP_RENAMEAT), td);
3541#else
3542	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3543	    pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
3544#endif
3545
3546	if ((error = namei(&fromnd)) != 0)
3547		return (error);
3548#ifdef MAC
3549	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3550	    fromnd.ni_vp, &fromnd.ni_cnd);
3551	VOP_UNLOCK(fromnd.ni_dvp, 0);
3552	if (fromnd.ni_dvp != fromnd.ni_vp)
3553		VOP_UNLOCK(fromnd.ni_vp, 0);
3554#endif
3555	fvp = fromnd.ni_vp;
3556	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3557	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3558	    cap_rights_init(&rights, CAP_LINKAT), td);
3559	if (fromnd.ni_vp->v_type == VDIR)
3560		tond.ni_cnd.cn_flags |= WILLBEDIR;
3561	if ((error = namei(&tond)) != 0) {
3562		/* Translate error code for rename("dir1", "dir2/."). */
3563		if (error == EISDIR && fvp->v_type == VDIR)
3564			error = EINVAL;
3565		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3566		vrele(fromnd.ni_dvp);
3567		vrele(fvp);
3568		goto out1;
3569	}
3570	tdvp = tond.ni_dvp;
3571	tvp = tond.ni_vp;
3572	error = vn_start_write(fvp, &mp, V_NOWAIT);
3573	if (error != 0) {
3574		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3575		NDFREE(&tond, NDF_ONLY_PNBUF);
3576		if (tvp != NULL)
3577			vput(tvp);
3578		if (tdvp == tvp)
3579			vrele(tdvp);
3580		else
3581			vput(tdvp);
3582		vrele(fromnd.ni_dvp);
3583		vrele(fvp);
3584		vrele(tond.ni_startdir);
3585		if (fromnd.ni_startdir != NULL)
3586			vrele(fromnd.ni_startdir);
3587		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
3588		if (error != 0)
3589			return (error);
3590		goto again;
3591	}
3592	if (tvp != NULL) {
3593		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3594			error = ENOTDIR;
3595			goto out;
3596		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3597			error = EISDIR;
3598			goto out;
3599		}
3600#ifdef CAPABILITIES
3601		if (newfd != AT_FDCWD) {
3602			/*
3603			 * If the target already exists we require CAP_UNLINKAT
3604			 * from 'newfd'.
3605			 */
3606			error = cap_check(&tond.ni_filecaps.fc_rights,
3607			    cap_rights_init(&rights, CAP_UNLINKAT));
3608			if (error != 0)
3609				goto out;
3610		}
3611#endif
3612	}
3613	if (fvp == tdvp) {
3614		error = EINVAL;
3615		goto out;
3616	}
3617	/*
3618	 * If the source is the same as the destination (that is, if they
3619	 * are links to the same vnode), then there is nothing to do.
3620	 */
3621	if (fvp == tvp)
3622		error = -1;
3623#ifdef MAC
3624	else
3625		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3626		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3627#endif
3628out:
3629	if (error == 0) {
3630		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3631		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3632		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3633		NDFREE(&tond, NDF_ONLY_PNBUF);
3634	} else {
3635		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3636		NDFREE(&tond, NDF_ONLY_PNBUF);
3637		if (tvp != NULL)
3638			vput(tvp);
3639		if (tdvp == tvp)
3640			vrele(tdvp);
3641		else
3642			vput(tdvp);
3643		vrele(fromnd.ni_dvp);
3644		vrele(fvp);
3645	}
3646	vrele(tond.ni_startdir);
3647	vn_finished_write(mp);
3648out1:
3649	if (fromnd.ni_startdir)
3650		vrele(fromnd.ni_startdir);
3651	if (error == -1)
3652		return (0);
3653	return (error);
3654}
3655
3656/*
3657 * Make a directory file.
3658 */
3659#ifndef _SYS_SYSPROTO_H_
3660struct mkdir_args {
3661	char	*path;
3662	int	mode;
3663};
3664#endif
3665int
3666sys_mkdir(td, uap)
3667	struct thread *td;
3668	register struct mkdir_args /* {
3669		char *path;
3670		int mode;
3671	} */ *uap;
3672{
3673
3674	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3675}
3676
3677#ifndef _SYS_SYSPROTO_H_
3678struct mkdirat_args {
3679	int	fd;
3680	char	*path;
3681	mode_t	mode;
3682};
3683#endif
3684int
3685sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3686{
3687
3688	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3689}
3690
3691int
3692kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3693{
3694
3695	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3696}
3697
3698int
3699kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3700    int mode)
3701{
3702	struct mount *mp;
3703	struct vnode *vp;
3704	struct vattr vattr;
3705	struct nameidata nd;
3706	cap_rights_t rights;
3707	int error;
3708
3709	AUDIT_ARG_MODE(mode);
3710restart:
3711	bwillwrite();
3712	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
3713	    NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
3714	    td);
3715	nd.ni_cnd.cn_flags |= WILLBEDIR;
3716	if ((error = namei(&nd)) != 0)
3717		return (error);
3718	vp = nd.ni_vp;
3719	if (vp != NULL) {
3720		NDFREE(&nd, NDF_ONLY_PNBUF);
3721		/*
3722		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
3723		 * the strange behaviour of leaving the vnode unlocked
3724		 * if the target is the same vnode as the parent.
3725		 */
3726		if (vp == nd.ni_dvp)
3727			vrele(nd.ni_dvp);
3728		else
3729			vput(nd.ni_dvp);
3730		vrele(vp);
3731		return (EEXIST);
3732	}
3733	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3734		NDFREE(&nd, NDF_ONLY_PNBUF);
3735		vput(nd.ni_dvp);
3736		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3737			return (error);
3738		goto restart;
3739	}
3740	VATTR_NULL(&vattr);
3741	vattr.va_type = VDIR;
3742	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3743#ifdef MAC
3744	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3745	    &vattr);
3746	if (error != 0)
3747		goto out;
3748#endif
3749	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3750#ifdef MAC
3751out:
3752#endif
3753	NDFREE(&nd, NDF_ONLY_PNBUF);
3754	vput(nd.ni_dvp);
3755	if (error == 0)
3756		vput(nd.ni_vp);
3757	vn_finished_write(mp);
3758	return (error);
3759}
3760
3761/*
3762 * Remove a directory file.
3763 */
3764#ifndef _SYS_SYSPROTO_H_
3765struct rmdir_args {
3766	char	*path;
3767};
3768#endif
3769int
3770sys_rmdir(td, uap)
3771	struct thread *td;
3772	struct rmdir_args /* {
3773		char *path;
3774	} */ *uap;
3775{
3776
3777	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3778}
3779
3780int
3781kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3782{
3783
3784	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3785}
3786
3787int
3788kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3789{
3790	struct mount *mp;
3791	struct vnode *vp;
3792	struct nameidata nd;
3793	cap_rights_t rights;
3794	int error;
3795
3796restart:
3797	bwillwrite();
3798	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3799	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
3800	if ((error = namei(&nd)) != 0)
3801		return (error);
3802	vp = nd.ni_vp;
3803	if (vp->v_type != VDIR) {
3804		error = ENOTDIR;
3805		goto out;
3806	}
3807	/*
3808	 * No rmdir "." please.
3809	 */
3810	if (nd.ni_dvp == vp) {
3811		error = EINVAL;
3812		goto out;
3813	}
3814	/*
3815	 * The root of a mounted filesystem cannot be deleted.
3816	 */
3817	if (vp->v_vflag & VV_ROOT) {
3818		error = EBUSY;
3819		goto out;
3820	}
3821#ifdef MAC
3822	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3823	    &nd.ni_cnd);
3824	if (error != 0)
3825		goto out;
3826#endif
3827	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3828		NDFREE(&nd, NDF_ONLY_PNBUF);
3829		vput(vp);
3830		if (nd.ni_dvp == vp)
3831			vrele(nd.ni_dvp);
3832		else
3833			vput(nd.ni_dvp);
3834		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3835			return (error);
3836		goto restart;
3837	}
3838	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
3839	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3840	vn_finished_write(mp);
3841out:
3842	NDFREE(&nd, NDF_ONLY_PNBUF);
3843	vput(vp);
3844	if (nd.ni_dvp == vp)
3845		vrele(nd.ni_dvp);
3846	else
3847		vput(nd.ni_dvp);
3848	return (error);
3849}
3850
3851#ifdef COMPAT_43
3852/*
3853 * Read a block of directory entries in a filesystem independent format.
3854 */
3855#ifndef _SYS_SYSPROTO_H_
3856struct ogetdirentries_args {
3857	int	fd;
3858	char	*buf;
3859	u_int	count;
3860	long	*basep;
3861};
3862#endif
3863int
3864ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3865{
3866	long loff;
3867	int error;
3868
3869	error = kern_ogetdirentries(td, uap, &loff);
3870	if (error == 0)
3871		error = copyout(&loff, uap->basep, sizeof(long));
3872	return (error);
3873}
3874
3875int
3876kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3877    long *ploff)
3878{
3879	struct vnode *vp;
3880	struct file *fp;
3881	struct uio auio, kuio;
3882	struct iovec aiov, kiov;
3883	struct dirent *dp, *edp;
3884	cap_rights_t rights;
3885	caddr_t dirbuf;
3886	int error, eofflag, readcnt;
3887	long loff;
3888	off_t foffset;
3889
3890	/* XXX arbitrary sanity limit on `count'. */
3891	if (uap->count > 64 * 1024)
3892		return (EINVAL);
3893	error = getvnode(td->td_proc->p_fd, uap->fd,
3894	    cap_rights_init(&rights, CAP_READ), &fp);
3895	if (error != 0)
3896		return (error);
3897	if ((fp->f_flag & FREAD) == 0) {
3898		fdrop(fp, td);
3899		return (EBADF);
3900	}
3901	vp = fp->f_vnode;
3902	foffset = foffset_lock(fp, 0);
3903unionread:
3904	if (vp->v_type != VDIR) {
3905		foffset_unlock(fp, foffset, 0);
3906		fdrop(fp, td);
3907		return (EINVAL);
3908	}
3909	aiov.iov_base = uap->buf;
3910	aiov.iov_len = uap->count;
3911	auio.uio_iov = &aiov;
3912	auio.uio_iovcnt = 1;
3913	auio.uio_rw = UIO_READ;
3914	auio.uio_segflg = UIO_USERSPACE;
3915	auio.uio_td = td;
3916	auio.uio_resid = uap->count;
3917	vn_lock(vp, LK_SHARED | LK_RETRY);
3918	loff = auio.uio_offset = foffset;
3919#ifdef MAC
3920	error = mac_vnode_check_readdir(td->td_ucred, vp);
3921	if (error != 0) {
3922		VOP_UNLOCK(vp, 0);
3923		foffset_unlock(fp, foffset, FOF_NOUPDATE);
3924		fdrop(fp, td);
3925		return (error);
3926	}
3927#endif
3928#	if (BYTE_ORDER != LITTLE_ENDIAN)
3929		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
3930			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
3931			    NULL, NULL);
3932			foffset = auio.uio_offset;
3933		} else
3934#	endif
3935	{
3936		kuio = auio;
3937		kuio.uio_iov = &kiov;
3938		kuio.uio_segflg = UIO_SYSSPACE;
3939		kiov.iov_len = uap->count;
3940		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
3941		kiov.iov_base = dirbuf;
3942		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
3943			    NULL, NULL);
3944		foffset = kuio.uio_offset;
3945		if (error == 0) {
3946			readcnt = uap->count - kuio.uio_resid;
3947			edp = (struct dirent *)&dirbuf[readcnt];
3948			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
3949#				if (BYTE_ORDER == LITTLE_ENDIAN)
3950					/*
3951					 * The expected low byte of
3952					 * dp->d_namlen is our dp->d_type.
3953					 * The high MBZ byte of dp->d_namlen
3954					 * is our dp->d_namlen.
3955					 */
3956					dp->d_type = dp->d_namlen;
3957					dp->d_namlen = 0;
3958#				else
3959					/*
3960					 * The dp->d_type is the high byte
3961					 * of the expected dp->d_namlen,
3962					 * so must be zero'ed.
3963					 */
3964					dp->d_type = 0;
3965#				endif
3966				if (dp->d_reclen > 0) {
3967					dp = (struct dirent *)
3968					    ((char *)dp + dp->d_reclen);
3969				} else {
3970					error = EIO;
3971					break;
3972				}
3973			}
3974			if (dp >= edp)
3975				error = uiomove(dirbuf, readcnt, &auio);
3976		}
3977		free(dirbuf, M_TEMP);
3978	}
3979	if (error != 0) {
3980		VOP_UNLOCK(vp, 0);
3981		foffset_unlock(fp, foffset, 0);
3982		fdrop(fp, td);
3983		return (error);
3984	}
3985	if (uap->count == auio.uio_resid &&
3986	    (vp->v_vflag & VV_ROOT) &&
3987	    (vp->v_mount->mnt_flag & MNT_UNION)) {
3988		struct vnode *tvp = vp;
3989		vp = vp->v_mount->mnt_vnodecovered;
3990		VREF(vp);
3991		fp->f_vnode = vp;
3992		fp->f_data = vp;
3993		foffset = 0;
3994		vput(tvp);
3995		goto unionread;
3996	}
3997	VOP_UNLOCK(vp, 0);
3998	foffset_unlock(fp, foffset, 0);
3999	fdrop(fp, td);
4000	td->td_retval[0] = uap->count - auio.uio_resid;
4001	if (error == 0)
4002		*ploff = loff;
4003	return (error);
4004}
4005#endif /* COMPAT_43 */
4006
4007/*
4008 * Read a block of directory entries in a filesystem independent format.
4009 */
4010#ifndef _SYS_SYSPROTO_H_
4011struct getdirentries_args {
4012	int	fd;
4013	char	*buf;
4014	u_int	count;
4015	long	*basep;
4016};
4017#endif
4018int
4019sys_getdirentries(td, uap)
4020	struct thread *td;
4021	register struct getdirentries_args /* {
4022		int fd;
4023		char *buf;
4024		u_int count;
4025		long *basep;
4026	} */ *uap;
4027{
4028	long base;
4029	int error;
4030
4031	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4032	    NULL, UIO_USERSPACE);
4033	if (error != 0)
4034		return (error);
4035	if (uap->basep != NULL)
4036		error = copyout(&base, uap->basep, sizeof(long));
4037	return (error);
4038}
4039
4040int
4041kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4042    long *basep, ssize_t *residp, enum uio_seg bufseg)
4043{
4044	struct vnode *vp;
4045	struct file *fp;
4046	struct uio auio;
4047	struct iovec aiov;
4048	cap_rights_t rights;
4049	long loff;
4050	int error, eofflag;
4051	off_t foffset;
4052
4053	AUDIT_ARG_FD(fd);
4054	if (count > IOSIZE_MAX)
4055		return (EINVAL);
4056	auio.uio_resid = count;
4057	error = getvnode(td->td_proc->p_fd, fd,
4058	    cap_rights_init(&rights, CAP_READ), &fp);
4059	if (error != 0)
4060		return (error);
4061	if ((fp->f_flag & FREAD) == 0) {
4062		fdrop(fp, td);
4063		return (EBADF);
4064	}
4065	vp = fp->f_vnode;
4066	foffset = foffset_lock(fp, 0);
4067unionread:
4068	if (vp->v_type != VDIR) {
4069		error = EINVAL;
4070		goto fail;
4071	}
4072	aiov.iov_base = buf;
4073	aiov.iov_len = count;
4074	auio.uio_iov = &aiov;
4075	auio.uio_iovcnt = 1;
4076	auio.uio_rw = UIO_READ;
4077	auio.uio_segflg = bufseg;
4078	auio.uio_td = td;
4079	vn_lock(vp, LK_SHARED | LK_RETRY);
4080	AUDIT_ARG_VNODE1(vp);
4081	loff = auio.uio_offset = foffset;
4082#ifdef MAC
4083	error = mac_vnode_check_readdir(td->td_ucred, vp);
4084	if (error == 0)
4085#endif
4086		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4087		    NULL);
4088	foffset = auio.uio_offset;
4089	if (error != 0) {
4090		VOP_UNLOCK(vp, 0);
4091		goto fail;
4092	}
4093	if (count == auio.uio_resid &&
4094	    (vp->v_vflag & VV_ROOT) &&
4095	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4096		struct vnode *tvp = vp;
4097
4098		vp = vp->v_mount->mnt_vnodecovered;
4099		VREF(vp);
4100		fp->f_vnode = vp;
4101		fp->f_data = vp;
4102		foffset = 0;
4103		vput(tvp);
4104		goto unionread;
4105	}
4106	VOP_UNLOCK(vp, 0);
4107	*basep = loff;
4108	if (residp != NULL)
4109		*residp = auio.uio_resid;
4110	td->td_retval[0] = count - auio.uio_resid;
4111fail:
4112	foffset_unlock(fp, foffset, 0);
4113	fdrop(fp, td);
4114	return (error);
4115}
4116
4117#ifndef _SYS_SYSPROTO_H_
4118struct getdents_args {
4119	int fd;
4120	char *buf;
4121	size_t count;
4122};
4123#endif
4124int
4125sys_getdents(td, uap)
4126	struct thread *td;
4127	register struct getdents_args /* {
4128		int fd;
4129		char *buf;
4130		u_int count;
4131	} */ *uap;
4132{
4133	struct getdirentries_args ap;
4134
4135	ap.fd = uap->fd;
4136	ap.buf = uap->buf;
4137	ap.count = uap->count;
4138	ap.basep = NULL;
4139	return (sys_getdirentries(td, &ap));
4140}
4141
4142/*
4143 * Set the mode mask for creation of filesystem nodes.
4144 */
4145#ifndef _SYS_SYSPROTO_H_
4146struct umask_args {
4147	int	newmask;
4148};
4149#endif
4150int
4151sys_umask(td, uap)
4152	struct thread *td;
4153	struct umask_args /* {
4154		int newmask;
4155	} */ *uap;
4156{
4157	register struct filedesc *fdp;
4158
4159	FILEDESC_XLOCK(td->td_proc->p_fd);
4160	fdp = td->td_proc->p_fd;
4161	td->td_retval[0] = fdp->fd_cmask;
4162	fdp->fd_cmask = uap->newmask & ALLPERMS;
4163	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4164	return (0);
4165}
4166
4167/*
4168 * Void all references to file by ripping underlying filesystem away from
4169 * vnode.
4170 */
4171#ifndef _SYS_SYSPROTO_H_
4172struct revoke_args {
4173	char	*path;
4174};
4175#endif
4176int
4177sys_revoke(td, uap)
4178	struct thread *td;
4179	register struct revoke_args /* {
4180		char *path;
4181	} */ *uap;
4182{
4183	struct vnode *vp;
4184	struct vattr vattr;
4185	struct nameidata nd;
4186	int error;
4187
4188	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4189	    uap->path, td);
4190	if ((error = namei(&nd)) != 0)
4191		return (error);
4192	vp = nd.ni_vp;
4193	NDFREE(&nd, NDF_ONLY_PNBUF);
4194	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4195		error = EINVAL;
4196		goto out;
4197	}
4198#ifdef MAC
4199	error = mac_vnode_check_revoke(td->td_ucred, vp);
4200	if (error != 0)
4201		goto out;
4202#endif
4203	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4204	if (error != 0)
4205		goto out;
4206	if (td->td_ucred->cr_uid != vattr.va_uid) {
4207		error = priv_check(td, PRIV_VFS_ADMIN);
4208		if (error != 0)
4209			goto out;
4210	}
4211	if (vcount(vp) > 1)
4212		VOP_REVOKE(vp, REVOKEALL);
4213out:
4214	vput(vp);
4215	return (error);
4216}
4217
4218/*
4219 * Convert a user file descriptor to a kernel file entry and check that, if it
4220 * is a capability, the correct rights are present. A reference on the file
4221 * entry is held upon returning.
4222 */
4223int
4224getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
4225{
4226	struct file *fp;
4227	int error;
4228
4229	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
4230	if (error != 0)
4231		return (error);
4232
4233	/*
4234	 * The file could be not of the vnode type, or it may be not
4235	 * yet fully initialized, in which case the f_vnode pointer
4236	 * may be set, but f_ops is still badfileops.  E.g.,
4237	 * devfs_open() transiently create such situation to
4238	 * facilitate csw d_fdopen().
4239	 *
4240	 * Dupfdopen() handling in kern_openat() installs the
4241	 * half-baked file into the process descriptor table, allowing
4242	 * other thread to dereference it. Guard against the race by
4243	 * checking f_ops.
4244	 */
4245	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4246		fdrop(fp, curthread);
4247		return (EINVAL);
4248	}
4249	*fpp = fp;
4250	return (0);
4251}
4252
4253
4254/*
4255 * Get an (NFS) file handle.
4256 */
4257#ifndef _SYS_SYSPROTO_H_
4258struct lgetfh_args {
4259	char	*fname;
4260	fhandle_t *fhp;
4261};
4262#endif
4263int
4264sys_lgetfh(td, uap)
4265	struct thread *td;
4266	register struct lgetfh_args *uap;
4267{
4268	struct nameidata nd;
4269	fhandle_t fh;
4270	register struct vnode *vp;
4271	int error;
4272
4273	error = priv_check(td, PRIV_VFS_GETFH);
4274	if (error != 0)
4275		return (error);
4276	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4277	    uap->fname, td);
4278	error = namei(&nd);
4279	if (error != 0)
4280		return (error);
4281	NDFREE(&nd, NDF_ONLY_PNBUF);
4282	vp = nd.ni_vp;
4283	bzero(&fh, sizeof(fh));
4284	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4285	error = VOP_VPTOFH(vp, &fh.fh_fid);
4286	vput(vp);
4287	if (error == 0)
4288		error = copyout(&fh, uap->fhp, sizeof (fh));
4289	return (error);
4290}
4291
4292#ifndef _SYS_SYSPROTO_H_
4293struct getfh_args {
4294	char	*fname;
4295	fhandle_t *fhp;
4296};
4297#endif
4298int
4299sys_getfh(td, uap)
4300	struct thread *td;
4301	register struct getfh_args *uap;
4302{
4303	struct nameidata nd;
4304	fhandle_t fh;
4305	register struct vnode *vp;
4306	int error;
4307
4308	error = priv_check(td, PRIV_VFS_GETFH);
4309	if (error != 0)
4310		return (error);
4311	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4312	    uap->fname, td);
4313	error = namei(&nd);
4314	if (error != 0)
4315		return (error);
4316	NDFREE(&nd, NDF_ONLY_PNBUF);
4317	vp = nd.ni_vp;
4318	bzero(&fh, sizeof(fh));
4319	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4320	error = VOP_VPTOFH(vp, &fh.fh_fid);
4321	vput(vp);
4322	if (error == 0)
4323		error = copyout(&fh, uap->fhp, sizeof (fh));
4324	return (error);
4325}
4326
4327/*
4328 * syscall for the rpc.lockd to use to translate a NFS file handle into an
4329 * open descriptor.
4330 *
4331 * warning: do not remove the priv_check() call or this becomes one giant
4332 * security hole.
4333 */
4334#ifndef _SYS_SYSPROTO_H_
4335struct fhopen_args {
4336	const struct fhandle *u_fhp;
4337	int flags;
4338};
4339#endif
4340int
4341sys_fhopen(td, uap)
4342	struct thread *td;
4343	struct fhopen_args /* {
4344		const struct fhandle *u_fhp;
4345		int flags;
4346	} */ *uap;
4347{
4348	struct mount *mp;
4349	struct vnode *vp;
4350	struct fhandle fhp;
4351	struct file *fp;
4352	int fmode, error;
4353	int indx;
4354
4355	error = priv_check(td, PRIV_VFS_FHOPEN);
4356	if (error != 0)
4357		return (error);
4358	indx = -1;
4359	fmode = FFLAGS(uap->flags);
4360	/* why not allow a non-read/write open for our lockd? */
4361	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4362		return (EINVAL);
4363	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4364	if (error != 0)
4365		return(error);
4366	/* find the mount point */
4367	mp = vfs_busyfs(&fhp.fh_fsid);
4368	if (mp == NULL)
4369		return (ESTALE);
4370	/* now give me my vnode, it gets returned to me locked */
4371	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4372	vfs_unbusy(mp);
4373	if (error != 0)
4374		return (error);
4375
4376	error = falloc_noinstall(td, &fp);
4377	if (error != 0) {
4378		vput(vp);
4379		return (error);
4380	}
4381	/*
4382	 * An extra reference on `fp' has been held for us by
4383	 * falloc_noinstall().
4384	 */
4385
4386#ifdef INVARIANTS
4387	td->td_dupfd = -1;
4388#endif
4389	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4390	if (error != 0) {
4391		KASSERT(fp->f_ops == &badfileops,
4392		    ("VOP_OPEN in fhopen() set f_ops"));
4393		KASSERT(td->td_dupfd < 0,
4394		    ("fhopen() encountered fdopen()"));
4395
4396		vput(vp);
4397		goto bad;
4398	}
4399#ifdef INVARIANTS
4400	td->td_dupfd = 0;
4401#endif
4402	fp->f_vnode = vp;
4403	fp->f_seqcount = 1;
4404	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4405	    &vnops);
4406	VOP_UNLOCK(vp, 0);
4407	if ((fmode & O_TRUNC) != 0) {
4408		error = fo_truncate(fp, 0, td->td_ucred, td);
4409		if (error != 0)
4410			goto bad;
4411	}
4412
4413	error = finstall(td, fp, &indx, fmode, NULL);
4414bad:
4415	fdrop(fp, td);
4416	td->td_retval[0] = indx;
4417	return (error);
4418}
4419
4420/*
4421 * Stat an (NFS) file handle.
4422 */
4423#ifndef _SYS_SYSPROTO_H_
4424struct fhstat_args {
4425	struct fhandle *u_fhp;
4426	struct stat *sb;
4427};
4428#endif
4429int
4430sys_fhstat(td, uap)
4431	struct thread *td;
4432	register struct fhstat_args /* {
4433		struct fhandle *u_fhp;
4434		struct stat *sb;
4435	} */ *uap;
4436{
4437	struct stat sb;
4438	struct fhandle fh;
4439	int error;
4440
4441	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4442	if (error != 0)
4443		return (error);
4444	error = kern_fhstat(td, fh, &sb);
4445	if (error == 0)
4446		error = copyout(&sb, uap->sb, sizeof(sb));
4447	return (error);
4448}
4449
4450int
4451kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4452{
4453	struct mount *mp;
4454	struct vnode *vp;
4455	int error;
4456
4457	error = priv_check(td, PRIV_VFS_FHSTAT);
4458	if (error != 0)
4459		return (error);
4460	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4461		return (ESTALE);
4462	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4463	vfs_unbusy(mp);
4464	if (error != 0)
4465		return (error);
4466	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4467	vput(vp);
4468	return (error);
4469}
4470
4471/*
4472 * Implement fstatfs() for (NFS) file handles.
4473 */
4474#ifndef _SYS_SYSPROTO_H_
4475struct fhstatfs_args {
4476	struct fhandle *u_fhp;
4477	struct statfs *buf;
4478};
4479#endif
4480int
4481sys_fhstatfs(td, uap)
4482	struct thread *td;
4483	struct fhstatfs_args /* {
4484		struct fhandle *u_fhp;
4485		struct statfs *buf;
4486	} */ *uap;
4487{
4488	struct statfs sf;
4489	fhandle_t fh;
4490	int error;
4491
4492	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4493	if (error != 0)
4494		return (error);
4495	error = kern_fhstatfs(td, fh, &sf);
4496	if (error != 0)
4497		return (error);
4498	return (copyout(&sf, uap->buf, sizeof(sf)));
4499}
4500
4501int
4502kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4503{
4504	struct statfs *sp;
4505	struct mount *mp;
4506	struct vnode *vp;
4507	int error;
4508
4509	error = priv_check(td, PRIV_VFS_FHSTATFS);
4510	if (error != 0)
4511		return (error);
4512	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4513		return (ESTALE);
4514	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4515	if (error != 0) {
4516		vfs_unbusy(mp);
4517		return (error);
4518	}
4519	vput(vp);
4520	error = prison_canseemount(td->td_ucred, mp);
4521	if (error != 0)
4522		goto out;
4523#ifdef MAC
4524	error = mac_mount_check_stat(td->td_ucred, mp);
4525	if (error != 0)
4526		goto out;
4527#endif
4528	/*
4529	 * Set these in case the underlying filesystem fails to do so.
4530	 */
4531	sp = &mp->mnt_stat;
4532	sp->f_version = STATFS_VERSION;
4533	sp->f_namemax = NAME_MAX;
4534	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4535	error = VFS_STATFS(mp, sp);
4536	if (error == 0)
4537		*buf = *sp;
4538out:
4539	vfs_unbusy(mp);
4540	return (error);
4541}
4542
4543int
4544kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4545{
4546	struct file *fp;
4547	struct mount *mp;
4548	struct vnode *vp;
4549	cap_rights_t rights;
4550	off_t olen, ooffset;
4551	int error;
4552
4553	if (offset < 0 || len <= 0)
4554		return (EINVAL);
4555	/* Check for wrap. */
4556	if (offset > OFF_MAX - len)
4557		return (EFBIG);
4558	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4559	if (error != 0)
4560		return (error);
4561	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4562		error = ESPIPE;
4563		goto out;
4564	}
4565	if ((fp->f_flag & FWRITE) == 0) {
4566		error = EBADF;
4567		goto out;
4568	}
4569	if (fp->f_type != DTYPE_VNODE) {
4570		error = ENODEV;
4571		goto out;
4572	}
4573	vp = fp->f_vnode;
4574	if (vp->v_type != VREG) {
4575		error = ENODEV;
4576		goto out;
4577	}
4578
4579	/* Allocating blocks may take a long time, so iterate. */
4580	for (;;) {
4581		olen = len;
4582		ooffset = offset;
4583
4584		bwillwrite();
4585		mp = NULL;
4586		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4587		if (error != 0)
4588			break;
4589		error = vn_lock(vp, LK_EXCLUSIVE);
4590		if (error != 0) {
4591			vn_finished_write(mp);
4592			break;
4593		}
4594#ifdef MAC
4595		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4596		if (error == 0)
4597#endif
4598			error = VOP_ALLOCATE(vp, &offset, &len);
4599		VOP_UNLOCK(vp, 0);
4600		vn_finished_write(mp);
4601
4602		if (olen + ooffset != offset + len) {
4603			panic("offset + len changed from %jx/%jx to %jx/%jx",
4604			    ooffset, olen, offset, len);
4605		}
4606		if (error != 0 || len == 0)
4607			break;
4608		KASSERT(olen > len, ("Iteration did not make progress?"));
4609		maybe_yield();
4610	}
4611 out:
4612	fdrop(fp, td);
4613	return (error);
4614}
4615
4616int
4617sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4618{
4619
4620	td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
4621	    uap->len);
4622	return (0);
4623}
4624
4625/*
4626 * Unlike madvise(2), we do not make a best effort to remember every
4627 * possible caching hint.  Instead, we remember the last setting with
4628 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4629 * region of any current setting.
4630 */
4631int
4632kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4633    int advice)
4634{
4635	struct fadvise_info *fa, *new;
4636	struct file *fp;
4637	struct vnode *vp;
4638	cap_rights_t rights;
4639	off_t end;
4640	int error;
4641
4642	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4643		return (EINVAL);
4644	switch (advice) {
4645	case POSIX_FADV_SEQUENTIAL:
4646	case POSIX_FADV_RANDOM:
4647	case POSIX_FADV_NOREUSE:
4648		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4649		break;
4650	case POSIX_FADV_NORMAL:
4651	case POSIX_FADV_WILLNEED:
4652	case POSIX_FADV_DONTNEED:
4653		new = NULL;
4654		break;
4655	default:
4656		return (EINVAL);
4657	}
4658	/* XXX: CAP_POSIX_FADVISE? */
4659	error = fget(td, fd, cap_rights_init(&rights), &fp);
4660	if (error != 0)
4661		goto out;
4662	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4663		error = ESPIPE;
4664		goto out;
4665	}
4666	if (fp->f_type != DTYPE_VNODE) {
4667		error = ENODEV;
4668		goto out;
4669	}
4670	vp = fp->f_vnode;
4671	if (vp->v_type != VREG) {
4672		error = ENODEV;
4673		goto out;
4674	}
4675	if (len == 0)
4676		end = OFF_MAX;
4677	else
4678		end = offset + len - 1;
4679	switch (advice) {
4680	case POSIX_FADV_SEQUENTIAL:
4681	case POSIX_FADV_RANDOM:
4682	case POSIX_FADV_NOREUSE:
4683		/*
4684		 * Try to merge any existing non-standard region with
4685		 * this new region if possible, otherwise create a new
4686		 * non-standard region for this request.
4687		 */
4688		mtx_pool_lock(mtxpool_sleep, fp);
4689		fa = fp->f_advice;
4690		if (fa != NULL && fa->fa_advice == advice &&
4691		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4692		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4693		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4694			if (offset < fa->fa_start)
4695				fa->fa_start = offset;
4696			if (end > fa->fa_end)
4697				fa->fa_end = end;
4698		} else {
4699			new->fa_advice = advice;
4700			new->fa_start = offset;
4701			new->fa_end = end;
4702			new->fa_prevstart = 0;
4703			new->fa_prevend = 0;
4704			fp->f_advice = new;
4705			new = fa;
4706		}
4707		mtx_pool_unlock(mtxpool_sleep, fp);
4708		break;
4709	case POSIX_FADV_NORMAL:
4710		/*
4711		 * If a the "normal" region overlaps with an existing
4712		 * non-standard region, trim or remove the
4713		 * non-standard region.
4714		 */
4715		mtx_pool_lock(mtxpool_sleep, fp);
4716		fa = fp->f_advice;
4717		if (fa != NULL) {
4718			if (offset <= fa->fa_start && end >= fa->fa_end) {
4719				new = fa;
4720				fp->f_advice = NULL;
4721			} else if (offset <= fa->fa_start &&
4722			    end >= fa->fa_start)
4723				fa->fa_start = end + 1;
4724			else if (offset <= fa->fa_end && end >= fa->fa_end)
4725				fa->fa_end = offset - 1;
4726			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4727				/*
4728				 * If the "normal" region is a middle
4729				 * portion of the existing
4730				 * non-standard region, just remove
4731				 * the whole thing rather than picking
4732				 * one side or the other to
4733				 * preserve.
4734				 */
4735				new = fa;
4736				fp->f_advice = NULL;
4737			}
4738		}
4739		mtx_pool_unlock(mtxpool_sleep, fp);
4740		break;
4741	case POSIX_FADV_WILLNEED:
4742	case POSIX_FADV_DONTNEED:
4743		error = VOP_ADVISE(vp, offset, end, advice);
4744		break;
4745	}
4746out:
4747	if (fp != NULL)
4748		fdrop(fp, td);
4749	free(new, M_FADVISE);
4750	return (error);
4751}
4752
4753int
4754sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4755{
4756
4757	td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
4758	    uap->len, uap->advice);
4759	return (0);
4760}
4761