vfs_syscalls.c revision 269283
155714Skris/*-
255714Skris * Copyright (c) 1989, 1993
355714Skris *	The Regents of the University of California.  All rights reserved.
455714Skris * (c) UNIX System Laboratories, Inc.
555714Skris * All or some portions of this file are derived from material licensed
655714Skris * to the University of California by American Telephone and Telegraph
755714Skris * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8280304Sjkim * the permission of UNIX System Laboratories, Inc.
955714Skris *
1055714Skris * Redistribution and use in source and binary forms, with or without
1155714Skris * modification, are permitted provided that the following conditions
1255714Skris * are met:
1355714Skris * 1. Redistributions of source code must retain the above copyright
1455714Skris *    notice, this list of conditions and the following disclaimer.
15280304Sjkim * 2. Redistributions in binary form must reproduce the above copyright
1655714Skris *    notice, this list of conditions and the following disclaimer in the
1755714Skris *    documentation and/or other materials provided with the distribution.
1855714Skris * 4. Neither the name of the University nor the names of its contributors
1955714Skris *    may be used to endorse or promote products derived from this software
2055714Skris *    without specific prior written permission.
2155714Skris *
22280304Sjkim * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2355714Skris * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2455714Skris * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2555714Skris * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2655714Skris * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2755714Skris * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2855714Skris * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2955714Skris * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
3055714Skris * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3155714Skris * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3255714Skris * SUCH DAMAGE.
3355714Skris *
3455714Skris *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
3555714Skris */
3655714Skris
37280304Sjkim#include <sys/cdefs.h>
3855714Skris__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_syscalls.c 269283 2014-07-30 03:56:17Z kib $");
3955714Skris
40280304Sjkim#include "opt_capsicum.h"
4155714Skris#include "opt_compat.h"
4255714Skris#include "opt_kdtrace.h"
4355714Skris#include "opt_ktrace.h"
4455714Skris
4555714Skris#include <sys/param.h>
4655714Skris#include <sys/systm.h>
4755714Skris#include <sys/bio.h>
4855714Skris#include <sys/buf.h>
4955714Skris#include <sys/capability.h>
5055714Skris#include <sys/disk.h>
5155714Skris#include <sys/sysent.h>
52280304Sjkim#include <sys/malloc.h>
5355714Skris#include <sys/mount.h>
5455714Skris#include <sys/mutex.h>
5555714Skris#include <sys/sysproto.h>
5655714Skris#include <sys/namei.h>
5755714Skris#include <sys/filedesc.h>
5855714Skris#include <sys/kernel.h>
5955714Skris#include <sys/fcntl.h>
6055714Skris#include <sys/file.h>
6155714Skris#include <sys/filio.h>
6255714Skris#include <sys/limits.h>
6355714Skris#include <sys/linker.h>
6455714Skris#include <sys/rwlock.h>
6555714Skris#include <sys/sdt.h>
6655714Skris#include <sys/stat.h>
6755714Skris#include <sys/sx.h>
6855714Skris#include <sys/unistd.h>
69280304Sjkim#include <sys/vnode.h>
7055714Skris#include <sys/priv.h>
7159191Skris#include <sys/proc.h>
7259191Skris#include <sys/dirent.h>
7355714Skris#include <sys/jail.h>
74280304Sjkim#include <sys/syscallsubr.h>
75280304Sjkim#include <sys/sysctl.h>
76280304Sjkim#ifdef KTRACE
77280304Sjkim#include <sys/ktrace.h>
7855714Skris#endif
79280304Sjkim
8055714Skris#include <machine/stdarg.h>
81280304Sjkim
82280304Sjkim#include <security/audit/audit.h>
83280304Sjkim#include <security/mac/mac_framework.h>
8455714Skris
85280304Sjkim#include <vm/vm.h>
8655714Skris#include <vm/vm_object.h>
87280304Sjkim#include <vm/vm_page.h>
88280304Sjkim#include <vm/uma.h>
8955714Skris
90280304Sjkim#include <ufs/ufs/quota.h>
91280304Sjkim
92109998SmarkmMALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
93280304Sjkim
94280304SjkimSDT_PROVIDER_DEFINE(vfs);
95280304SjkimSDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
96280304SjkimSDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
9768651Skris
98280304Sjkimstatic int chroot_refuse_vdir_fds(struct filedesc *fdp);
99280304Sjkimstatic int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
100280304Sjkimstatic int kern_chflags(struct thread *td, const char *path,
101280304Sjkim    enum uio_seg pathseg, u_long flags);
102280304Sjkimstatic int kern_chflagsat(struct thread *td, int fd, const char *path,
103280304Sjkim    enum uio_seg pathseg, u_long flags, int atflag);
104280304Sjkimstatic int setfflags(struct thread *td, struct vnode *, u_long);
105280304Sjkimstatic int setutimes(struct thread *td, struct vnode *,
106280304Sjkim    const struct timespec *, int, int);
107280304Sjkimstatic int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
10855714Skris    struct thread *td);
109280304Sjkim
110280304Sjkim/*
111280304Sjkim * The module initialization routine for POSIX asynchronous I/O will
112280304Sjkim * set this to the version of AIO that it implements.  (Zero means
113280304Sjkim * that it is not implemented.)  This value is used here by pathconf()
114280304Sjkim * and in kern_descrip.c by fpathconf().
115280304Sjkim */
116280304Sjkimint async_io_version;
117280304Sjkim
118280304Sjkim#ifdef DEBUG
119280304Sjkimstatic int syncprt = 0;
120280304SjkimSYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
121280304Sjkim#endif
122
123/*
124 * Sync each mounted filesystem.
125 */
126#ifndef _SYS_SYSPROTO_H_
127struct sync_args {
128	int     dummy;
129};
130#endif
131/* ARGSUSED */
132int
133sys_sync(td, uap)
134	struct thread *td;
135	struct sync_args *uap;
136{
137	struct mount *mp, *nmp;
138	int save;
139
140	mtx_lock(&mountlist_mtx);
141	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
142		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
143			nmp = TAILQ_NEXT(mp, mnt_list);
144			continue;
145		}
146		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
147		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
148			save = curthread_pflags_set(TDP_SYNCIO);
149			vfs_msync(mp, MNT_NOWAIT);
150			VFS_SYNC(mp, MNT_NOWAIT);
151			curthread_pflags_restore(save);
152			vn_finished_write(mp);
153		}
154		mtx_lock(&mountlist_mtx);
155		nmp = TAILQ_NEXT(mp, mnt_list);
156		vfs_unbusy(mp);
157	}
158	mtx_unlock(&mountlist_mtx);
159	return (0);
160}
161
162/*
163 * Change filesystem quotas.
164 */
165#ifndef _SYS_SYSPROTO_H_
166struct quotactl_args {
167	char *path;
168	int cmd;
169	int uid;
170	caddr_t arg;
171};
172#endif
173int
174sys_quotactl(td, uap)
175	struct thread *td;
176	register struct quotactl_args /* {
177		char *path;
178		int cmd;
179		int uid;
180		caddr_t arg;
181	} */ *uap;
182{
183	struct mount *mp;
184	struct nameidata nd;
185	int error;
186
187	AUDIT_ARG_CMD(uap->cmd);
188	AUDIT_ARG_UID(uap->uid);
189	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
190		return (EPERM);
191	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
192	    uap->path, td);
193	if ((error = namei(&nd)) != 0)
194		return (error);
195	NDFREE(&nd, NDF_ONLY_PNBUF);
196	mp = nd.ni_vp->v_mount;
197	vfs_ref(mp);
198	vput(nd.ni_vp);
199	error = vfs_busy(mp, 0);
200	vfs_rel(mp);
201	if (error != 0)
202		return (error);
203	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
204
205	/*
206	 * Since quota on operation typically needs to open quota
207	 * file, the Q_QUOTAON handler needs to unbusy the mount point
208	 * before calling into namei.  Otherwise, unmount might be
209	 * started between two vfs_busy() invocations (first is our,
210	 * second is from mount point cross-walk code in lookup()),
211	 * causing deadlock.
212	 *
213	 * Require that Q_QUOTAON handles the vfs_busy() reference on
214	 * its own, always returning with ubusied mount point.
215	 */
216	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
217		vfs_unbusy(mp);
218	return (error);
219}
220
221/*
222 * Used by statfs conversion routines to scale the block size up if
223 * necessary so that all of the block counts are <= 'max_size'.  Note
224 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
225 * value of 'n'.
226 */
227void
228statfs_scale_blocks(struct statfs *sf, long max_size)
229{
230	uint64_t count;
231	int shift;
232
233	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
234
235	/*
236	 * Attempt to scale the block counts to give a more accurate
237	 * overview to userland of the ratio of free space to used
238	 * space.  To do this, find the largest block count and compute
239	 * a divisor that lets it fit into a signed integer <= max_size.
240	 */
241	if (sf->f_bavail < 0)
242		count = -sf->f_bavail;
243	else
244		count = sf->f_bavail;
245	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
246	if (count <= max_size)
247		return;
248
249	count >>= flsl(max_size);
250	shift = 0;
251	while (count > 0) {
252		shift++;
253		count >>=1;
254	}
255
256	sf->f_bsize <<= shift;
257	sf->f_blocks >>= shift;
258	sf->f_bfree >>= shift;
259	sf->f_bavail >>= shift;
260}
261
262/*
263 * Get filesystem statistics.
264 */
265#ifndef _SYS_SYSPROTO_H_
266struct statfs_args {
267	char *path;
268	struct statfs *buf;
269};
270#endif
271int
272sys_statfs(td, uap)
273	struct thread *td;
274	register struct statfs_args /* {
275		char *path;
276		struct statfs *buf;
277	} */ *uap;
278{
279	struct statfs sf;
280	int error;
281
282	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
283	if (error == 0)
284		error = copyout(&sf, uap->buf, sizeof(sf));
285	return (error);
286}
287
288int
289kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
290    struct statfs *buf)
291{
292	struct mount *mp;
293	struct statfs *sp, sb;
294	struct nameidata nd;
295	int error;
296
297	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
298	    pathseg, path, td);
299	error = namei(&nd);
300	if (error != 0)
301		return (error);
302	mp = nd.ni_vp->v_mount;
303	vfs_ref(mp);
304	NDFREE(&nd, NDF_ONLY_PNBUF);
305	vput(nd.ni_vp);
306	error = vfs_busy(mp, 0);
307	vfs_rel(mp);
308	if (error != 0)
309		return (error);
310#ifdef MAC
311	error = mac_mount_check_stat(td->td_ucred, mp);
312	if (error != 0)
313		goto out;
314#endif
315	/*
316	 * Set these in case the underlying filesystem fails to do so.
317	 */
318	sp = &mp->mnt_stat;
319	sp->f_version = STATFS_VERSION;
320	sp->f_namemax = NAME_MAX;
321	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
322	error = VFS_STATFS(mp, sp);
323	if (error != 0)
324		goto out;
325	if (priv_check(td, PRIV_VFS_GENERATION)) {
326		bcopy(sp, &sb, sizeof(sb));
327		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
328		prison_enforce_statfs(td->td_ucred, mp, &sb);
329		sp = &sb;
330	}
331	*buf = *sp;
332out:
333	vfs_unbusy(mp);
334	return (error);
335}
336
337/*
338 * Get filesystem statistics.
339 */
340#ifndef _SYS_SYSPROTO_H_
341struct fstatfs_args {
342	int fd;
343	struct statfs *buf;
344};
345#endif
346int
347sys_fstatfs(td, uap)
348	struct thread *td;
349	register struct fstatfs_args /* {
350		int fd;
351		struct statfs *buf;
352	} */ *uap;
353{
354	struct statfs sf;
355	int error;
356
357	error = kern_fstatfs(td, uap->fd, &sf);
358	if (error == 0)
359		error = copyout(&sf, uap->buf, sizeof(sf));
360	return (error);
361}
362
363int
364kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
365{
366	struct file *fp;
367	struct mount *mp;
368	struct statfs *sp, sb;
369	struct vnode *vp;
370	cap_rights_t rights;
371	int error;
372
373	AUDIT_ARG_FD(fd);
374	error = getvnode(td->td_proc->p_fd, fd,
375	    cap_rights_init(&rights, CAP_FSTATFS), &fp);
376	if (error != 0)
377		return (error);
378	vp = fp->f_vnode;
379	vn_lock(vp, LK_SHARED | LK_RETRY);
380#ifdef AUDIT
381	AUDIT_ARG_VNODE1(vp);
382#endif
383	mp = vp->v_mount;
384	if (mp)
385		vfs_ref(mp);
386	VOP_UNLOCK(vp, 0);
387	fdrop(fp, td);
388	if (mp == NULL) {
389		error = EBADF;
390		goto out;
391	}
392	error = vfs_busy(mp, 0);
393	vfs_rel(mp);
394	if (error != 0)
395		return (error);
396#ifdef MAC
397	error = mac_mount_check_stat(td->td_ucred, mp);
398	if (error != 0)
399		goto out;
400#endif
401	/*
402	 * Set these in case the underlying filesystem fails to do so.
403	 */
404	sp = &mp->mnt_stat;
405	sp->f_version = STATFS_VERSION;
406	sp->f_namemax = NAME_MAX;
407	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
408	error = VFS_STATFS(mp, sp);
409	if (error != 0)
410		goto out;
411	if (priv_check(td, PRIV_VFS_GENERATION)) {
412		bcopy(sp, &sb, sizeof(sb));
413		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
414		prison_enforce_statfs(td->td_ucred, mp, &sb);
415		sp = &sb;
416	}
417	*buf = *sp;
418out:
419	if (mp)
420		vfs_unbusy(mp);
421	return (error);
422}
423
424/*
425 * Get statistics on all filesystems.
426 */
427#ifndef _SYS_SYSPROTO_H_
428struct getfsstat_args {
429	struct statfs *buf;
430	long bufsize;
431	int flags;
432};
433#endif
434int
435sys_getfsstat(td, uap)
436	struct thread *td;
437	register struct getfsstat_args /* {
438		struct statfs *buf;
439		long bufsize;
440		int flags;
441	} */ *uap;
442{
443
444	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
445	    uap->flags));
446}
447
448/*
449 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
450 *	The caller is responsible for freeing memory which will be allocated
451 *	in '*buf'.
452 */
453int
454kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
455    enum uio_seg bufseg, int flags)
456{
457	struct mount *mp, *nmp;
458	struct statfs *sfsp, *sp, sb;
459	size_t count, maxcount;
460	int error;
461
462	maxcount = bufsize / sizeof(struct statfs);
463	if (bufsize == 0)
464		sfsp = NULL;
465	else if (bufseg == UIO_USERSPACE)
466		sfsp = *buf;
467	else /* if (bufseg == UIO_SYSSPACE) */ {
468		count = 0;
469		mtx_lock(&mountlist_mtx);
470		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
471			count++;
472		}
473		mtx_unlock(&mountlist_mtx);
474		if (maxcount > count)
475			maxcount = count;
476		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
477		    M_WAITOK);
478	}
479	count = 0;
480	mtx_lock(&mountlist_mtx);
481	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
482		if (prison_canseemount(td->td_ucred, mp) != 0) {
483			nmp = TAILQ_NEXT(mp, mnt_list);
484			continue;
485		}
486#ifdef MAC
487		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
488			nmp = TAILQ_NEXT(mp, mnt_list);
489			continue;
490		}
491#endif
492		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
493			nmp = TAILQ_NEXT(mp, mnt_list);
494			continue;
495		}
496		if (sfsp && count < maxcount) {
497			sp = &mp->mnt_stat;
498			/*
499			 * Set these in case the underlying filesystem
500			 * fails to do so.
501			 */
502			sp->f_version = STATFS_VERSION;
503			sp->f_namemax = NAME_MAX;
504			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
505			/*
506			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
507			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
508			 * overrides MNT_WAIT.
509			 */
510			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
511			    (flags & MNT_WAIT)) &&
512			    (error = VFS_STATFS(mp, sp))) {
513				mtx_lock(&mountlist_mtx);
514				nmp = TAILQ_NEXT(mp, mnt_list);
515				vfs_unbusy(mp);
516				continue;
517			}
518			if (priv_check(td, PRIV_VFS_GENERATION)) {
519				bcopy(sp, &sb, sizeof(sb));
520				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
521				prison_enforce_statfs(td->td_ucred, mp, &sb);
522				sp = &sb;
523			}
524			if (bufseg == UIO_SYSSPACE)
525				bcopy(sp, sfsp, sizeof(*sp));
526			else /* if (bufseg == UIO_USERSPACE) */ {
527				error = copyout(sp, sfsp, sizeof(*sp));
528				if (error != 0) {
529					vfs_unbusy(mp);
530					return (error);
531				}
532			}
533			sfsp++;
534		}
535		count++;
536		mtx_lock(&mountlist_mtx);
537		nmp = TAILQ_NEXT(mp, mnt_list);
538		vfs_unbusy(mp);
539	}
540	mtx_unlock(&mountlist_mtx);
541	if (sfsp && count > maxcount)
542		td->td_retval[0] = maxcount;
543	else
544		td->td_retval[0] = count;
545	return (0);
546}
547
548#ifdef COMPAT_FREEBSD4
549/*
550 * Get old format filesystem statistics.
551 */
552static void cvtstatfs(struct statfs *, struct ostatfs *);
553
554#ifndef _SYS_SYSPROTO_H_
555struct freebsd4_statfs_args {
556	char *path;
557	struct ostatfs *buf;
558};
559#endif
560int
561freebsd4_statfs(td, uap)
562	struct thread *td;
563	struct freebsd4_statfs_args /* {
564		char *path;
565		struct ostatfs *buf;
566	} */ *uap;
567{
568	struct ostatfs osb;
569	struct statfs sf;
570	int error;
571
572	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
573	if (error != 0)
574		return (error);
575	cvtstatfs(&sf, &osb);
576	return (copyout(&osb, uap->buf, sizeof(osb)));
577}
578
579/*
580 * Get filesystem statistics.
581 */
582#ifndef _SYS_SYSPROTO_H_
583struct freebsd4_fstatfs_args {
584	int fd;
585	struct ostatfs *buf;
586};
587#endif
588int
589freebsd4_fstatfs(td, uap)
590	struct thread *td;
591	struct freebsd4_fstatfs_args /* {
592		int fd;
593		struct ostatfs *buf;
594	} */ *uap;
595{
596	struct ostatfs osb;
597	struct statfs sf;
598	int error;
599
600	error = kern_fstatfs(td, uap->fd, &sf);
601	if (error != 0)
602		return (error);
603	cvtstatfs(&sf, &osb);
604	return (copyout(&osb, uap->buf, sizeof(osb)));
605}
606
607/*
608 * Get statistics on all filesystems.
609 */
610#ifndef _SYS_SYSPROTO_H_
611struct freebsd4_getfsstat_args {
612	struct ostatfs *buf;
613	long bufsize;
614	int flags;
615};
616#endif
617int
618freebsd4_getfsstat(td, uap)
619	struct thread *td;
620	register struct freebsd4_getfsstat_args /* {
621		struct ostatfs *buf;
622		long bufsize;
623		int flags;
624	} */ *uap;
625{
626	struct statfs *buf, *sp;
627	struct ostatfs osb;
628	size_t count, size;
629	int error;
630
631	count = uap->bufsize / sizeof(struct ostatfs);
632	size = count * sizeof(struct statfs);
633	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
634	if (size > 0) {
635		count = td->td_retval[0];
636		sp = buf;
637		while (count > 0 && error == 0) {
638			cvtstatfs(sp, &osb);
639			error = copyout(&osb, uap->buf, sizeof(osb));
640			sp++;
641			uap->buf++;
642			count--;
643		}
644		free(buf, M_TEMP);
645	}
646	return (error);
647}
648
649/*
650 * Implement fstatfs() for (NFS) file handles.
651 */
652#ifndef _SYS_SYSPROTO_H_
653struct freebsd4_fhstatfs_args {
654	struct fhandle *u_fhp;
655	struct ostatfs *buf;
656};
657#endif
658int
659freebsd4_fhstatfs(td, uap)
660	struct thread *td;
661	struct freebsd4_fhstatfs_args /* {
662		struct fhandle *u_fhp;
663		struct ostatfs *buf;
664	} */ *uap;
665{
666	struct ostatfs osb;
667	struct statfs sf;
668	fhandle_t fh;
669	int error;
670
671	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
672	if (error != 0)
673		return (error);
674	error = kern_fhstatfs(td, fh, &sf);
675	if (error != 0)
676		return (error);
677	cvtstatfs(&sf, &osb);
678	return (copyout(&osb, uap->buf, sizeof(osb)));
679}
680
681/*
682 * Convert a new format statfs structure to an old format statfs structure.
683 */
684static void
685cvtstatfs(nsp, osp)
686	struct statfs *nsp;
687	struct ostatfs *osp;
688{
689
690	statfs_scale_blocks(nsp, LONG_MAX);
691	bzero(osp, sizeof(*osp));
692	osp->f_bsize = nsp->f_bsize;
693	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
694	osp->f_blocks = nsp->f_blocks;
695	osp->f_bfree = nsp->f_bfree;
696	osp->f_bavail = nsp->f_bavail;
697	osp->f_files = MIN(nsp->f_files, LONG_MAX);
698	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
699	osp->f_owner = nsp->f_owner;
700	osp->f_type = nsp->f_type;
701	osp->f_flags = nsp->f_flags;
702	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
703	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
704	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
705	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
706	strlcpy(osp->f_fstypename, nsp->f_fstypename,
707	    MIN(MFSNAMELEN, OMFSNAMELEN));
708	strlcpy(osp->f_mntonname, nsp->f_mntonname,
709	    MIN(MNAMELEN, OMNAMELEN));
710	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
711	    MIN(MNAMELEN, OMNAMELEN));
712	osp->f_fsid = nsp->f_fsid;
713}
714#endif /* COMPAT_FREEBSD4 */
715
716/*
717 * Change current working directory to a given file descriptor.
718 */
719#ifndef _SYS_SYSPROTO_H_
720struct fchdir_args {
721	int	fd;
722};
723#endif
724int
725sys_fchdir(td, uap)
726	struct thread *td;
727	struct fchdir_args /* {
728		int fd;
729	} */ *uap;
730{
731	register struct filedesc *fdp = td->td_proc->p_fd;
732	struct vnode *vp, *tdp, *vpold;
733	struct mount *mp;
734	struct file *fp;
735	cap_rights_t rights;
736	int error;
737
738	AUDIT_ARG_FD(uap->fd);
739	error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
740	    &fp);
741	if (error != 0)
742		return (error);
743	vp = fp->f_vnode;
744	VREF(vp);
745	fdrop(fp, td);
746	vn_lock(vp, LK_SHARED | LK_RETRY);
747	AUDIT_ARG_VNODE1(vp);
748	error = change_dir(vp, td);
749	while (!error && (mp = vp->v_mountedhere) != NULL) {
750		if (vfs_busy(mp, 0))
751			continue;
752		error = VFS_ROOT(mp, LK_SHARED, &tdp);
753		vfs_unbusy(mp);
754		if (error != 0)
755			break;
756		vput(vp);
757		vp = tdp;
758	}
759	if (error != 0) {
760		vput(vp);
761		return (error);
762	}
763	VOP_UNLOCK(vp, 0);
764	FILEDESC_XLOCK(fdp);
765	vpold = fdp->fd_cdir;
766	fdp->fd_cdir = vp;
767	FILEDESC_XUNLOCK(fdp);
768	vrele(vpold);
769	return (0);
770}
771
772/*
773 * Change current working directory (``.'').
774 */
775#ifndef _SYS_SYSPROTO_H_
776struct chdir_args {
777	char	*path;
778};
779#endif
780int
781sys_chdir(td, uap)
782	struct thread *td;
783	struct chdir_args /* {
784		char *path;
785	} */ *uap;
786{
787
788	return (kern_chdir(td, uap->path, UIO_USERSPACE));
789}
790
791int
792kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
793{
794	register struct filedesc *fdp = td->td_proc->p_fd;
795	struct nameidata nd;
796	struct vnode *vp;
797	int error;
798
799	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
800	    pathseg, path, td);
801	if ((error = namei(&nd)) != 0)
802		return (error);
803	if ((error = change_dir(nd.ni_vp, td)) != 0) {
804		vput(nd.ni_vp);
805		NDFREE(&nd, NDF_ONLY_PNBUF);
806		return (error);
807	}
808	VOP_UNLOCK(nd.ni_vp, 0);
809	NDFREE(&nd, NDF_ONLY_PNBUF);
810	FILEDESC_XLOCK(fdp);
811	vp = fdp->fd_cdir;
812	fdp->fd_cdir = nd.ni_vp;
813	FILEDESC_XUNLOCK(fdp);
814	vrele(vp);
815	return (0);
816}
817
818/*
819 * Helper function for raised chroot(2) security function:  Refuse if
820 * any filedescriptors are open directories.
821 */
822static int
823chroot_refuse_vdir_fds(fdp)
824	struct filedesc *fdp;
825{
826	struct vnode *vp;
827	struct file *fp;
828	int fd;
829
830	FILEDESC_LOCK_ASSERT(fdp);
831
832	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
833		fp = fget_locked(fdp, fd);
834		if (fp == NULL)
835			continue;
836		if (fp->f_type == DTYPE_VNODE) {
837			vp = fp->f_vnode;
838			if (vp->v_type == VDIR)
839				return (EPERM);
840		}
841	}
842	return (0);
843}
844
845/*
846 * This sysctl determines if we will allow a process to chroot(2) if it
847 * has a directory open:
848 *	0: disallowed for all processes.
849 *	1: allowed for processes that were not already chroot(2)'ed.
850 *	2: allowed for all processes.
851 */
852
853static int chroot_allow_open_directories = 1;
854
855SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
856     &chroot_allow_open_directories, 0,
857     "Allow a process to chroot(2) if it has a directory open");
858
859/*
860 * Change notion of root (``/'') directory.
861 */
862#ifndef _SYS_SYSPROTO_H_
863struct chroot_args {
864	char	*path;
865};
866#endif
867int
868sys_chroot(td, uap)
869	struct thread *td;
870	struct chroot_args /* {
871		char *path;
872	} */ *uap;
873{
874	struct nameidata nd;
875	int error;
876
877	error = priv_check(td, PRIV_VFS_CHROOT);
878	if (error != 0)
879		return (error);
880	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
881	    UIO_USERSPACE, uap->path, td);
882	error = namei(&nd);
883	if (error != 0)
884		goto error;
885	error = change_dir(nd.ni_vp, td);
886	if (error != 0)
887		goto e_vunlock;
888#ifdef MAC
889	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
890	if (error != 0)
891		goto e_vunlock;
892#endif
893	VOP_UNLOCK(nd.ni_vp, 0);
894	error = change_root(nd.ni_vp, td);
895	vrele(nd.ni_vp);
896	NDFREE(&nd, NDF_ONLY_PNBUF);
897	return (error);
898e_vunlock:
899	vput(nd.ni_vp);
900error:
901	NDFREE(&nd, NDF_ONLY_PNBUF);
902	return (error);
903}
904
905/*
906 * Common routine for chroot and chdir.  Callers must provide a locked vnode
907 * instance.
908 */
909int
910change_dir(vp, td)
911	struct vnode *vp;
912	struct thread *td;
913{
914#ifdef MAC
915	int error;
916#endif
917
918	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
919	if (vp->v_type != VDIR)
920		return (ENOTDIR);
921#ifdef MAC
922	error = mac_vnode_check_chdir(td->td_ucred, vp);
923	if (error != 0)
924		return (error);
925#endif
926	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
927}
928
929/*
930 * Common routine for kern_chroot() and jail_attach().  The caller is
931 * responsible for invoking priv_check() and mac_vnode_check_chroot() to
932 * authorize this operation.
933 */
934int
935change_root(vp, td)
936	struct vnode *vp;
937	struct thread *td;
938{
939	struct filedesc *fdp;
940	struct vnode *oldvp;
941	int error;
942
943	fdp = td->td_proc->p_fd;
944	FILEDESC_XLOCK(fdp);
945	if (chroot_allow_open_directories == 0 ||
946	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
947		error = chroot_refuse_vdir_fds(fdp);
948		if (error != 0) {
949			FILEDESC_XUNLOCK(fdp);
950			return (error);
951		}
952	}
953	oldvp = fdp->fd_rdir;
954	fdp->fd_rdir = vp;
955	VREF(fdp->fd_rdir);
956	if (!fdp->fd_jdir) {
957		fdp->fd_jdir = vp;
958		VREF(fdp->fd_jdir);
959	}
960	FILEDESC_XUNLOCK(fdp);
961	vrele(oldvp);
962	return (0);
963}
964
965static __inline void
966flags_to_rights(int flags, cap_rights_t *rightsp)
967{
968
969	if (flags & O_EXEC) {
970		cap_rights_set(rightsp, CAP_FEXECVE);
971	} else {
972		switch ((flags & O_ACCMODE)) {
973		case O_RDONLY:
974			cap_rights_set(rightsp, CAP_READ);
975			break;
976		case O_RDWR:
977			cap_rights_set(rightsp, CAP_READ);
978			/* FALLTHROUGH */
979		case O_WRONLY:
980			cap_rights_set(rightsp, CAP_WRITE);
981			if (!(flags & (O_APPEND | O_TRUNC)))
982				cap_rights_set(rightsp, CAP_SEEK);
983			break;
984		}
985	}
986
987	if (flags & O_CREAT)
988		cap_rights_set(rightsp, CAP_CREATE);
989
990	if (flags & O_TRUNC)
991		cap_rights_set(rightsp, CAP_FTRUNCATE);
992
993	if (flags & (O_SYNC | O_FSYNC))
994		cap_rights_set(rightsp, CAP_FSYNC);
995
996	if (flags & (O_EXLOCK | O_SHLOCK))
997		cap_rights_set(rightsp, CAP_FLOCK);
998}
999
1000/*
1001 * Check permissions, allocate an open file structure, and call the device
1002 * open routine if any.
1003 */
1004#ifndef _SYS_SYSPROTO_H_
1005struct open_args {
1006	char	*path;
1007	int	flags;
1008	int	mode;
1009};
1010#endif
1011int
1012sys_open(td, uap)
1013	struct thread *td;
1014	register struct open_args /* {
1015		char *path;
1016		int flags;
1017		int mode;
1018	} */ *uap;
1019{
1020
1021	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1022}
1023
1024#ifndef _SYS_SYSPROTO_H_
1025struct openat_args {
1026	int	fd;
1027	char	*path;
1028	int	flag;
1029	int	mode;
1030};
1031#endif
1032int
1033sys_openat(struct thread *td, struct openat_args *uap)
1034{
1035
1036	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1037	    uap->mode));
1038}
1039
1040int
1041kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1042    int mode)
1043{
1044
1045	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1046}
1047
1048int
1049kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1050    int flags, int mode)
1051{
1052	struct proc *p = td->td_proc;
1053	struct filedesc *fdp = p->p_fd;
1054	struct file *fp;
1055	struct vnode *vp;
1056	struct nameidata nd;
1057	cap_rights_t rights;
1058	int cmode, error, indx;
1059
1060	indx = -1;
1061
1062	AUDIT_ARG_FFLAGS(flags);
1063	AUDIT_ARG_MODE(mode);
1064	/* XXX: audit dirfd */
1065	cap_rights_init(&rights, CAP_LOOKUP);
1066	flags_to_rights(flags, &rights);
1067	/*
1068	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1069	 * may be specified.
1070	 */
1071	if (flags & O_EXEC) {
1072		if (flags & O_ACCMODE)
1073			return (EINVAL);
1074	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
1075		return (EINVAL);
1076	} else {
1077		flags = FFLAGS(flags);
1078	}
1079
1080	/*
1081	 * Allocate the file descriptor, but don't install a descriptor yet.
1082	 */
1083	error = falloc_noinstall(td, &fp);
1084	if (error != 0)
1085		return (error);
1086	/*
1087	 * An extra reference on `fp' has been held for us by
1088	 * falloc_noinstall().
1089	 */
1090	/* Set the flags early so the finit in devfs can pick them up. */
1091	fp->f_flag = flags & FMASK;
1092	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1093	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
1094	    &rights, td);
1095	td->td_dupfd = -1;		/* XXX check for fdopen */
1096	error = vn_open(&nd, &flags, cmode, fp);
1097	if (error != 0) {
1098		/*
1099		 * If the vn_open replaced the method vector, something
1100		 * wonderous happened deep below and we just pass it up
1101		 * pretending we know what we do.
1102		 */
1103		if (error == ENXIO && fp->f_ops != &badfileops)
1104			goto success;
1105
1106		/*
1107		 * Handle special fdopen() case. bleh.
1108		 *
1109		 * Don't do this for relative (capability) lookups; we don't
1110		 * understand exactly what would happen, and we don't think
1111		 * that it ever should.
1112		 */
1113		if (nd.ni_strictrelative == 0 &&
1114		    (error == ENODEV || error == ENXIO) &&
1115		    td->td_dupfd >= 0) {
1116			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
1117			    &indx);
1118			if (error == 0)
1119				goto success;
1120		}
1121
1122		goto bad;
1123	}
1124	td->td_dupfd = 0;
1125	NDFREE(&nd, NDF_ONLY_PNBUF);
1126	vp = nd.ni_vp;
1127
1128	/*
1129	 * Store the vnode, for any f_type. Typically, the vnode use
1130	 * count is decremented by direct call to vn_closefile() for
1131	 * files that switched type in the cdevsw fdopen() method.
1132	 */
1133	fp->f_vnode = vp;
1134	/*
1135	 * If the file wasn't claimed by devfs bind it to the normal
1136	 * vnode operations here.
1137	 */
1138	if (fp->f_ops == &badfileops) {
1139		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1140		fp->f_seqcount = 1;
1141		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
1142		    DTYPE_VNODE, vp, &vnops);
1143	}
1144
1145	VOP_UNLOCK(vp, 0);
1146	if (flags & O_TRUNC) {
1147		error = fo_truncate(fp, 0, td->td_ucred, td);
1148		if (error != 0)
1149			goto bad;
1150	}
1151success:
1152	/*
1153	 * If we haven't already installed the FD (for dupfdopen), do so now.
1154	 */
1155	if (indx == -1) {
1156		struct filecaps *fcaps;
1157
1158#ifdef CAPABILITIES
1159		if (nd.ni_strictrelative == 1)
1160			fcaps = &nd.ni_filecaps;
1161		else
1162#endif
1163			fcaps = NULL;
1164		error = finstall(td, fp, &indx, flags, fcaps);
1165		/* On success finstall() consumes fcaps. */
1166		if (error != 0) {
1167			filecaps_free(&nd.ni_filecaps);
1168			goto bad;
1169		}
1170	} else {
1171		filecaps_free(&nd.ni_filecaps);
1172	}
1173
1174	/*
1175	 * Release our private reference, leaving the one associated with
1176	 * the descriptor table intact.
1177	 */
1178	fdrop(fp, td);
1179	td->td_retval[0] = indx;
1180	return (0);
1181bad:
1182	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1183	fdrop(fp, td);
1184	return (error);
1185}
1186
1187#ifdef COMPAT_43
1188/*
1189 * Create a file.
1190 */
1191#ifndef _SYS_SYSPROTO_H_
1192struct ocreat_args {
1193	char	*path;
1194	int	mode;
1195};
1196#endif
1197int
1198ocreat(td, uap)
1199	struct thread *td;
1200	register struct ocreat_args /* {
1201		char *path;
1202		int mode;
1203	} */ *uap;
1204{
1205
1206	return (kern_open(td, uap->path, UIO_USERSPACE,
1207	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1208}
1209#endif /* COMPAT_43 */
1210
1211/*
1212 * Create a special file.
1213 */
1214#ifndef _SYS_SYSPROTO_H_
1215struct mknod_args {
1216	char	*path;
1217	int	mode;
1218	int	dev;
1219};
1220#endif
1221int
1222sys_mknod(td, uap)
1223	struct thread *td;
1224	register struct mknod_args /* {
1225		char *path;
1226		int mode;
1227		int dev;
1228	} */ *uap;
1229{
1230
1231	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1232}
1233
1234#ifndef _SYS_SYSPROTO_H_
1235struct mknodat_args {
1236	int	fd;
1237	char	*path;
1238	mode_t	mode;
1239	dev_t	dev;
1240};
1241#endif
1242int
1243sys_mknodat(struct thread *td, struct mknodat_args *uap)
1244{
1245
1246	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1247	    uap->dev));
1248}
1249
1250int
1251kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1252    int dev)
1253{
1254
1255	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1256}
1257
1258int
1259kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1260    int mode, int dev)
1261{
1262	struct vnode *vp;
1263	struct mount *mp;
1264	struct vattr vattr;
1265	struct nameidata nd;
1266	cap_rights_t rights;
1267	int error, whiteout = 0;
1268
1269	AUDIT_ARG_MODE(mode);
1270	AUDIT_ARG_DEV(dev);
1271	switch (mode & S_IFMT) {
1272	case S_IFCHR:
1273	case S_IFBLK:
1274		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1275		break;
1276	case S_IFMT:
1277		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1278		break;
1279	case S_IFWHT:
1280		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1281		break;
1282	case S_IFIFO:
1283		if (dev == 0)
1284			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1285		/* FALLTHROUGH */
1286	default:
1287		error = EINVAL;
1288		break;
1289	}
1290	if (error != 0)
1291		return (error);
1292restart:
1293	bwillwrite();
1294	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
1295	    pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT), td);
1296	if ((error = namei(&nd)) != 0)
1297		return (error);
1298	vp = nd.ni_vp;
1299	if (vp != NULL) {
1300		NDFREE(&nd, NDF_ONLY_PNBUF);
1301		if (vp == nd.ni_dvp)
1302			vrele(nd.ni_dvp);
1303		else
1304			vput(nd.ni_dvp);
1305		vrele(vp);
1306		return (EEXIST);
1307	} else {
1308		VATTR_NULL(&vattr);
1309		vattr.va_mode = (mode & ALLPERMS) &
1310		    ~td->td_proc->p_fd->fd_cmask;
1311		vattr.va_rdev = dev;
1312		whiteout = 0;
1313
1314		switch (mode & S_IFMT) {
1315		case S_IFMT:	/* used by badsect to flag bad sectors */
1316			vattr.va_type = VBAD;
1317			break;
1318		case S_IFCHR:
1319			vattr.va_type = VCHR;
1320			break;
1321		case S_IFBLK:
1322			vattr.va_type = VBLK;
1323			break;
1324		case S_IFWHT:
1325			whiteout = 1;
1326			break;
1327		default:
1328			panic("kern_mknod: invalid mode");
1329		}
1330	}
1331	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1332		NDFREE(&nd, NDF_ONLY_PNBUF);
1333		vput(nd.ni_dvp);
1334		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1335			return (error);
1336		goto restart;
1337	}
1338#ifdef MAC
1339	if (error == 0 && !whiteout)
1340		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1341		    &nd.ni_cnd, &vattr);
1342#endif
1343	if (error == 0) {
1344		if (whiteout)
1345			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1346		else {
1347			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1348						&nd.ni_cnd, &vattr);
1349			if (error == 0)
1350				vput(nd.ni_vp);
1351		}
1352	}
1353	NDFREE(&nd, NDF_ONLY_PNBUF);
1354	vput(nd.ni_dvp);
1355	vn_finished_write(mp);
1356	return (error);
1357}
1358
1359/*
1360 * Create a named pipe.
1361 */
1362#ifndef _SYS_SYSPROTO_H_
1363struct mkfifo_args {
1364	char	*path;
1365	int	mode;
1366};
1367#endif
1368int
1369sys_mkfifo(td, uap)
1370	struct thread *td;
1371	register struct mkfifo_args /* {
1372		char *path;
1373		int mode;
1374	} */ *uap;
1375{
1376
1377	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1378}
1379
1380#ifndef _SYS_SYSPROTO_H_
1381struct mkfifoat_args {
1382	int	fd;
1383	char	*path;
1384	mode_t	mode;
1385};
1386#endif
1387int
1388sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1389{
1390
1391	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1392	    uap->mode));
1393}
1394
1395int
1396kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1397{
1398
1399	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1400}
1401
1402int
1403kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1404    int mode)
1405{
1406	struct mount *mp;
1407	struct vattr vattr;
1408	struct nameidata nd;
1409	cap_rights_t rights;
1410	int error;
1411
1412	AUDIT_ARG_MODE(mode);
1413restart:
1414	bwillwrite();
1415	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
1416	    pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT), td);
1417	if ((error = namei(&nd)) != 0)
1418		return (error);
1419	if (nd.ni_vp != NULL) {
1420		NDFREE(&nd, NDF_ONLY_PNBUF);
1421		if (nd.ni_vp == nd.ni_dvp)
1422			vrele(nd.ni_dvp);
1423		else
1424			vput(nd.ni_dvp);
1425		vrele(nd.ni_vp);
1426		return (EEXIST);
1427	}
1428	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1429		NDFREE(&nd, NDF_ONLY_PNBUF);
1430		vput(nd.ni_dvp);
1431		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1432			return (error);
1433		goto restart;
1434	}
1435	VATTR_NULL(&vattr);
1436	vattr.va_type = VFIFO;
1437	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1438#ifdef MAC
1439	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1440	    &vattr);
1441	if (error != 0)
1442		goto out;
1443#endif
1444	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1445	if (error == 0)
1446		vput(nd.ni_vp);
1447#ifdef MAC
1448out:
1449#endif
1450	vput(nd.ni_dvp);
1451	vn_finished_write(mp);
1452	NDFREE(&nd, NDF_ONLY_PNBUF);
1453	return (error);
1454}
1455
1456/*
1457 * Make a hard file link.
1458 */
1459#ifndef _SYS_SYSPROTO_H_
1460struct link_args {
1461	char	*path;
1462	char	*link;
1463};
1464#endif
1465int
1466sys_link(td, uap)
1467	struct thread *td;
1468	register struct link_args /* {
1469		char *path;
1470		char *link;
1471	} */ *uap;
1472{
1473
1474	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1475}
1476
1477#ifndef _SYS_SYSPROTO_H_
1478struct linkat_args {
1479	int	fd1;
1480	char	*path1;
1481	int	fd2;
1482	char	*path2;
1483	int	flag;
1484};
1485#endif
1486int
1487sys_linkat(struct thread *td, struct linkat_args *uap)
1488{
1489	int flag;
1490
1491	flag = uap->flag;
1492	if (flag & ~AT_SYMLINK_FOLLOW)
1493		return (EINVAL);
1494
1495	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1496	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1497}
1498
/*
 * Policy knobs read by can_hardlink().  Both default to 0, in which case
 * can_hardlink() returns success without inspecting the file.
 */
/* Non-zero: linking to a file owned by another uid needs PRIV_VFS_LINK. */
int hardlink_check_uid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
    &hardlink_check_uid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "users");
/* Non-zero: linking to a file whose group the caller is not in needs PRIV_VFS_LINK. */
static int hardlink_check_gid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
    &hardlink_check_gid, 0,
    "Unprivileged processes cannot create hard links to files owned by other "
    "groups");
1509
1510static int
1511can_hardlink(struct vnode *vp, struct ucred *cred)
1512{
1513	struct vattr va;
1514	int error;
1515
1516	if (!hardlink_check_uid && !hardlink_check_gid)
1517		return (0);
1518
1519	error = VOP_GETATTR(vp, &va, cred);
1520	if (error != 0)
1521		return (error);
1522
1523	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1524		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1525		if (error != 0)
1526			return (error);
1527	}
1528
1529	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1530		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1531		if (error != 0)
1532			return (error);
1533	}
1534
1535	return (0);
1536}
1537
1538int
1539kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1540{
1541
1542	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1543}
1544
1545int
1546kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1547    enum uio_seg segflg, int follow)
1548{
1549	struct vnode *vp;
1550	struct mount *mp;
1551	struct nameidata nd;
1552	cap_rights_t rights;
1553	int error;
1554
1555	bwillwrite();
1556	NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
1557
1558again:
1559	if ((error = namei(&nd)) != 0)
1560		return (error);
1561	NDFREE(&nd, NDF_ONLY_PNBUF);
1562	vp = nd.ni_vp;
1563	if (vp->v_type == VDIR) {
1564		vrele(vp);
1565		return (EPERM);		/* POSIX */
1566	}
1567	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
1568		vrele(vp);
1569		return (error);
1570	}
1571	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2,
1572	    segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT), td);
1573	if ((error = namei(&nd)) == 0) {
1574		if (nd.ni_vp != NULL) {
1575			if (nd.ni_dvp == nd.ni_vp)
1576				vrele(nd.ni_dvp);
1577			else
1578				vput(nd.ni_dvp);
1579			vrele(nd.ni_vp);
1580			error = EEXIST;
1581		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
1582			/*
1583			 * Check for cross-device links.  No need to
1584			 * recheck vp->v_type, since it cannot change
1585			 * for non-doomed vnode.
1586			 */
1587			if (nd.ni_dvp->v_mount != vp->v_mount)
1588				error = EXDEV;
1589			else
1590				error = can_hardlink(vp, td->td_ucred);
1591			if (error == 0)
1592#ifdef MAC
1593				error = mac_vnode_check_link(td->td_ucred,
1594				    nd.ni_dvp, vp, &nd.ni_cnd);
1595			if (error == 0)
1596#endif
1597				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1598			VOP_UNLOCK(vp, 0);
1599			vput(nd.ni_dvp);
1600		} else {
1601			vput(nd.ni_dvp);
1602			NDFREE(&nd, NDF_ONLY_PNBUF);
1603			vrele(vp);
1604			vn_finished_write(mp);
1605			goto again;
1606		}
1607		NDFREE(&nd, NDF_ONLY_PNBUF);
1608	}
1609	vrele(vp);
1610	vn_finished_write(mp);
1611	return (error);
1612}
1613
1614/*
1615 * Make a symbolic link.
1616 */
1617#ifndef _SYS_SYSPROTO_H_
1618struct symlink_args {
1619	char	*path;
1620	char	*link;
1621};
1622#endif
1623int
1624sys_symlink(td, uap)
1625	struct thread *td;
1626	register struct symlink_args /* {
1627		char *path;
1628		char *link;
1629	} */ *uap;
1630{
1631
1632	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1633}
1634
1635#ifndef _SYS_SYSPROTO_H_
1636struct symlinkat_args {
1637	char	*path;
1638	int	fd;
1639	char	*path2;
1640};
1641#endif
1642int
1643sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1644{
1645
1646	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1647	    UIO_USERSPACE));
1648}
1649
1650int
1651kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1652{
1653
1654	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1655}
1656
1657int
1658kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1659    enum uio_seg segflg)
1660{
1661	struct mount *mp;
1662	struct vattr vattr;
1663	char *syspath;
1664	struct nameidata nd;
1665	int error;
1666	cap_rights_t rights;
1667
1668	if (segflg == UIO_SYSSPACE) {
1669		syspath = path1;
1670	} else {
1671		syspath = uma_zalloc(namei_zone, M_WAITOK);
1672		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1673			goto out;
1674	}
1675	AUDIT_ARG_TEXT(syspath);
1676restart:
1677	bwillwrite();
1678	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
1679	    segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT), td);
1680	if ((error = namei(&nd)) != 0)
1681		goto out;
1682	if (nd.ni_vp) {
1683		NDFREE(&nd, NDF_ONLY_PNBUF);
1684		if (nd.ni_vp == nd.ni_dvp)
1685			vrele(nd.ni_dvp);
1686		else
1687			vput(nd.ni_dvp);
1688		vrele(nd.ni_vp);
1689		error = EEXIST;
1690		goto out;
1691	}
1692	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1693		NDFREE(&nd, NDF_ONLY_PNBUF);
1694		vput(nd.ni_dvp);
1695		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1696			goto out;
1697		goto restart;
1698	}
1699	VATTR_NULL(&vattr);
1700	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1701#ifdef MAC
1702	vattr.va_type = VLNK;
1703	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1704	    &vattr);
1705	if (error != 0)
1706		goto out2;
1707#endif
1708	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1709	if (error == 0)
1710		vput(nd.ni_vp);
1711#ifdef MAC
1712out2:
1713#endif
1714	NDFREE(&nd, NDF_ONLY_PNBUF);
1715	vput(nd.ni_dvp);
1716	vn_finished_write(mp);
1717out:
1718	if (segflg != UIO_SYSSPACE)
1719		uma_zfree(namei_zone, syspath);
1720	return (error);
1721}
1722
1723/*
1724 * Delete a whiteout from the filesystem.
1725 */
1726int
1727sys_undelete(td, uap)
1728	struct thread *td;
1729	register struct undelete_args /* {
1730		char *path;
1731	} */ *uap;
1732{
1733	struct mount *mp;
1734	struct nameidata nd;
1735	int error;
1736
1737restart:
1738	bwillwrite();
1739	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1740	    UIO_USERSPACE, uap->path, td);
1741	error = namei(&nd);
1742	if (error != 0)
1743		return (error);
1744
1745	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1746		NDFREE(&nd, NDF_ONLY_PNBUF);
1747		if (nd.ni_vp == nd.ni_dvp)
1748			vrele(nd.ni_dvp);
1749		else
1750			vput(nd.ni_dvp);
1751		if (nd.ni_vp)
1752			vrele(nd.ni_vp);
1753		return (EEXIST);
1754	}
1755	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1756		NDFREE(&nd, NDF_ONLY_PNBUF);
1757		vput(nd.ni_dvp);
1758		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1759			return (error);
1760		goto restart;
1761	}
1762	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1763	NDFREE(&nd, NDF_ONLY_PNBUF);
1764	vput(nd.ni_dvp);
1765	vn_finished_write(mp);
1766	return (error);
1767}
1768
1769/*
1770 * Delete a name from the filesystem.
1771 */
1772#ifndef _SYS_SYSPROTO_H_
1773struct unlink_args {
1774	char	*path;
1775};
1776#endif
1777int
1778sys_unlink(td, uap)
1779	struct thread *td;
1780	struct unlink_args /* {
1781		char *path;
1782	} */ *uap;
1783{
1784
1785	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1786}
1787
1788#ifndef _SYS_SYSPROTO_H_
1789struct unlinkat_args {
1790	int	fd;
1791	char	*path;
1792	int	flag;
1793};
1794#endif
1795int
1796sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1797{
1798	int flag = uap->flag;
1799	int fd = uap->fd;
1800	char *path = uap->path;
1801
1802	if (flag & ~AT_REMOVEDIR)
1803		return (EINVAL);
1804
1805	if (flag & AT_REMOVEDIR)
1806		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1807	else
1808		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1809}
1810
1811int
1812kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1813{
1814
1815	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1816}
1817
1818int
1819kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1820    ino_t oldinum)
1821{
1822	struct mount *mp;
1823	struct vnode *vp;
1824	struct nameidata nd;
1825	struct stat sb;
1826	cap_rights_t rights;
1827	int error;
1828
1829restart:
1830	bwillwrite();
1831	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1832	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1833	if ((error = namei(&nd)) != 0)
1834		return (error == EINVAL ? EPERM : error);
1835	vp = nd.ni_vp;
1836	if (vp->v_type == VDIR && oldinum == 0) {
1837		error = EPERM;		/* POSIX */
1838	} else if (oldinum != 0 &&
1839		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1840		  sb.st_ino != oldinum) {
1841			error = EIDRM;	/* Identifier removed */
1842	} else {
1843		/*
1844		 * The root of a mounted filesystem cannot be deleted.
1845		 *
1846		 * XXX: can this only be a VDIR case?
1847		 */
1848		if (vp->v_vflag & VV_ROOT)
1849			error = EBUSY;
1850	}
1851	if (error == 0) {
1852		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1853			NDFREE(&nd, NDF_ONLY_PNBUF);
1854			vput(nd.ni_dvp);
1855			if (vp == nd.ni_dvp)
1856				vrele(vp);
1857			else
1858				vput(vp);
1859			if ((error = vn_start_write(NULL, &mp,
1860			    V_XSLEEP | PCATCH)) != 0)
1861				return (error);
1862			goto restart;
1863		}
1864#ifdef MAC
1865		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1866		    &nd.ni_cnd);
1867		if (error != 0)
1868			goto out;
1869#endif
1870		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1871		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1872#ifdef MAC
1873out:
1874#endif
1875		vn_finished_write(mp);
1876	}
1877	NDFREE(&nd, NDF_ONLY_PNBUF);
1878	vput(nd.ni_dvp);
1879	if (vp == nd.ni_dvp)
1880		vrele(vp);
1881	else
1882		vput(vp);
1883	return (error);
1884}
1885
1886/*
1887 * Reposition read/write file offset.
1888 */
1889#ifndef _SYS_SYSPROTO_H_
1890struct lseek_args {
1891	int	fd;
1892	int	pad;
1893	off_t	offset;
1894	int	whence;
1895};
1896#endif
1897int
1898sys_lseek(td, uap)
1899	struct thread *td;
1900	register struct lseek_args /* {
1901		int fd;
1902		int pad;
1903		off_t offset;
1904		int whence;
1905	} */ *uap;
1906{
1907	struct file *fp;
1908	cap_rights_t rights;
1909	int error;
1910
1911	AUDIT_ARG_FD(uap->fd);
1912	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1913	if (error != 0)
1914		return (error);
1915	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1916	    fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
1917	fdrop(fp, td);
1918	return (error);
1919}
1920
1921#if defined(COMPAT_43)
1922/*
1923 * Reposition read/write file offset.
1924 */
1925#ifndef _SYS_SYSPROTO_H_
1926struct olseek_args {
1927	int	fd;
1928	long	offset;
1929	int	whence;
1930};
1931#endif
1932int
1933olseek(td, uap)
1934	struct thread *td;
1935	register struct olseek_args /* {
1936		int fd;
1937		long offset;
1938		int whence;
1939	} */ *uap;
1940{
1941	struct lseek_args /* {
1942		int fd;
1943		int pad;
1944		off_t offset;
1945		int whence;
1946	} */ nuap;
1947
1948	nuap.fd = uap->fd;
1949	nuap.offset = uap->offset;
1950	nuap.whence = uap->whence;
1951	return (sys_lseek(td, &nuap));
1952}
1953#endif /* COMPAT_43 */
1954
1955/* Version with the 'pad' argument */
1956int
1957freebsd6_lseek(td, uap)
1958	struct thread *td;
1959	register struct freebsd6_lseek_args *uap;
1960{
1961	struct lseek_args ouap;
1962
1963	ouap.fd = uap->fd;
1964	ouap.offset = uap->offset;
1965	ouap.whence = uap->whence;
1966	return (sys_lseek(td, &ouap));
1967}
1968
1969/*
1970 * Check access permissions using passed credentials.
1971 */
1972static int
1973vn_access(vp, user_flags, cred, td)
1974	struct vnode	*vp;
1975	int		user_flags;
1976	struct ucred	*cred;
1977	struct thread	*td;
1978{
1979	accmode_t accmode;
1980	int error;
1981
1982	/* Flags == 0 means only check for existence. */
1983	error = 0;
1984	if (user_flags) {
1985		accmode = 0;
1986		if (user_flags & R_OK)
1987			accmode |= VREAD;
1988		if (user_flags & W_OK)
1989			accmode |= VWRITE;
1990		if (user_flags & X_OK)
1991			accmode |= VEXEC;
1992#ifdef MAC
1993		error = mac_vnode_check_access(cred, vp, accmode);
1994		if (error != 0)
1995			return (error);
1996#endif
1997		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
1998			error = VOP_ACCESS(vp, accmode, cred, td);
1999	}
2000	return (error);
2001}
2002
2003/*
2004 * Check access permissions using "real" credentials.
2005 */
2006#ifndef _SYS_SYSPROTO_H_
2007struct access_args {
2008	char	*path;
2009	int	amode;
2010};
2011#endif
2012int
2013sys_access(td, uap)
2014	struct thread *td;
2015	register struct access_args /* {
2016		char *path;
2017		int amode;
2018	} */ *uap;
2019{
2020
2021	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2022}
2023
2024#ifndef _SYS_SYSPROTO_H_
2025struct faccessat_args {
2026	int	dirfd;
2027	char	*path;
2028	int	amode;
2029	int	flag;
2030}
2031#endif
2032int
2033sys_faccessat(struct thread *td, struct faccessat_args *uap)
2034{
2035
2036	if (uap->flag & ~AT_EACCESS)
2037		return (EINVAL);
2038	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2039	    uap->amode));
2040}
2041
2042int
2043kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2044{
2045
2046	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2047}
2048
2049int
2050kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2051    int flag, int amode)
2052{
2053	struct ucred *cred, *tmpcred;
2054	struct vnode *vp;
2055	struct nameidata nd;
2056	cap_rights_t rights;
2057	int error;
2058
2059	/*
2060	 * Create and modify a temporary credential instead of one that
2061	 * is potentially shared.
2062	 */
2063	if (!(flag & AT_EACCESS)) {
2064		cred = td->td_ucred;
2065		tmpcred = crdup(cred);
2066		tmpcred->cr_uid = cred->cr_ruid;
2067		tmpcred->cr_groups[0] = cred->cr_rgid;
2068		td->td_ucred = tmpcred;
2069	} else
2070		cred = tmpcred = td->td_ucred;
2071	AUDIT_ARG_VALUE(amode);
2072	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
2073	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
2074	    td);
2075	if ((error = namei(&nd)) != 0)
2076		goto out1;
2077	vp = nd.ni_vp;
2078
2079	error = vn_access(vp, amode, tmpcred, td);
2080	NDFREE(&nd, NDF_ONLY_PNBUF);
2081	vput(vp);
2082out1:
2083	if (!(flag & AT_EACCESS)) {
2084		td->td_ucred = cred;
2085		crfree(tmpcred);
2086	}
2087	return (error);
2088}
2089
2090/*
2091 * Check access permissions using "effective" credentials.
2092 */
2093#ifndef _SYS_SYSPROTO_H_
2094struct eaccess_args {
2095	char	*path;
2096	int	amode;
2097};
2098#endif
2099int
2100sys_eaccess(td, uap)
2101	struct thread *td;
2102	register struct eaccess_args /* {
2103		char *path;
2104		int amode;
2105	} */ *uap;
2106{
2107
2108	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2109}
2110
2111int
2112kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2113{
2114
2115	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2116}
2117
2118#if defined(COMPAT_43)
2119/*
2120 * Get file status; this version follows links.
2121 */
2122#ifndef _SYS_SYSPROTO_H_
2123struct ostat_args {
2124	char	*path;
2125	struct ostat *ub;
2126};
2127#endif
2128int
2129ostat(td, uap)
2130	struct thread *td;
2131	register struct ostat_args /* {
2132		char *path;
2133		struct ostat *ub;
2134	} */ *uap;
2135{
2136	struct stat sb;
2137	struct ostat osb;
2138	int error;
2139
2140	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2141	if (error != 0)
2142		return (error);
2143	cvtstat(&sb, &osb);
2144	return (copyout(&osb, uap->ub, sizeof (osb)));
2145}
2146
2147/*
2148 * Get file status; this version does not follow links.
2149 */
2150#ifndef _SYS_SYSPROTO_H_
2151struct olstat_args {
2152	char	*path;
2153	struct ostat *ub;
2154};
2155#endif
2156int
2157olstat(td, uap)
2158	struct thread *td;
2159	register struct olstat_args /* {
2160		char *path;
2161		struct ostat *ub;
2162	} */ *uap;
2163{
2164	struct stat sb;
2165	struct ostat osb;
2166	int error;
2167
2168	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2169	if (error != 0)
2170		return (error);
2171	cvtstat(&sb, &osb);
2172	return (copyout(&osb, uap->ub, sizeof (osb)));
2173}
2174
2175/*
2176 * Convert from an old to a new stat structure.
2177 */
2178void
2179cvtstat(st, ost)
2180	struct stat *st;
2181	struct ostat *ost;
2182{
2183
2184	ost->st_dev = st->st_dev;
2185	ost->st_ino = st->st_ino;
2186	ost->st_mode = st->st_mode;
2187	ost->st_nlink = st->st_nlink;
2188	ost->st_uid = st->st_uid;
2189	ost->st_gid = st->st_gid;
2190	ost->st_rdev = st->st_rdev;
2191	if (st->st_size < (quad_t)1 << 32)
2192		ost->st_size = st->st_size;
2193	else
2194		ost->st_size = -2;
2195	ost->st_atim = st->st_atim;
2196	ost->st_mtim = st->st_mtim;
2197	ost->st_ctim = st->st_ctim;
2198	ost->st_blksize = st->st_blksize;
2199	ost->st_blocks = st->st_blocks;
2200	ost->st_flags = st->st_flags;
2201	ost->st_gen = st->st_gen;
2202}
2203#endif /* COMPAT_43 */
2204
2205/*
2206 * Get file status; this version follows links.
2207 */
2208#ifndef _SYS_SYSPROTO_H_
2209struct stat_args {
2210	char	*path;
2211	struct stat *ub;
2212};
2213#endif
2214int
2215sys_stat(td, uap)
2216	struct thread *td;
2217	register struct stat_args /* {
2218		char *path;
2219		struct stat *ub;
2220	} */ *uap;
2221{
2222	struct stat sb;
2223	int error;
2224
2225	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2226	if (error == 0)
2227		error = copyout(&sb, uap->ub, sizeof (sb));
2228	return (error);
2229}
2230
2231#ifndef _SYS_SYSPROTO_H_
2232struct fstatat_args {
2233	int	fd;
2234	char	*path;
2235	struct stat	*buf;
2236	int	flag;
2237}
2238#endif
2239int
2240sys_fstatat(struct thread *td, struct fstatat_args *uap)
2241{
2242	struct stat sb;
2243	int error;
2244
2245	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2246	    UIO_USERSPACE, &sb);
2247	if (error == 0)
2248		error = copyout(&sb, uap->buf, sizeof (sb));
2249	return (error);
2250}
2251
2252int
2253kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2254{
2255
2256	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2257}
2258
2259int
2260kern_statat(struct thread *td, int flag, int fd, char *path,
2261    enum uio_seg pathseg, struct stat *sbp)
2262{
2263
2264	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2265}
2266
2267int
2268kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2269    enum uio_seg pathseg, struct stat *sbp,
2270    void (*hook)(struct vnode *vp, struct stat *sbp))
2271{
2272	struct nameidata nd;
2273	struct stat sb;
2274	cap_rights_t rights;
2275	int error;
2276
2277	if (flag & ~AT_SYMLINK_NOFOLLOW)
2278		return (EINVAL);
2279
2280	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2281	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2282	    cap_rights_init(&rights, CAP_FSTAT), td);
2283
2284	if ((error = namei(&nd)) != 0)
2285		return (error);
2286	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2287	if (error == 0) {
2288		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2289		if (S_ISREG(sb.st_mode))
2290			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2291		if (__predict_false(hook != NULL))
2292			hook(nd.ni_vp, &sb);
2293	}
2294	NDFREE(&nd, NDF_ONLY_PNBUF);
2295	vput(nd.ni_vp);
2296	if (error != 0)
2297		return (error);
2298	*sbp = sb;
2299#ifdef KTRACE
2300	if (KTRPOINT(td, KTR_STRUCT))
2301		ktrstat(&sb);
2302#endif
2303	return (0);
2304}
2305
2306/*
2307 * Get file status; this version does not follow links.
2308 */
2309#ifndef _SYS_SYSPROTO_H_
2310struct lstat_args {
2311	char	*path;
2312	struct stat *ub;
2313};
2314#endif
2315int
2316sys_lstat(td, uap)
2317	struct thread *td;
2318	register struct lstat_args /* {
2319		char *path;
2320		struct stat *ub;
2321	} */ *uap;
2322{
2323	struct stat sb;
2324	int error;
2325
2326	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2327	if (error == 0)
2328		error = copyout(&sb, uap->ub, sizeof (sb));
2329	return (error);
2330}
2331
2332int
2333kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2334{
2335
2336	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2337	    sbp));
2338}
2339
2340/*
2341 * Implementation of the NetBSD [l]stat() functions.
2342 */
2343void
2344cvtnstat(sb, nsb)
2345	struct stat *sb;
2346	struct nstat *nsb;
2347{
2348
2349	bzero(nsb, sizeof *nsb);
2350	nsb->st_dev = sb->st_dev;
2351	nsb->st_ino = sb->st_ino;
2352	nsb->st_mode = sb->st_mode;
2353	nsb->st_nlink = sb->st_nlink;
2354	nsb->st_uid = sb->st_uid;
2355	nsb->st_gid = sb->st_gid;
2356	nsb->st_rdev = sb->st_rdev;
2357	nsb->st_atim = sb->st_atim;
2358	nsb->st_mtim = sb->st_mtim;
2359	nsb->st_ctim = sb->st_ctim;
2360	nsb->st_size = sb->st_size;
2361	nsb->st_blocks = sb->st_blocks;
2362	nsb->st_blksize = sb->st_blksize;
2363	nsb->st_flags = sb->st_flags;
2364	nsb->st_gen = sb->st_gen;
2365	nsb->st_birthtim = sb->st_birthtim;
2366}
2367
2368#ifndef _SYS_SYSPROTO_H_
2369struct nstat_args {
2370	char	*path;
2371	struct nstat *ub;
2372};
2373#endif
2374int
2375sys_nstat(td, uap)
2376	struct thread *td;
2377	register struct nstat_args /* {
2378		char *path;
2379		struct nstat *ub;
2380	} */ *uap;
2381{
2382	struct stat sb;
2383	struct nstat nsb;
2384	int error;
2385
2386	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2387	if (error != 0)
2388		return (error);
2389	cvtnstat(&sb, &nsb);
2390	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2391}
2392
2393/*
2394 * NetBSD lstat.  Get file status; this version does not follow links.
2395 */
2396#ifndef _SYS_SYSPROTO_H_
2397struct lstat_args {
2398	char	*path;
2399	struct stat *ub;
2400};
2401#endif
2402int
2403sys_nlstat(td, uap)
2404	struct thread *td;
2405	register struct nlstat_args /* {
2406		char *path;
2407		struct nstat *ub;
2408	} */ *uap;
2409{
2410	struct stat sb;
2411	struct nstat nsb;
2412	int error;
2413
2414	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2415	if (error != 0)
2416		return (error);
2417	cvtnstat(&sb, &nsb);
2418	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2419}
2420
2421/*
2422 * Get configurable pathname variables.
2423 */
2424#ifndef _SYS_SYSPROTO_H_
2425struct pathconf_args {
2426	char	*path;
2427	int	name;
2428};
2429#endif
2430int
2431sys_pathconf(td, uap)
2432	struct thread *td;
2433	register struct pathconf_args /* {
2434		char *path;
2435		int name;
2436	} */ *uap;
2437{
2438
2439	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2440}
2441
2442#ifndef _SYS_SYSPROTO_H_
2443struct lpathconf_args {
2444	char	*path;
2445	int	name;
2446};
2447#endif
2448int
2449sys_lpathconf(td, uap)
2450	struct thread *td;
2451	register struct lpathconf_args /* {
2452		char *path;
2453		int name;
2454	} */ *uap;
2455{
2456
2457	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2458	    NOFOLLOW));
2459}
2460
2461int
2462kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2463    u_long flags)
2464{
2465	struct nameidata nd;
2466	int error;
2467
2468	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2469	    pathseg, path, td);
2470	if ((error = namei(&nd)) != 0)
2471		return (error);
2472	NDFREE(&nd, NDF_ONLY_PNBUF);
2473
2474	/* If asynchronous I/O is available, it works for all files. */
2475	if (name == _PC_ASYNC_IO)
2476		td->td_retval[0] = async_io_version;
2477	else
2478		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2479	vput(nd.ni_vp);
2480	return (error);
2481}
2482
2483/*
2484 * Return target name of a symbolic link.
2485 */
2486#ifndef _SYS_SYSPROTO_H_
2487struct readlink_args {
2488	char	*path;
2489	char	*buf;
2490	size_t	count;
2491};
2492#endif
2493int
2494sys_readlink(td, uap)
2495	struct thread *td;
2496	register struct readlink_args /* {
2497		char *path;
2498		char *buf;
2499		size_t count;
2500	} */ *uap;
2501{
2502
2503	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2504	    UIO_USERSPACE, uap->count));
2505}
2506#ifndef _SYS_SYSPROTO_H_
2507struct readlinkat_args {
2508	int	fd;
2509	char	*path;
2510	char	*buf;
2511	size_t	bufsize;
2512};
2513#endif
2514int
2515sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2516{
2517
2518	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2519	    uap->buf, UIO_USERSPACE, uap->bufsize));
2520}
2521
2522int
2523kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2524    enum uio_seg bufseg, size_t count)
2525{
2526
2527	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2528	    count));
2529}
2530
2531int
2532kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2533    char *buf, enum uio_seg bufseg, size_t count)
2534{
2535	struct vnode *vp;
2536	struct iovec aiov;
2537	struct uio auio;
2538	struct nameidata nd;
2539	int error;
2540
2541	if (count > IOSIZE_MAX)
2542		return (EINVAL);
2543
2544	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2545	    pathseg, path, fd, td);
2546
2547	if ((error = namei(&nd)) != 0)
2548		return (error);
2549	NDFREE(&nd, NDF_ONLY_PNBUF);
2550	vp = nd.ni_vp;
2551#ifdef MAC
2552	error = mac_vnode_check_readlink(td->td_ucred, vp);
2553	if (error != 0) {
2554		vput(vp);
2555		return (error);
2556	}
2557#endif
2558	if (vp->v_type != VLNK)
2559		error = EINVAL;
2560	else {
2561		aiov.iov_base = buf;
2562		aiov.iov_len = count;
2563		auio.uio_iov = &aiov;
2564		auio.uio_iovcnt = 1;
2565		auio.uio_offset = 0;
2566		auio.uio_rw = UIO_READ;
2567		auio.uio_segflg = bufseg;
2568		auio.uio_td = td;
2569		auio.uio_resid = count;
2570		error = VOP_READLINK(vp, &auio, td->td_ucred);
2571		td->td_retval[0] = count - auio.uio_resid;
2572	}
2573	vput(vp);
2574	return (error);
2575}
2576
2577/*
2578 * Common implementation code for chflags() and fchflags().
2579 */
2580static int
2581setfflags(td, vp, flags)
2582	struct thread *td;
2583	struct vnode *vp;
2584	u_long flags;
2585{
2586	struct mount *mp;
2587	struct vattr vattr;
2588	int error;
2589
2590	/* We can't support the value matching VNOVAL. */
2591	if (flags == VNOVAL)
2592		return (EOPNOTSUPP);
2593
2594	/*
2595	 * Prevent non-root users from setting flags on devices.  When
2596	 * a device is reused, users can retain ownership of the device
2597	 * if they are allowed to set flags and programs assume that
2598	 * chown can't fail when done as root.
2599	 */
2600	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2601		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2602		if (error != 0)
2603			return (error);
2604	}
2605
2606	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2607		return (error);
2608	VATTR_NULL(&vattr);
2609	vattr.va_flags = flags;
2610	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2611#ifdef MAC
2612	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2613	if (error == 0)
2614#endif
2615		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2616	VOP_UNLOCK(vp, 0);
2617	vn_finished_write(mp);
2618	return (error);
2619}
2620
2621/*
2622 * Change flags of a file given a path name.
2623 */
2624#ifndef _SYS_SYSPROTO_H_
2625struct chflags_args {
2626	const char *path;
2627	u_long	flags;
2628};
2629#endif
2630int
2631sys_chflags(td, uap)
2632	struct thread *td;
2633	register struct chflags_args /* {
2634		const char *path;
2635		u_long flags;
2636	} */ *uap;
2637{
2638
2639	return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
2640}
2641
2642#ifndef _SYS_SYSPROTO_H_
2643struct chflagsat_args {
2644	int	fd;
2645	const char *path;
2646	u_long	flags;
2647	int	atflag;
2648}
2649#endif
2650int
2651sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2652{
2653	int fd = uap->fd;
2654	const char *path = uap->path;
2655	u_long flags = uap->flags;
2656	int atflag = uap->atflag;
2657
2658	if (atflag & ~AT_SYMLINK_NOFOLLOW)
2659		return (EINVAL);
2660
2661	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2662}
2663
2664static int
2665kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
2666    u_long flags)
2667{
2668
2669	return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
2670}
2671
2672/*
2673 * Same as chflags() but doesn't follow symlinks.
2674 */
2675int
2676sys_lchflags(td, uap)
2677	struct thread *td;
2678	register struct lchflags_args /* {
2679		const char *path;
2680		u_long flags;
2681	} */ *uap;
2682{
2683
2684	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2685	    uap->flags, AT_SYMLINK_NOFOLLOW));
2686}
2687
2688static int
2689kern_chflagsat(struct thread *td, int fd, const char *path,
2690    enum uio_seg pathseg, u_long flags, int atflag)
2691{
2692	struct nameidata nd;
2693	cap_rights_t rights;
2694	int error, follow;
2695
2696	AUDIT_ARG_FFLAGS(flags);
2697	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2698	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2699	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
2700	if ((error = namei(&nd)) != 0)
2701		return (error);
2702	NDFREE(&nd, NDF_ONLY_PNBUF);
2703	error = setfflags(td, nd.ni_vp, flags);
2704	vrele(nd.ni_vp);
2705	return (error);
2706}
2707
2708/*
2709 * Change flags of a file given a file descriptor.
2710 */
2711#ifndef _SYS_SYSPROTO_H_
2712struct fchflags_args {
2713	int	fd;
2714	u_long	flags;
2715};
2716#endif
2717int
2718sys_fchflags(td, uap)
2719	struct thread *td;
2720	register struct fchflags_args /* {
2721		int fd;
2722		u_long flags;
2723	} */ *uap;
2724{
2725	struct file *fp;
2726	cap_rights_t rights;
2727	int error;
2728
2729	AUDIT_ARG_FD(uap->fd);
2730	AUDIT_ARG_FFLAGS(uap->flags);
2731	error = getvnode(td->td_proc->p_fd, uap->fd,
2732	    cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
2733	if (error != 0)
2734		return (error);
2735#ifdef AUDIT
2736	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2737	AUDIT_ARG_VNODE1(fp->f_vnode);
2738	VOP_UNLOCK(fp->f_vnode, 0);
2739#endif
2740	error = setfflags(td, fp->f_vnode, uap->flags);
2741	fdrop(fp, td);
2742	return (error);
2743}
2744
2745/*
2746 * Common implementation code for chmod(), lchmod() and fchmod().
2747 */
2748int
2749setfmode(td, cred, vp, mode)
2750	struct thread *td;
2751	struct ucred *cred;
2752	struct vnode *vp;
2753	int mode;
2754{
2755	struct mount *mp;
2756	struct vattr vattr;
2757	int error;
2758
2759	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2760		return (error);
2761	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2762	VATTR_NULL(&vattr);
2763	vattr.va_mode = mode & ALLPERMS;
2764#ifdef MAC
2765	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2766	if (error == 0)
2767#endif
2768		error = VOP_SETATTR(vp, &vattr, cred);
2769	VOP_UNLOCK(vp, 0);
2770	vn_finished_write(mp);
2771	return (error);
2772}
2773
2774/*
2775 * Change mode of a file given path name.
2776 */
2777#ifndef _SYS_SYSPROTO_H_
2778struct chmod_args {
2779	char	*path;
2780	int	mode;
2781};
2782#endif
2783int
2784sys_chmod(td, uap)
2785	struct thread *td;
2786	register struct chmod_args /* {
2787		char *path;
2788		int mode;
2789	} */ *uap;
2790{
2791
2792	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2793}
2794
2795#ifndef _SYS_SYSPROTO_H_
2796struct fchmodat_args {
2797	int	dirfd;
2798	char	*path;
2799	mode_t	mode;
2800	int	flag;
2801}
2802#endif
2803int
2804sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2805{
2806	int flag = uap->flag;
2807	int fd = uap->fd;
2808	char *path = uap->path;
2809	mode_t mode = uap->mode;
2810
2811	if (flag & ~AT_SYMLINK_NOFOLLOW)
2812		return (EINVAL);
2813
2814	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2815}
2816
2817int
2818kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2819{
2820
2821	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2822}
2823
2824/*
2825 * Change mode of a file given path name (don't follow links.)
2826 */
2827#ifndef _SYS_SYSPROTO_H_
2828struct lchmod_args {
2829	char	*path;
2830	int	mode;
2831};
2832#endif
2833int
2834sys_lchmod(td, uap)
2835	struct thread *td;
2836	register struct lchmod_args /* {
2837		char *path;
2838		int mode;
2839	} */ *uap;
2840{
2841
2842	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2843	    uap->mode, AT_SYMLINK_NOFOLLOW));
2844}
2845
2846int
2847kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2848    mode_t mode, int flag)
2849{
2850	struct nameidata nd;
2851	cap_rights_t rights;
2852	int error, follow;
2853
2854	AUDIT_ARG_MODE(mode);
2855	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2856	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2857	    cap_rights_init(&rights, CAP_FCHMOD), td);
2858	if ((error = namei(&nd)) != 0)
2859		return (error);
2860	NDFREE(&nd, NDF_ONLY_PNBUF);
2861	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2862	vrele(nd.ni_vp);
2863	return (error);
2864}
2865
2866/*
2867 * Change mode of a file given a file descriptor.
2868 */
2869#ifndef _SYS_SYSPROTO_H_
2870struct fchmod_args {
2871	int	fd;
2872	int	mode;
2873};
2874#endif
2875int
2876sys_fchmod(struct thread *td, struct fchmod_args *uap)
2877{
2878	struct file *fp;
2879	cap_rights_t rights;
2880	int error;
2881
2882	AUDIT_ARG_FD(uap->fd);
2883	AUDIT_ARG_MODE(uap->mode);
2884
2885	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2886	if (error != 0)
2887		return (error);
2888	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2889	fdrop(fp, td);
2890	return (error);
2891}
2892
2893/*
2894 * Common implementation for chown(), lchown(), and fchown()
2895 */
2896int
2897setfown(td, cred, vp, uid, gid)
2898	struct thread *td;
2899	struct ucred *cred;
2900	struct vnode *vp;
2901	uid_t uid;
2902	gid_t gid;
2903{
2904	struct mount *mp;
2905	struct vattr vattr;
2906	int error;
2907
2908	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2909		return (error);
2910	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2911	VATTR_NULL(&vattr);
2912	vattr.va_uid = uid;
2913	vattr.va_gid = gid;
2914#ifdef MAC
2915	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2916	    vattr.va_gid);
2917	if (error == 0)
2918#endif
2919		error = VOP_SETATTR(vp, &vattr, cred);
2920	VOP_UNLOCK(vp, 0);
2921	vn_finished_write(mp);
2922	return (error);
2923}
2924
2925/*
2926 * Set ownership given a path name.
2927 */
2928#ifndef _SYS_SYSPROTO_H_
2929struct chown_args {
2930	char	*path;
2931	int	uid;
2932	int	gid;
2933};
2934#endif
2935int
2936sys_chown(td, uap)
2937	struct thread *td;
2938	register struct chown_args /* {
2939		char *path;
2940		int uid;
2941		int gid;
2942	} */ *uap;
2943{
2944
2945	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
2946}
2947
2948#ifndef _SYS_SYSPROTO_H_
2949struct fchownat_args {
2950	int fd;
2951	const char * path;
2952	uid_t uid;
2953	gid_t gid;
2954	int flag;
2955};
2956#endif
2957int
2958sys_fchownat(struct thread *td, struct fchownat_args *uap)
2959{
2960	int flag;
2961
2962	flag = uap->flag;
2963	if (flag & ~AT_SYMLINK_NOFOLLOW)
2964		return (EINVAL);
2965
2966	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2967	    uap->gid, uap->flag));
2968}
2969
2970int
2971kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
2972    int gid)
2973{
2974
2975	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
2976}
2977
2978int
2979kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2980    int uid, int gid, int flag)
2981{
2982	struct nameidata nd;
2983	cap_rights_t rights;
2984	int error, follow;
2985
2986	AUDIT_ARG_OWNER(uid, gid);
2987	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2988	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2989	    cap_rights_init(&rights, CAP_FCHOWN), td);
2990
2991	if ((error = namei(&nd)) != 0)
2992		return (error);
2993	NDFREE(&nd, NDF_ONLY_PNBUF);
2994	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
2995	vrele(nd.ni_vp);
2996	return (error);
2997}
2998
2999/*
3000 * Set ownership given a path name, do not cross symlinks.
3001 */
3002#ifndef _SYS_SYSPROTO_H_
3003struct lchown_args {
3004	char	*path;
3005	int	uid;
3006	int	gid;
3007};
3008#endif
3009int
3010sys_lchown(td, uap)
3011	struct thread *td;
3012	register struct lchown_args /* {
3013		char *path;
3014		int uid;
3015		int gid;
3016	} */ *uap;
3017{
3018
3019	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3020}
3021
3022int
3023kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3024    int gid)
3025{
3026
3027	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3028	    AT_SYMLINK_NOFOLLOW));
3029}
3030
3031/*
3032 * Set ownership given a file descriptor.
3033 */
3034#ifndef _SYS_SYSPROTO_H_
3035struct fchown_args {
3036	int	fd;
3037	int	uid;
3038	int	gid;
3039};
3040#endif
3041int
3042sys_fchown(td, uap)
3043	struct thread *td;
3044	register struct fchown_args /* {
3045		int fd;
3046		int uid;
3047		int gid;
3048	} */ *uap;
3049{
3050	struct file *fp;
3051	cap_rights_t rights;
3052	int error;
3053
3054	AUDIT_ARG_FD(uap->fd);
3055	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3056	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
3057	if (error != 0)
3058		return (error);
3059	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3060	fdrop(fp, td);
3061	return (error);
3062}
3063
3064/*
3065 * Common implementation code for utimes(), lutimes(), and futimes().
3066 */
3067static int
3068getutimes(usrtvp, tvpseg, tsp)
3069	const struct timeval *usrtvp;
3070	enum uio_seg tvpseg;
3071	struct timespec *tsp;
3072{
3073	struct timeval tv[2];
3074	const struct timeval *tvp;
3075	int error;
3076
3077	if (usrtvp == NULL) {
3078		vfs_timestamp(&tsp[0]);
3079		tsp[1] = tsp[0];
3080	} else {
3081		if (tvpseg == UIO_SYSSPACE) {
3082			tvp = usrtvp;
3083		} else {
3084			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3085				return (error);
3086			tvp = tv;
3087		}
3088
3089		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3090		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3091			return (EINVAL);
3092		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3093		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3094	}
3095	return (0);
3096}
3097
3098/*
3099 * Common implementation code for utimes(), lutimes(), and futimes().
3100 */
3101static int
3102setutimes(td, vp, ts, numtimes, nullflag)
3103	struct thread *td;
3104	struct vnode *vp;
3105	const struct timespec *ts;
3106	int numtimes;
3107	int nullflag;
3108{
3109	struct mount *mp;
3110	struct vattr vattr;
3111	int error, setbirthtime;
3112
3113	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3114		return (error);
3115	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3116	setbirthtime = 0;
3117	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3118	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3119		setbirthtime = 1;
3120	VATTR_NULL(&vattr);
3121	vattr.va_atime = ts[0];
3122	vattr.va_mtime = ts[1];
3123	if (setbirthtime)
3124		vattr.va_birthtime = ts[1];
3125	if (numtimes > 2)
3126		vattr.va_birthtime = ts[2];
3127	if (nullflag)
3128		vattr.va_vaflags |= VA_UTIMES_NULL;
3129#ifdef MAC
3130	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3131	    vattr.va_mtime);
3132#endif
3133	if (error == 0)
3134		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3135	VOP_UNLOCK(vp, 0);
3136	vn_finished_write(mp);
3137	return (error);
3138}
3139
3140/*
3141 * Set the access and modification times of a file.
3142 */
3143#ifndef _SYS_SYSPROTO_H_
3144struct utimes_args {
3145	char	*path;
3146	struct	timeval *tptr;
3147};
3148#endif
3149int
3150sys_utimes(td, uap)
3151	struct thread *td;
3152	register struct utimes_args /* {
3153		char *path;
3154		struct timeval *tptr;
3155	} */ *uap;
3156{
3157
3158	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3159	    UIO_USERSPACE));
3160}
3161
3162#ifndef _SYS_SYSPROTO_H_
3163struct futimesat_args {
3164	int fd;
3165	const char * path;
3166	const struct timeval * times;
3167};
3168#endif
3169int
3170sys_futimesat(struct thread *td, struct futimesat_args *uap)
3171{
3172
3173	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3174	    uap->times, UIO_USERSPACE));
3175}
3176
3177int
3178kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3179    struct timeval *tptr, enum uio_seg tptrseg)
3180{
3181
3182	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3183}
3184
3185int
3186kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3187    struct timeval *tptr, enum uio_seg tptrseg)
3188{
3189	struct nameidata nd;
3190	struct timespec ts[2];
3191	cap_rights_t rights;
3192	int error;
3193
3194	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3195		return (error);
3196	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3197	    cap_rights_init(&rights, CAP_FUTIMES), td);
3198
3199	if ((error = namei(&nd)) != 0)
3200		return (error);
3201	NDFREE(&nd, NDF_ONLY_PNBUF);
3202	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3203	vrele(nd.ni_vp);
3204	return (error);
3205}
3206
3207/*
3208 * Set the access and modification times of a file.
3209 */
3210#ifndef _SYS_SYSPROTO_H_
3211struct lutimes_args {
3212	char	*path;
3213	struct	timeval *tptr;
3214};
3215#endif
3216int
3217sys_lutimes(td, uap)
3218	struct thread *td;
3219	register struct lutimes_args /* {
3220		char *path;
3221		struct timeval *tptr;
3222	} */ *uap;
3223{
3224
3225	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3226	    UIO_USERSPACE));
3227}
3228
3229int
3230kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3231    struct timeval *tptr, enum uio_seg tptrseg)
3232{
3233	struct timespec ts[2];
3234	struct nameidata nd;
3235	int error;
3236
3237	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3238		return (error);
3239	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3240	if ((error = namei(&nd)) != 0)
3241		return (error);
3242	NDFREE(&nd, NDF_ONLY_PNBUF);
3243	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3244	vrele(nd.ni_vp);
3245	return (error);
3246}
3247
3248/*
3249 * Set the access and modification times of a file.
3250 */
3251#ifndef _SYS_SYSPROTO_H_
3252struct futimes_args {
3253	int	fd;
3254	struct	timeval *tptr;
3255};
3256#endif
3257int
3258sys_futimes(td, uap)
3259	struct thread *td;
3260	register struct futimes_args /* {
3261		int  fd;
3262		struct timeval *tptr;
3263	} */ *uap;
3264{
3265
3266	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3267}
3268
3269int
3270kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3271    enum uio_seg tptrseg)
3272{
3273	struct timespec ts[2];
3274	struct file *fp;
3275	cap_rights_t rights;
3276	int error;
3277
3278	AUDIT_ARG_FD(fd);
3279	error = getutimes(tptr, tptrseg, ts);
3280	if (error != 0)
3281		return (error);
3282	error = getvnode(td->td_proc->p_fd, fd,
3283	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
3284	if (error != 0)
3285		return (error);
3286#ifdef AUDIT
3287	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3288	AUDIT_ARG_VNODE1(fp->f_vnode);
3289	VOP_UNLOCK(fp->f_vnode, 0);
3290#endif
3291	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3292	fdrop(fp, td);
3293	return (error);
3294}
3295
3296/*
3297 * Truncate a file given its path name.
3298 */
3299#ifndef _SYS_SYSPROTO_H_
3300struct truncate_args {
3301	char	*path;
3302	int	pad;
3303	off_t	length;
3304};
3305#endif
3306int
3307sys_truncate(td, uap)
3308	struct thread *td;
3309	register struct truncate_args /* {
3310		char *path;
3311		int pad;
3312		off_t length;
3313	} */ *uap;
3314{
3315
3316	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3317}
3318
3319int
3320kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3321{
3322	struct mount *mp;
3323	struct vnode *vp;
3324	void *rl_cookie;
3325	struct vattr vattr;
3326	struct nameidata nd;
3327	int error;
3328
3329	if (length < 0)
3330		return(EINVAL);
3331	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3332	if ((error = namei(&nd)) != 0)
3333		return (error);
3334	vp = nd.ni_vp;
3335	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3336	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3337		vn_rangelock_unlock(vp, rl_cookie);
3338		vrele(vp);
3339		return (error);
3340	}
3341	NDFREE(&nd, NDF_ONLY_PNBUF);
3342	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3343	if (vp->v_type == VDIR)
3344		error = EISDIR;
3345#ifdef MAC
3346	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3347	}
3348#endif
3349	else if ((error = vn_writechk(vp)) == 0 &&
3350	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3351		VATTR_NULL(&vattr);
3352		vattr.va_size = length;
3353		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3354	}
3355	VOP_UNLOCK(vp, 0);
3356	vn_finished_write(mp);
3357	vn_rangelock_unlock(vp, rl_cookie);
3358	vrele(vp);
3359	return (error);
3360}
3361
#if defined(COMPAT_43)
/*
 * Truncate a file given its path name.
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
/*
 * COMPAT_43 entry point taking a `long' length: repackage the arguments as
 * a struct truncate_args and call sys_truncate().  The pad member of the
 * new argument structure is left unset.
 */
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args nuap;

	nuap.path = uap->path;
	nuap.length = uap->length;
	return (sys_truncate(td, &nuap));
}
#endif /* COMPAT_43 */
3391
3392/* Versions with the pad argument */
3393int
3394freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3395{
3396	struct truncate_args ouap;
3397
3398	ouap.path = uap->path;
3399	ouap.length = uap->length;
3400	return (sys_truncate(td, &ouap));
3401}
3402
3403int
3404freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3405{
3406	struct ftruncate_args ouap;
3407
3408	ouap.fd = uap->fd;
3409	ouap.length = uap->length;
3410	return (sys_ftruncate(td, &ouap));
3411}
3412
3413/*
3414 * Sync an open file.
3415 */
3416#ifndef _SYS_SYSPROTO_H_
3417struct fsync_args {
3418	int	fd;
3419};
3420#endif
3421int
3422sys_fsync(td, uap)
3423	struct thread *td;
3424	struct fsync_args /* {
3425		int fd;
3426	} */ *uap;
3427{
3428	struct vnode *vp;
3429	struct mount *mp;
3430	struct file *fp;
3431	cap_rights_t rights;
3432	int error, lock_flags;
3433
3434	AUDIT_ARG_FD(uap->fd);
3435	error = getvnode(td->td_proc->p_fd, uap->fd,
3436	    cap_rights_init(&rights, CAP_FSYNC), &fp);
3437	if (error != 0)
3438		return (error);
3439	vp = fp->f_vnode;
3440	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3441	if (error != 0)
3442		goto drop;
3443	if (MNT_SHARED_WRITES(mp) ||
3444	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3445		lock_flags = LK_SHARED;
3446	} else {
3447		lock_flags = LK_EXCLUSIVE;
3448	}
3449	vn_lock(vp, lock_flags | LK_RETRY);
3450	AUDIT_ARG_VNODE1(vp);
3451	if (vp->v_object != NULL) {
3452		VM_OBJECT_WLOCK(vp->v_object);
3453		vm_object_page_clean(vp->v_object, 0, 0, 0);
3454		VM_OBJECT_WUNLOCK(vp->v_object);
3455	}
3456	error = VOP_FSYNC(vp, MNT_WAIT, td);
3457
3458	VOP_UNLOCK(vp, 0);
3459	vn_finished_write(mp);
3460drop:
3461	fdrop(fp, td);
3462	return (error);
3463}
3464
3465/*
3466 * Rename files.  Source and destination must either both be directories, or
3467 * both not be directories.  If target is a directory, it must be empty.
3468 */
3469#ifndef _SYS_SYSPROTO_H_
3470struct rename_args {
3471	char	*from;
3472	char	*to;
3473};
3474#endif
3475int
3476sys_rename(td, uap)
3477	struct thread *td;
3478	register struct rename_args /* {
3479		char *from;
3480		char *to;
3481	} */ *uap;
3482{
3483
3484	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3485}
3486
3487#ifndef _SYS_SYSPROTO_H_
3488struct renameat_args {
3489	int	oldfd;
3490	char	*old;
3491	int	newfd;
3492	char	*new;
3493};
3494#endif
3495int
3496sys_renameat(struct thread *td, struct renameat_args *uap)
3497{
3498
3499	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3500	    UIO_USERSPACE));
3501}
3502
3503int
3504kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3505{
3506
3507	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3508}
3509
3510int
3511kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3512    enum uio_seg pathseg)
3513{
3514	struct mount *mp = NULL;
3515	struct vnode *tvp, *fvp, *tdvp;
3516	struct nameidata fromnd, tond;
3517	cap_rights_t rights;
3518	int error;
3519
3520	bwillwrite();
3521#ifdef MAC
3522	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3523	    AUDITVNODE1, pathseg, old, oldfd,
3524	    cap_rights_init(&rights, CAP_RENAMEAT), td);
3525#else
3526	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3527	    pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
3528#endif
3529
3530	if ((error = namei(&fromnd)) != 0)
3531		return (error);
3532#ifdef MAC
3533	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3534	    fromnd.ni_vp, &fromnd.ni_cnd);
3535	VOP_UNLOCK(fromnd.ni_dvp, 0);
3536	if (fromnd.ni_dvp != fromnd.ni_vp)
3537		VOP_UNLOCK(fromnd.ni_vp, 0);
3538#endif
3539	fvp = fromnd.ni_vp;
3540	if (error == 0)
3541		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
3542	if (error != 0) {
3543		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3544		vrele(fromnd.ni_dvp);
3545		vrele(fvp);
3546		goto out1;
3547	}
3548	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3549	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3550	    cap_rights_init(&rights, CAP_LINKAT), td);
3551	if (fromnd.ni_vp->v_type == VDIR)
3552		tond.ni_cnd.cn_flags |= WILLBEDIR;
3553	if ((error = namei(&tond)) != 0) {
3554		/* Translate error code for rename("dir1", "dir2/."). */
3555		if (error == EISDIR && fvp->v_type == VDIR)
3556			error = EINVAL;
3557		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3558		vrele(fromnd.ni_dvp);
3559		vrele(fvp);
3560		vn_finished_write(mp);
3561		goto out1;
3562	}
3563	tdvp = tond.ni_dvp;
3564	tvp = tond.ni_vp;
3565	if (tvp != NULL) {
3566		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3567			error = ENOTDIR;
3568			goto out;
3569		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3570			error = EISDIR;
3571			goto out;
3572		}
3573#ifdef CAPABILITIES
3574		if (newfd != AT_FDCWD) {
3575			/*
3576			 * If the target already exists we require CAP_UNLINKAT
3577			 * from 'newfd'.
3578			 */
3579			error = cap_check(&tond.ni_filecaps.fc_rights,
3580			    cap_rights_init(&rights, CAP_UNLINKAT));
3581			if (error != 0)
3582				goto out;
3583		}
3584#endif
3585	}
3586	if (fvp == tdvp) {
3587		error = EINVAL;
3588		goto out;
3589	}
3590	/*
3591	 * If the source is the same as the destination (that is, if they
3592	 * are links to the same vnode), then there is nothing to do.
3593	 */
3594	if (fvp == tvp)
3595		error = -1;
3596#ifdef MAC
3597	else
3598		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3599		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3600#endif
3601out:
3602	if (error == 0) {
3603		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3604		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3605		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3606		NDFREE(&tond, NDF_ONLY_PNBUF);
3607	} else {
3608		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3609		NDFREE(&tond, NDF_ONLY_PNBUF);
3610		if (tvp != NULL)
3611			vput(tvp);
3612		if (tdvp == tvp)
3613			vrele(tdvp);
3614		else
3615			vput(tdvp);
3616		vrele(fromnd.ni_dvp);
3617		vrele(fvp);
3618	}
3619	vrele(tond.ni_startdir);
3620	vn_finished_write(mp);
3621out1:
3622	if (fromnd.ni_startdir)
3623		vrele(fromnd.ni_startdir);
3624	if (error == -1)
3625		return (0);
3626	return (error);
3627}
3628
3629/*
3630 * Make a directory file.
3631 */
3632#ifndef _SYS_SYSPROTO_H_
3633struct mkdir_args {
3634	char	*path;
3635	int	mode;
3636};
3637#endif
3638int
3639sys_mkdir(td, uap)
3640	struct thread *td;
3641	register struct mkdir_args /* {
3642		char *path;
3643		int mode;
3644	} */ *uap;
3645{
3646
3647	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3648}
3649
3650#ifndef _SYS_SYSPROTO_H_
3651struct mkdirat_args {
3652	int	fd;
3653	char	*path;
3654	mode_t	mode;
3655};
3656#endif
3657int
3658sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3659{
3660
3661	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3662}
3663
3664int
3665kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3666{
3667
3668	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3669}
3670
3671int
3672kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
3673    int mode)
3674{
3675	struct mount *mp;
3676	struct vnode *vp;
3677	struct vattr vattr;
3678	struct nameidata nd;
3679	cap_rights_t rights;
3680	int error;
3681
3682	AUDIT_ARG_MODE(mode);
3683restart:
3684	bwillwrite();
3685	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
3686	    segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT), td);
3687	nd.ni_cnd.cn_flags |= WILLBEDIR;
3688	if ((error = namei(&nd)) != 0)
3689		return (error);
3690	vp = nd.ni_vp;
3691	if (vp != NULL) {
3692		NDFREE(&nd, NDF_ONLY_PNBUF);
3693		/*
3694		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
3695		 * the strange behaviour of leaving the vnode unlocked
3696		 * if the target is the same vnode as the parent.
3697		 */
3698		if (vp == nd.ni_dvp)
3699			vrele(nd.ni_dvp);
3700		else
3701			vput(nd.ni_dvp);
3702		vrele(vp);
3703		return (EEXIST);
3704	}
3705	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3706		NDFREE(&nd, NDF_ONLY_PNBUF);
3707		vput(nd.ni_dvp);
3708		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3709			return (error);
3710		goto restart;
3711	}
3712	VATTR_NULL(&vattr);
3713	vattr.va_type = VDIR;
3714	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
3715#ifdef MAC
3716	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
3717	    &vattr);
3718	if (error != 0)
3719		goto out;
3720#endif
3721	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
3722#ifdef MAC
3723out:
3724#endif
3725	NDFREE(&nd, NDF_ONLY_PNBUF);
3726	vput(nd.ni_dvp);
3727	if (error == 0)
3728		vput(nd.ni_vp);
3729	vn_finished_write(mp);
3730	return (error);
3731}
3732
3733/*
3734 * Remove a directory file.
3735 */
3736#ifndef _SYS_SYSPROTO_H_
3737struct rmdir_args {
3738	char	*path;
3739};
3740#endif
3741int
3742sys_rmdir(td, uap)
3743	struct thread *td;
3744	struct rmdir_args /* {
3745		char *path;
3746	} */ *uap;
3747{
3748
3749	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3750}
3751
3752int
3753kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3754{
3755
3756	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3757}
3758
3759int
3760kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
3761{
3762	struct mount *mp;
3763	struct vnode *vp;
3764	struct nameidata nd;
3765	cap_rights_t rights;
3766	int error;
3767
3768restart:
3769	bwillwrite();
3770	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
3771	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
3772	if ((error = namei(&nd)) != 0)
3773		return (error);
3774	vp = nd.ni_vp;
3775	if (vp->v_type != VDIR) {
3776		error = ENOTDIR;
3777		goto out;
3778	}
3779	/*
3780	 * No rmdir "." please.
3781	 */
3782	if (nd.ni_dvp == vp) {
3783		error = EINVAL;
3784		goto out;
3785	}
3786	/*
3787	 * The root of a mounted filesystem cannot be deleted.
3788	 */
3789	if (vp->v_vflag & VV_ROOT) {
3790		error = EBUSY;
3791		goto out;
3792	}
3793#ifdef MAC
3794	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
3795	    &nd.ni_cnd);
3796	if (error != 0)
3797		goto out;
3798#endif
3799	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
3800		NDFREE(&nd, NDF_ONLY_PNBUF);
3801		vput(vp);
3802		if (nd.ni_dvp == vp)
3803			vrele(nd.ni_dvp);
3804		else
3805			vput(nd.ni_dvp);
3806		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
3807			return (error);
3808		goto restart;
3809	}
3810	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
3811	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
3812	vn_finished_write(mp);
3813out:
3814	NDFREE(&nd, NDF_ONLY_PNBUF);
3815	vput(vp);
3816	if (nd.ni_dvp == vp)
3817		vrele(nd.ni_dvp);
3818	else
3819		vput(nd.ni_dvp);
3820	return (error);
3821}
3822
3823#ifdef COMPAT_43
3824/*
3825 * Read a block of directory entries in a filesystem independent format.
3826 */
3827#ifndef _SYS_SYSPROTO_H_
3828struct ogetdirentries_args {
3829	int	fd;
3830	char	*buf;
3831	u_int	count;
3832	long	*basep;
3833};
3834#endif
3835int
3836ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3837{
3838	long loff;
3839	int error;
3840
3841	error = kern_ogetdirentries(td, uap, &loff);
3842	if (error == 0)
3843		error = copyout(&loff, uap->basep, sizeof(long));
3844	return (error);
3845}
3846
3847int
3848kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3849    long *ploff)
3850{
3851	struct vnode *vp;
3852	struct file *fp;
3853	struct uio auio, kuio;
3854	struct iovec aiov, kiov;
3855	struct dirent *dp, *edp;
3856	cap_rights_t rights;
3857	caddr_t dirbuf;
3858	int error, eofflag, readcnt;
3859	long loff;
3860	off_t foffset;
3861
3862	/* XXX arbitrary sanity limit on `count'. */
3863	if (uap->count > 64 * 1024)
3864		return (EINVAL);
3865	error = getvnode(td->td_proc->p_fd, uap->fd,
3866	    cap_rights_init(&rights, CAP_READ), &fp);
3867	if (error != 0)
3868		return (error);
3869	if ((fp->f_flag & FREAD) == 0) {
3870		fdrop(fp, td);
3871		return (EBADF);
3872	}
3873	vp = fp->f_vnode;
3874	foffset = foffset_lock(fp, 0);
3875unionread:
3876	if (vp->v_type != VDIR) {
3877		foffset_unlock(fp, foffset, 0);
3878		fdrop(fp, td);
3879		return (EINVAL);
3880	}
3881	aiov.iov_base = uap->buf;
3882	aiov.iov_len = uap->count;
3883	auio.uio_iov = &aiov;
3884	auio.uio_iovcnt = 1;
3885	auio.uio_rw = UIO_READ;
3886	auio.uio_segflg = UIO_USERSPACE;
3887	auio.uio_td = td;
3888	auio.uio_resid = uap->count;
3889	vn_lock(vp, LK_SHARED | LK_RETRY);
3890	loff = auio.uio_offset = foffset;
3891#ifdef MAC
3892	error = mac_vnode_check_readdir(td->td_ucred, vp);
3893	if (error != 0) {
3894		VOP_UNLOCK(vp, 0);
3895		foffset_unlock(fp, foffset, FOF_NOUPDATE);
3896		fdrop(fp, td);
3897		return (error);
3898	}
3899#endif
3900#	if (BYTE_ORDER != LITTLE_ENDIAN)
3901		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
3902			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
3903			    NULL, NULL);
3904			foffset = auio.uio_offset;
3905		} else
3906#	endif
3907	{
3908		kuio = auio;
3909		kuio.uio_iov = &kiov;
3910		kuio.uio_segflg = UIO_SYSSPACE;
3911		kiov.iov_len = uap->count;
3912		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
3913		kiov.iov_base = dirbuf;
3914		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
3915			    NULL, NULL);
3916		foffset = kuio.uio_offset;
3917		if (error == 0) {
3918			readcnt = uap->count - kuio.uio_resid;
3919			edp = (struct dirent *)&dirbuf[readcnt];
3920			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
3921#				if (BYTE_ORDER == LITTLE_ENDIAN)
3922					/*
3923					 * The expected low byte of
3924					 * dp->d_namlen is our dp->d_type.
3925					 * The high MBZ byte of dp->d_namlen
3926					 * is our dp->d_namlen.
3927					 */
3928					dp->d_type = dp->d_namlen;
3929					dp->d_namlen = 0;
3930#				else
3931					/*
3932					 * The dp->d_type is the high byte
3933					 * of the expected dp->d_namlen,
3934					 * so must be zero'ed.
3935					 */
3936					dp->d_type = 0;
3937#				endif
3938				if (dp->d_reclen > 0) {
3939					dp = (struct dirent *)
3940					    ((char *)dp + dp->d_reclen);
3941				} else {
3942					error = EIO;
3943					break;
3944				}
3945			}
3946			if (dp >= edp)
3947				error = uiomove(dirbuf, readcnt, &auio);
3948		}
3949		free(dirbuf, M_TEMP);
3950	}
3951	if (error != 0) {
3952		VOP_UNLOCK(vp, 0);
3953		foffset_unlock(fp, foffset, 0);
3954		fdrop(fp, td);
3955		return (error);
3956	}
3957	if (uap->count == auio.uio_resid &&
3958	    (vp->v_vflag & VV_ROOT) &&
3959	    (vp->v_mount->mnt_flag & MNT_UNION)) {
3960		struct vnode *tvp = vp;
3961		vp = vp->v_mount->mnt_vnodecovered;
3962		VREF(vp);
3963		fp->f_vnode = vp;
3964		fp->f_data = vp;
3965		foffset = 0;
3966		vput(tvp);
3967		goto unionread;
3968	}
3969	VOP_UNLOCK(vp, 0);
3970	foffset_unlock(fp, foffset, 0);
3971	fdrop(fp, td);
3972	td->td_retval[0] = uap->count - auio.uio_resid;
3973	if (error == 0)
3974		*ploff = loff;
3975	return (error);
3976}
3977#endif /* COMPAT_43 */
3978
3979/*
3980 * Read a block of directory entries in a filesystem independent format.
3981 */
3982#ifndef _SYS_SYSPROTO_H_
3983struct getdirentries_args {
3984	int	fd;
3985	char	*buf;
3986	u_int	count;
3987	long	*basep;
3988};
3989#endif
3990int
3991sys_getdirentries(td, uap)
3992	struct thread *td;
3993	register struct getdirentries_args /* {
3994		int fd;
3995		char *buf;
3996		u_int count;
3997		long *basep;
3998	} */ *uap;
3999{
4000	long base;
4001	int error;
4002
4003	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
4004	    NULL, UIO_USERSPACE);
4005	if (error != 0)
4006		return (error);
4007	if (uap->basep != NULL)
4008		error = copyout(&base, uap->basep, sizeof(long));
4009	return (error);
4010}
4011
4012int
4013kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4014    long *basep, ssize_t *residp, enum uio_seg bufseg)
4015{
4016	struct vnode *vp;
4017	struct file *fp;
4018	struct uio auio;
4019	struct iovec aiov;
4020	cap_rights_t rights;
4021	long loff;
4022	int error, eofflag;
4023	off_t foffset;
4024
4025	AUDIT_ARG_FD(fd);
4026	if (count > IOSIZE_MAX)
4027		return (EINVAL);
4028	auio.uio_resid = count;
4029	error = getvnode(td->td_proc->p_fd, fd,
4030	    cap_rights_init(&rights, CAP_READ), &fp);
4031	if (error != 0)
4032		return (error);
4033	if ((fp->f_flag & FREAD) == 0) {
4034		fdrop(fp, td);
4035		return (EBADF);
4036	}
4037	vp = fp->f_vnode;
4038	foffset = foffset_lock(fp, 0);
4039unionread:
4040	if (vp->v_type != VDIR) {
4041		error = EINVAL;
4042		goto fail;
4043	}
4044	aiov.iov_base = buf;
4045	aiov.iov_len = count;
4046	auio.uio_iov = &aiov;
4047	auio.uio_iovcnt = 1;
4048	auio.uio_rw = UIO_READ;
4049	auio.uio_segflg = bufseg;
4050	auio.uio_td = td;
4051	vn_lock(vp, LK_SHARED | LK_RETRY);
4052	AUDIT_ARG_VNODE1(vp);
4053	loff = auio.uio_offset = foffset;
4054#ifdef MAC
4055	error = mac_vnode_check_readdir(td->td_ucred, vp);
4056	if (error == 0)
4057#endif
4058		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4059		    NULL);
4060	foffset = auio.uio_offset;
4061	if (error != 0) {
4062		VOP_UNLOCK(vp, 0);
4063		goto fail;
4064	}
4065	if (count == auio.uio_resid &&
4066	    (vp->v_vflag & VV_ROOT) &&
4067	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4068		struct vnode *tvp = vp;
4069
4070		vp = vp->v_mount->mnt_vnodecovered;
4071		VREF(vp);
4072		fp->f_vnode = vp;
4073		fp->f_data = vp;
4074		foffset = 0;
4075		vput(tvp);
4076		goto unionread;
4077	}
4078	VOP_UNLOCK(vp, 0);
4079	*basep = loff;
4080	if (residp != NULL)
4081		*residp = auio.uio_resid;
4082	td->td_retval[0] = count - auio.uio_resid;
4083fail:
4084	foffset_unlock(fp, foffset, 0);
4085	fdrop(fp, td);
4086	return (error);
4087}
4088
4089#ifndef _SYS_SYSPROTO_H_
4090struct getdents_args {
4091	int fd;
4092	char *buf;
4093	size_t count;
4094};
4095#endif
4096int
4097sys_getdents(td, uap)
4098	struct thread *td;
4099	register struct getdents_args /* {
4100		int fd;
4101		char *buf;
4102		u_int count;
4103	} */ *uap;
4104{
4105	struct getdirentries_args ap;
4106
4107	ap.fd = uap->fd;
4108	ap.buf = uap->buf;
4109	ap.count = uap->count;
4110	ap.basep = NULL;
4111	return (sys_getdirentries(td, &ap));
4112}
4113
4114/*
4115 * Set the mode mask for creation of filesystem nodes.
4116 */
4117#ifndef _SYS_SYSPROTO_H_
4118struct umask_args {
4119	int	newmask;
4120};
4121#endif
4122int
4123sys_umask(td, uap)
4124	struct thread *td;
4125	struct umask_args /* {
4126		int newmask;
4127	} */ *uap;
4128{
4129	register struct filedesc *fdp;
4130
4131	FILEDESC_XLOCK(td->td_proc->p_fd);
4132	fdp = td->td_proc->p_fd;
4133	td->td_retval[0] = fdp->fd_cmask;
4134	fdp->fd_cmask = uap->newmask & ALLPERMS;
4135	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4136	return (0);
4137}
4138
4139/*
4140 * Void all references to file by ripping underlying filesystem away from
4141 * vnode.
4142 */
4143#ifndef _SYS_SYSPROTO_H_
4144struct revoke_args {
4145	char	*path;
4146};
4147#endif
4148int
4149sys_revoke(td, uap)
4150	struct thread *td;
4151	register struct revoke_args /* {
4152		char *path;
4153	} */ *uap;
4154{
4155	struct vnode *vp;
4156	struct vattr vattr;
4157	struct nameidata nd;
4158	int error;
4159
4160	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4161	    uap->path, td);
4162	if ((error = namei(&nd)) != 0)
4163		return (error);
4164	vp = nd.ni_vp;
4165	NDFREE(&nd, NDF_ONLY_PNBUF);
4166	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4167		error = EINVAL;
4168		goto out;
4169	}
4170#ifdef MAC
4171	error = mac_vnode_check_revoke(td->td_ucred, vp);
4172	if (error != 0)
4173		goto out;
4174#endif
4175	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4176	if (error != 0)
4177		goto out;
4178	if (td->td_ucred->cr_uid != vattr.va_uid) {
4179		error = priv_check(td, PRIV_VFS_ADMIN);
4180		if (error != 0)
4181			goto out;
4182	}
4183	if (vcount(vp) > 1)
4184		VOP_REVOKE(vp, REVOKEALL);
4185out:
4186	vput(vp);
4187	return (error);
4188}
4189
4190/*
4191 * Convert a user file descriptor to a kernel file entry and check that, if it
4192 * is a capability, the correct rights are present. A reference on the file
4193 * entry is held upon returning.
4194 */
4195int
4196getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
4197{
4198	struct file *fp;
4199	int error;
4200
4201	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
4202	if (error != 0)
4203		return (error);
4204
4205	/*
4206	 * The file could be not of the vnode type, or it may be not
4207	 * yet fully initialized, in which case the f_vnode pointer
4208	 * may be set, but f_ops is still badfileops.  E.g.,
4209	 * devfs_open() transiently create such situation to
4210	 * facilitate csw d_fdopen().
4211	 *
4212	 * Dupfdopen() handling in kern_openat() installs the
4213	 * half-baked file into the process descriptor table, allowing
4214	 * other thread to dereference it. Guard against the race by
4215	 * checking f_ops.
4216	 */
4217	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4218		fdrop(fp, curthread);
4219		return (EINVAL);
4220	}
4221	*fpp = fp;
4222	return (0);
4223}
4224
4225
4226/*
4227 * Get an (NFS) file handle.
4228 */
4229#ifndef _SYS_SYSPROTO_H_
4230struct lgetfh_args {
4231	char	*fname;
4232	fhandle_t *fhp;
4233};
4234#endif
4235int
4236sys_lgetfh(td, uap)
4237	struct thread *td;
4238	register struct lgetfh_args *uap;
4239{
4240	struct nameidata nd;
4241	fhandle_t fh;
4242	register struct vnode *vp;
4243	int error;
4244
4245	error = priv_check(td, PRIV_VFS_GETFH);
4246	if (error != 0)
4247		return (error);
4248	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4249	    uap->fname, td);
4250	error = namei(&nd);
4251	if (error != 0)
4252		return (error);
4253	NDFREE(&nd, NDF_ONLY_PNBUF);
4254	vp = nd.ni_vp;
4255	bzero(&fh, sizeof(fh));
4256	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4257	error = VOP_VPTOFH(vp, &fh.fh_fid);
4258	vput(vp);
4259	if (error == 0)
4260		error = copyout(&fh, uap->fhp, sizeof (fh));
4261	return (error);
4262}
4263
4264#ifndef _SYS_SYSPROTO_H_
4265struct getfh_args {
4266	char	*fname;
4267	fhandle_t *fhp;
4268};
4269#endif
4270int
4271sys_getfh(td, uap)
4272	struct thread *td;
4273	register struct getfh_args *uap;
4274{
4275	struct nameidata nd;
4276	fhandle_t fh;
4277	register struct vnode *vp;
4278	int error;
4279
4280	error = priv_check(td, PRIV_VFS_GETFH);
4281	if (error != 0)
4282		return (error);
4283	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4284	    uap->fname, td);
4285	error = namei(&nd);
4286	if (error != 0)
4287		return (error);
4288	NDFREE(&nd, NDF_ONLY_PNBUF);
4289	vp = nd.ni_vp;
4290	bzero(&fh, sizeof(fh));
4291	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4292	error = VOP_VPTOFH(vp, &fh.fh_fid);
4293	vput(vp);
4294	if (error == 0)
4295		error = copyout(&fh, uap->fhp, sizeof (fh));
4296	return (error);
4297}
4298
4299/*
4300 * syscall for the rpc.lockd to use to translate a NFS file handle into an
4301 * open descriptor.
4302 *
4303 * warning: do not remove the priv_check() call or this becomes one giant
4304 * security hole.
4305 */
4306#ifndef _SYS_SYSPROTO_H_
4307struct fhopen_args {
4308	const struct fhandle *u_fhp;
4309	int flags;
4310};
4311#endif
4312int
4313sys_fhopen(td, uap)
4314	struct thread *td;
4315	struct fhopen_args /* {
4316		const struct fhandle *u_fhp;
4317		int flags;
4318	} */ *uap;
4319{
4320	struct mount *mp;
4321	struct vnode *vp;
4322	struct fhandle fhp;
4323	struct file *fp;
4324	int fmode, error;
4325	int indx;
4326
4327	error = priv_check(td, PRIV_VFS_FHOPEN);
4328	if (error != 0)
4329		return (error);
4330	indx = -1;
4331	fmode = FFLAGS(uap->flags);
4332	/* why not allow a non-read/write open for our lockd? */
4333	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4334		return (EINVAL);
4335	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4336	if (error != 0)
4337		return(error);
4338	/* find the mount point */
4339	mp = vfs_busyfs(&fhp.fh_fsid);
4340	if (mp == NULL)
4341		return (ESTALE);
4342	/* now give me my vnode, it gets returned to me locked */
4343	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4344	vfs_unbusy(mp);
4345	if (error != 0)
4346		return (error);
4347
4348	error = falloc_noinstall(td, &fp);
4349	if (error != 0) {
4350		vput(vp);
4351		return (error);
4352	}
4353	/*
4354	 * An extra reference on `fp' has been held for us by
4355	 * falloc_noinstall().
4356	 */
4357
4358#ifdef INVARIANTS
4359	td->td_dupfd = -1;
4360#endif
4361	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4362	if (error != 0) {
4363		KASSERT(fp->f_ops == &badfileops,
4364		    ("VOP_OPEN in fhopen() set f_ops"));
4365		KASSERT(td->td_dupfd < 0,
4366		    ("fhopen() encountered fdopen()"));
4367
4368		vput(vp);
4369		goto bad;
4370	}
4371#ifdef INVARIANTS
4372	td->td_dupfd = 0;
4373#endif
4374	fp->f_vnode = vp;
4375	fp->f_seqcount = 1;
4376	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4377	    &vnops);
4378	VOP_UNLOCK(vp, 0);
4379	if ((fmode & O_TRUNC) != 0) {
4380		error = fo_truncate(fp, 0, td->td_ucred, td);
4381		if (error != 0)
4382			goto bad;
4383	}
4384
4385	error = finstall(td, fp, &indx, fmode, NULL);
4386bad:
4387	fdrop(fp, td);
4388	td->td_retval[0] = indx;
4389	return (error);
4390}
4391
4392/*
4393 * Stat an (NFS) file handle.
4394 */
4395#ifndef _SYS_SYSPROTO_H_
4396struct fhstat_args {
4397	struct fhandle *u_fhp;
4398	struct stat *sb;
4399};
4400#endif
4401int
4402sys_fhstat(td, uap)
4403	struct thread *td;
4404	register struct fhstat_args /* {
4405		struct fhandle *u_fhp;
4406		struct stat *sb;
4407	} */ *uap;
4408{
4409	struct stat sb;
4410	struct fhandle fh;
4411	int error;
4412
4413	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4414	if (error != 0)
4415		return (error);
4416	error = kern_fhstat(td, fh, &sb);
4417	if (error == 0)
4418		error = copyout(&sb, uap->sb, sizeof(sb));
4419	return (error);
4420}
4421
4422int
4423kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4424{
4425	struct mount *mp;
4426	struct vnode *vp;
4427	int error;
4428
4429	error = priv_check(td, PRIV_VFS_FHSTAT);
4430	if (error != 0)
4431		return (error);
4432	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4433		return (ESTALE);
4434	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4435	vfs_unbusy(mp);
4436	if (error != 0)
4437		return (error);
4438	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4439	vput(vp);
4440	return (error);
4441}
4442
4443/*
4444 * Implement fstatfs() for (NFS) file handles.
4445 */
4446#ifndef _SYS_SYSPROTO_H_
4447struct fhstatfs_args {
4448	struct fhandle *u_fhp;
4449	struct statfs *buf;
4450};
4451#endif
4452int
4453sys_fhstatfs(td, uap)
4454	struct thread *td;
4455	struct fhstatfs_args /* {
4456		struct fhandle *u_fhp;
4457		struct statfs *buf;
4458	} */ *uap;
4459{
4460	struct statfs sf;
4461	fhandle_t fh;
4462	int error;
4463
4464	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4465	if (error != 0)
4466		return (error);
4467	error = kern_fhstatfs(td, fh, &sf);
4468	if (error != 0)
4469		return (error);
4470	return (copyout(&sf, uap->buf, sizeof(sf)));
4471}
4472
4473int
4474kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4475{
4476	struct statfs *sp;
4477	struct mount *mp;
4478	struct vnode *vp;
4479	int error;
4480
4481	error = priv_check(td, PRIV_VFS_FHSTATFS);
4482	if (error != 0)
4483		return (error);
4484	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4485		return (ESTALE);
4486	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4487	if (error != 0) {
4488		vfs_unbusy(mp);
4489		return (error);
4490	}
4491	vput(vp);
4492	error = prison_canseemount(td->td_ucred, mp);
4493	if (error != 0)
4494		goto out;
4495#ifdef MAC
4496	error = mac_mount_check_stat(td->td_ucred, mp);
4497	if (error != 0)
4498		goto out;
4499#endif
4500	/*
4501	 * Set these in case the underlying filesystem fails to do so.
4502	 */
4503	sp = &mp->mnt_stat;
4504	sp->f_version = STATFS_VERSION;
4505	sp->f_namemax = NAME_MAX;
4506	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4507	error = VFS_STATFS(mp, sp);
4508	if (error == 0)
4509		*buf = *sp;
4510out:
4511	vfs_unbusy(mp);
4512	return (error);
4513}
4514
4515int
4516kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4517{
4518	struct file *fp;
4519	struct mount *mp;
4520	struct vnode *vp;
4521	cap_rights_t rights;
4522	off_t olen, ooffset;
4523	int error;
4524
4525	fp = NULL;
4526	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4527	if (error != 0)
4528		goto out;
4529
4530	switch (fp->f_type) {
4531	case DTYPE_VNODE:
4532		break;
4533	case DTYPE_PIPE:
4534	case DTYPE_FIFO:
4535		error = ESPIPE;
4536		goto out;
4537	default:
4538		error = ENODEV;
4539		goto out;
4540	}
4541	if ((fp->f_flag & FWRITE) == 0) {
4542		error = EBADF;
4543		goto out;
4544	}
4545	vp = fp->f_vnode;
4546	if (vp->v_type != VREG) {
4547		error = ENODEV;
4548		goto out;
4549	}
4550	if (offset < 0 || len <= 0) {
4551		error = EINVAL;
4552		goto out;
4553	}
4554	/* Check for wrap. */
4555	if (offset > OFF_MAX - len) {
4556		error = EFBIG;
4557		goto out;
4558	}
4559
4560	/* Allocating blocks may take a long time, so iterate. */
4561	for (;;) {
4562		olen = len;
4563		ooffset = offset;
4564
4565		bwillwrite();
4566		mp = NULL;
4567		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4568		if (error != 0)
4569			break;
4570		error = vn_lock(vp, LK_EXCLUSIVE);
4571		if (error != 0) {
4572			vn_finished_write(mp);
4573			break;
4574		}
4575#ifdef MAC
4576		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4577		if (error == 0)
4578#endif
4579			error = VOP_ALLOCATE(vp, &offset, &len);
4580		VOP_UNLOCK(vp, 0);
4581		vn_finished_write(mp);
4582
4583		if (olen + ooffset != offset + len) {
4584			panic("offset + len changed from %jx/%jx to %jx/%jx",
4585			    ooffset, olen, offset, len);
4586		}
4587		if (error != 0 || len == 0)
4588			break;
4589		KASSERT(olen > len, ("Iteration did not make progress?"));
4590		maybe_yield();
4591	}
4592 out:
4593	if (fp != NULL)
4594		fdrop(fp, td);
4595	return (error);
4596}
4597
4598int
4599sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4600{
4601
4602	td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
4603	    uap->len);
4604	return (0);
4605}
4606
4607/*
4608 * Unlike madvise(2), we do not make a best effort to remember every
4609 * possible caching hint.  Instead, we remember the last setting with
4610 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4611 * region of any current setting.
4612 */
4613int
4614kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4615    int advice)
4616{
4617	struct fadvise_info *fa, *new;
4618	struct file *fp;
4619	struct vnode *vp;
4620	cap_rights_t rights;
4621	off_t end;
4622	int error;
4623
4624	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4625		return (EINVAL);
4626	switch (advice) {
4627	case POSIX_FADV_SEQUENTIAL:
4628	case POSIX_FADV_RANDOM:
4629	case POSIX_FADV_NOREUSE:
4630		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4631		break;
4632	case POSIX_FADV_NORMAL:
4633	case POSIX_FADV_WILLNEED:
4634	case POSIX_FADV_DONTNEED:
4635		new = NULL;
4636		break;
4637	default:
4638		return (EINVAL);
4639	}
4640	/* XXX: CAP_POSIX_FADVISE? */
4641	error = fget(td, fd, cap_rights_init(&rights), &fp);
4642	if (error != 0)
4643		goto out;
4644
4645	switch (fp->f_type) {
4646	case DTYPE_VNODE:
4647		break;
4648	case DTYPE_PIPE:
4649	case DTYPE_FIFO:
4650		error = ESPIPE;
4651		goto out;
4652	default:
4653		error = ENODEV;
4654		goto out;
4655	}
4656	vp = fp->f_vnode;
4657	if (vp->v_type != VREG) {
4658		error = ENODEV;
4659		goto out;
4660	}
4661	if (len == 0)
4662		end = OFF_MAX;
4663	else
4664		end = offset + len - 1;
4665	switch (advice) {
4666	case POSIX_FADV_SEQUENTIAL:
4667	case POSIX_FADV_RANDOM:
4668	case POSIX_FADV_NOREUSE:
4669		/*
4670		 * Try to merge any existing non-standard region with
4671		 * this new region if possible, otherwise create a new
4672		 * non-standard region for this request.
4673		 */
4674		mtx_pool_lock(mtxpool_sleep, fp);
4675		fa = fp->f_advice;
4676		if (fa != NULL && fa->fa_advice == advice &&
4677		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4678		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4679		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4680			if (offset < fa->fa_start)
4681				fa->fa_start = offset;
4682			if (end > fa->fa_end)
4683				fa->fa_end = end;
4684		} else {
4685			new->fa_advice = advice;
4686			new->fa_start = offset;
4687			new->fa_end = end;
4688			new->fa_prevstart = 0;
4689			new->fa_prevend = 0;
4690			fp->f_advice = new;
4691			new = fa;
4692		}
4693		mtx_pool_unlock(mtxpool_sleep, fp);
4694		break;
4695	case POSIX_FADV_NORMAL:
4696		/*
4697		 * If a the "normal" region overlaps with an existing
4698		 * non-standard region, trim or remove the
4699		 * non-standard region.
4700		 */
4701		mtx_pool_lock(mtxpool_sleep, fp);
4702		fa = fp->f_advice;
4703		if (fa != NULL) {
4704			if (offset <= fa->fa_start && end >= fa->fa_end) {
4705				new = fa;
4706				fp->f_advice = NULL;
4707			} else if (offset <= fa->fa_start &&
4708			    end >= fa->fa_start)
4709				fa->fa_start = end + 1;
4710			else if (offset <= fa->fa_end && end >= fa->fa_end)
4711				fa->fa_end = offset - 1;
4712			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4713				/*
4714				 * If the "normal" region is a middle
4715				 * portion of the existing
4716				 * non-standard region, just remove
4717				 * the whole thing rather than picking
4718				 * one side or the other to
4719				 * preserve.
4720				 */
4721				new = fa;
4722				fp->f_advice = NULL;
4723			}
4724		}
4725		mtx_pool_unlock(mtxpool_sleep, fp);
4726		break;
4727	case POSIX_FADV_WILLNEED:
4728	case POSIX_FADV_DONTNEED:
4729		error = VOP_ADVISE(vp, offset, end, advice);
4730		break;
4731	}
4732out:
4733	if (fp != NULL)
4734		fdrop(fp, td);
4735	free(new, M_FADVISE);
4736	return (error);
4737}
4738
4739int
4740sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4741{
4742
4743	td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
4744	    uap->len, uap->advice);
4745	return (0);
4746}
4747