vfs_syscalls.c revision 261560
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_syscalls.c 261560 2014-02-06 19:47:17Z kib $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_kdtrace.h"
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/capability.h>
50#include <sys/disk.h>
51#include <sys/sysent.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/mutex.h>
55#include <sys/sysproto.h>
56#include <sys/namei.h>
57#include <sys/filedesc.h>
58#include <sys/kernel.h>
59#include <sys/fcntl.h>
60#include <sys/file.h>
61#include <sys/filio.h>
62#include <sys/limits.h>
63#include <sys/linker.h>
64#include <sys/rwlock.h>
65#include <sys/sdt.h>
66#include <sys/stat.h>
67#include <sys/sx.h>
68#include <sys/unistd.h>
69#include <sys/vnode.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/dirent.h>
73#include <sys/jail.h>
74#include <sys/syscallsubr.h>
75#include <sys/sysctl.h>
76#ifdef KTRACE
77#include <sys/ktrace.h>
78#endif
79
80#include <machine/stdarg.h>
81
82#include <security/audit/audit.h>
83#include <security/mac/mac_framework.h>
84
85#include <vm/vm.h>
86#include <vm/vm_object.h>
87#include <vm/vm_page.h>
88#include <vm/uma.h>
89
90#include <ufs/ufs/quota.h>
91
92MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
93
94SDT_PROVIDER_DEFINE(vfs);
95SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
96SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
97
98static int chroot_refuse_vdir_fds(struct filedesc *fdp);
99static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
100static int kern_chflags(struct thread *td, const char *path,
101    enum uio_seg pathseg, u_long flags);
102static int kern_chflagsat(struct thread *td, int fd, const char *path,
103    enum uio_seg pathseg, u_long flags, int atflag);
104static int setfflags(struct thread *td, struct vnode *, u_long);
105static int setutimes(struct thread *td, struct vnode *,
106    const struct timespec *, int, int);
107static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
108    struct thread *td);
109
110/*
111 * The module initialization routine for POSIX asynchronous I/O will
112 * set this to the version of AIO that it implements.  (Zero means
113 * that it is not implemented.)  This value is used here by pathconf()
114 * and in kern_descrip.c by fpathconf().
115 */
116int async_io_version;
117
118#ifdef DEBUG
119static int syncprt = 0;
120SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
121#endif
122
123/*
124 * Sync each mounted filesystem.
125 */
126#ifndef _SYS_SYSPROTO_H_
127struct sync_args {
128	int     dummy;
129};
130#endif
131/* ARGSUSED */
132int
133sys_sync(td, uap)
134	struct thread *td;
135	struct sync_args *uap;
136{
137	struct mount *mp, *nmp;
138	int save;
139
140	mtx_lock(&mountlist_mtx);
141	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
142		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
143			nmp = TAILQ_NEXT(mp, mnt_list);
144			continue;
145		}
146		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
147		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
148			save = curthread_pflags_set(TDP_SYNCIO);
149			vfs_msync(mp, MNT_NOWAIT);
150			VFS_SYNC(mp, MNT_NOWAIT);
151			curthread_pflags_restore(save);
152			vn_finished_write(mp);
153		}
154		mtx_lock(&mountlist_mtx);
155		nmp = TAILQ_NEXT(mp, mnt_list);
156		vfs_unbusy(mp);
157	}
158	mtx_unlock(&mountlist_mtx);
159	return (0);
160}
161
162/*
163 * Change filesystem quotas.
164 */
165#ifndef _SYS_SYSPROTO_H_
166struct quotactl_args {
167	char *path;
168	int cmd;
169	int uid;
170	caddr_t arg;
171};
172#endif
173int
174sys_quotactl(td, uap)
175	struct thread *td;
176	register struct quotactl_args /* {
177		char *path;
178		int cmd;
179		int uid;
180		caddr_t arg;
181	} */ *uap;
182{
183	struct mount *mp;
184	struct nameidata nd;
185	int error;
186
187	AUDIT_ARG_CMD(uap->cmd);
188	AUDIT_ARG_UID(uap->uid);
189	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
190		return (EPERM);
191	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
192	    uap->path, td);
193	if ((error = namei(&nd)) != 0)
194		return (error);
195	NDFREE(&nd, NDF_ONLY_PNBUF);
196	mp = nd.ni_vp->v_mount;
197	vfs_ref(mp);
198	vput(nd.ni_vp);
199	error = vfs_busy(mp, 0);
200	vfs_rel(mp);
201	if (error != 0)
202		return (error);
203	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
204
205	/*
206	 * Since quota on operation typically needs to open quota
207	 * file, the Q_QUOTAON handler needs to unbusy the mount point
208	 * before calling into namei.  Otherwise, unmount might be
209	 * started between two vfs_busy() invocations (first is our,
210	 * second is from mount point cross-walk code in lookup()),
211	 * causing deadlock.
212	 *
213	 * Require that Q_QUOTAON handles the vfs_busy() reference on
214	 * its own, always returning with ubusied mount point.
215	 */
216	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
217		vfs_unbusy(mp);
218	return (error);
219}
220
221/*
222 * Used by statfs conversion routines to scale the block size up if
223 * necessary so that all of the block counts are <= 'max_size'.  Note
224 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
225 * value of 'n'.
226 */
227void
228statfs_scale_blocks(struct statfs *sf, long max_size)
229{
230	uint64_t count;
231	int shift;
232
233	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
234
235	/*
236	 * Attempt to scale the block counts to give a more accurate
237	 * overview to userland of the ratio of free space to used
238	 * space.  To do this, find the largest block count and compute
239	 * a divisor that lets it fit into a signed integer <= max_size.
240	 */
241	if (sf->f_bavail < 0)
242		count = -sf->f_bavail;
243	else
244		count = sf->f_bavail;
245	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
246	if (count <= max_size)
247		return;
248
249	count >>= flsl(max_size);
250	shift = 0;
251	while (count > 0) {
252		shift++;
253		count >>=1;
254	}
255
256	sf->f_bsize <<= shift;
257	sf->f_blocks >>= shift;
258	sf->f_bfree >>= shift;
259	sf->f_bavail >>= shift;
260}
261
262/*
263 * Get filesystem statistics.
264 */
265#ifndef _SYS_SYSPROTO_H_
266struct statfs_args {
267	char *path;
268	struct statfs *buf;
269};
270#endif
271int
272sys_statfs(td, uap)
273	struct thread *td;
274	register struct statfs_args /* {
275		char *path;
276		struct statfs *buf;
277	} */ *uap;
278{
279	struct statfs sf;
280	int error;
281
282	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
283	if (error == 0)
284		error = copyout(&sf, uap->buf, sizeof(sf));
285	return (error);
286}
287
288int
289kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
290    struct statfs *buf)
291{
292	struct mount *mp;
293	struct statfs *sp, sb;
294	struct nameidata nd;
295	int error;
296
297	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
298	    pathseg, path, td);
299	error = namei(&nd);
300	if (error != 0)
301		return (error);
302	mp = nd.ni_vp->v_mount;
303	vfs_ref(mp);
304	NDFREE(&nd, NDF_ONLY_PNBUF);
305	vput(nd.ni_vp);
306	error = vfs_busy(mp, 0);
307	vfs_rel(mp);
308	if (error != 0)
309		return (error);
310#ifdef MAC
311	error = mac_mount_check_stat(td->td_ucred, mp);
312	if (error != 0)
313		goto out;
314#endif
315	/*
316	 * Set these in case the underlying filesystem fails to do so.
317	 */
318	sp = &mp->mnt_stat;
319	sp->f_version = STATFS_VERSION;
320	sp->f_namemax = NAME_MAX;
321	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
322	error = VFS_STATFS(mp, sp);
323	if (error != 0)
324		goto out;
325	if (priv_check(td, PRIV_VFS_GENERATION)) {
326		bcopy(sp, &sb, sizeof(sb));
327		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
328		prison_enforce_statfs(td->td_ucred, mp, &sb);
329		sp = &sb;
330	}
331	*buf = *sp;
332out:
333	vfs_unbusy(mp);
334	return (error);
335}
336
337/*
338 * Get filesystem statistics.
339 */
340#ifndef _SYS_SYSPROTO_H_
341struct fstatfs_args {
342	int fd;
343	struct statfs *buf;
344};
345#endif
346int
347sys_fstatfs(td, uap)
348	struct thread *td;
349	register struct fstatfs_args /* {
350		int fd;
351		struct statfs *buf;
352	} */ *uap;
353{
354	struct statfs sf;
355	int error;
356
357	error = kern_fstatfs(td, uap->fd, &sf);
358	if (error == 0)
359		error = copyout(&sf, uap->buf, sizeof(sf));
360	return (error);
361}
362
363int
364kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
365{
366	struct file *fp;
367	struct mount *mp;
368	struct statfs *sp, sb;
369	struct vnode *vp;
370	cap_rights_t rights;
371	int error;
372
373	AUDIT_ARG_FD(fd);
374	error = getvnode(td->td_proc->p_fd, fd,
375	    cap_rights_init(&rights, CAP_FSTATFS), &fp);
376	if (error != 0)
377		return (error);
378	vp = fp->f_vnode;
379	vn_lock(vp, LK_SHARED | LK_RETRY);
380#ifdef AUDIT
381	AUDIT_ARG_VNODE1(vp);
382#endif
383	mp = vp->v_mount;
384	if (mp)
385		vfs_ref(mp);
386	VOP_UNLOCK(vp, 0);
387	fdrop(fp, td);
388	if (mp == NULL) {
389		error = EBADF;
390		goto out;
391	}
392	error = vfs_busy(mp, 0);
393	vfs_rel(mp);
394	if (error != 0)
395		return (error);
396#ifdef MAC
397	error = mac_mount_check_stat(td->td_ucred, mp);
398	if (error != 0)
399		goto out;
400#endif
401	/*
402	 * Set these in case the underlying filesystem fails to do so.
403	 */
404	sp = &mp->mnt_stat;
405	sp->f_version = STATFS_VERSION;
406	sp->f_namemax = NAME_MAX;
407	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
408	error = VFS_STATFS(mp, sp);
409	if (error != 0)
410		goto out;
411	if (priv_check(td, PRIV_VFS_GENERATION)) {
412		bcopy(sp, &sb, sizeof(sb));
413		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
414		prison_enforce_statfs(td->td_ucred, mp, &sb);
415		sp = &sb;
416	}
417	*buf = *sp;
418out:
419	if (mp)
420		vfs_unbusy(mp);
421	return (error);
422}
423
424/*
425 * Get statistics on all filesystems.
426 */
427#ifndef _SYS_SYSPROTO_H_
428struct getfsstat_args {
429	struct statfs *buf;
430	long bufsize;
431	int flags;
432};
433#endif
434int
435sys_getfsstat(td, uap)
436	struct thread *td;
437	register struct getfsstat_args /* {
438		struct statfs *buf;
439		long bufsize;
440		int flags;
441	} */ *uap;
442{
443
444	return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
445	    uap->flags));
446}
447
448/*
449 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
450 *	The caller is responsible for freeing memory which will be allocated
451 *	in '*buf'.
452 */
453int
454kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
455    enum uio_seg bufseg, int flags)
456{
457	struct mount *mp, *nmp;
458	struct statfs *sfsp, *sp, sb;
459	size_t count, maxcount;
460	int error;
461
462	maxcount = bufsize / sizeof(struct statfs);
463	if (bufsize == 0)
464		sfsp = NULL;
465	else if (bufseg == UIO_USERSPACE)
466		sfsp = *buf;
467	else /* if (bufseg == UIO_SYSSPACE) */ {
468		count = 0;
469		mtx_lock(&mountlist_mtx);
470		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
471			count++;
472		}
473		mtx_unlock(&mountlist_mtx);
474		if (maxcount > count)
475			maxcount = count;
476		sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
477		    M_WAITOK);
478	}
479	count = 0;
480	mtx_lock(&mountlist_mtx);
481	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
482		if (prison_canseemount(td->td_ucred, mp) != 0) {
483			nmp = TAILQ_NEXT(mp, mnt_list);
484			continue;
485		}
486#ifdef MAC
487		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
488			nmp = TAILQ_NEXT(mp, mnt_list);
489			continue;
490		}
491#endif
492		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
493			nmp = TAILQ_NEXT(mp, mnt_list);
494			continue;
495		}
496		if (sfsp && count < maxcount) {
497			sp = &mp->mnt_stat;
498			/*
499			 * Set these in case the underlying filesystem
500			 * fails to do so.
501			 */
502			sp->f_version = STATFS_VERSION;
503			sp->f_namemax = NAME_MAX;
504			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
505			/*
506			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
507			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
508			 * overrides MNT_WAIT.
509			 */
510			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
511			    (flags & MNT_WAIT)) &&
512			    (error = VFS_STATFS(mp, sp))) {
513				mtx_lock(&mountlist_mtx);
514				nmp = TAILQ_NEXT(mp, mnt_list);
515				vfs_unbusy(mp);
516				continue;
517			}
518			if (priv_check(td, PRIV_VFS_GENERATION)) {
519				bcopy(sp, &sb, sizeof(sb));
520				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
521				prison_enforce_statfs(td->td_ucred, mp, &sb);
522				sp = &sb;
523			}
524			if (bufseg == UIO_SYSSPACE)
525				bcopy(sp, sfsp, sizeof(*sp));
526			else /* if (bufseg == UIO_USERSPACE) */ {
527				error = copyout(sp, sfsp, sizeof(*sp));
528				if (error != 0) {
529					vfs_unbusy(mp);
530					return (error);
531				}
532			}
533			sfsp++;
534		}
535		count++;
536		mtx_lock(&mountlist_mtx);
537		nmp = TAILQ_NEXT(mp, mnt_list);
538		vfs_unbusy(mp);
539	}
540	mtx_unlock(&mountlist_mtx);
541	if (sfsp && count > maxcount)
542		td->td_retval[0] = maxcount;
543	else
544		td->td_retval[0] = count;
545	return (0);
546}
547
548#ifdef COMPAT_FREEBSD4
549/*
550 * Get old format filesystem statistics.
551 */
552static void cvtstatfs(struct statfs *, struct ostatfs *);
553
554#ifndef _SYS_SYSPROTO_H_
555struct freebsd4_statfs_args {
556	char *path;
557	struct ostatfs *buf;
558};
559#endif
560int
561freebsd4_statfs(td, uap)
562	struct thread *td;
563	struct freebsd4_statfs_args /* {
564		char *path;
565		struct ostatfs *buf;
566	} */ *uap;
567{
568	struct ostatfs osb;
569	struct statfs sf;
570	int error;
571
572	error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
573	if (error != 0)
574		return (error);
575	cvtstatfs(&sf, &osb);
576	return (copyout(&osb, uap->buf, sizeof(osb)));
577}
578
579/*
580 * Get filesystem statistics.
581 */
582#ifndef _SYS_SYSPROTO_H_
583struct freebsd4_fstatfs_args {
584	int fd;
585	struct ostatfs *buf;
586};
587#endif
588int
589freebsd4_fstatfs(td, uap)
590	struct thread *td;
591	struct freebsd4_fstatfs_args /* {
592		int fd;
593		struct ostatfs *buf;
594	} */ *uap;
595{
596	struct ostatfs osb;
597	struct statfs sf;
598	int error;
599
600	error = kern_fstatfs(td, uap->fd, &sf);
601	if (error != 0)
602		return (error);
603	cvtstatfs(&sf, &osb);
604	return (copyout(&osb, uap->buf, sizeof(osb)));
605}
606
607/*
608 * Get statistics on all filesystems.
609 */
610#ifndef _SYS_SYSPROTO_H_
611struct freebsd4_getfsstat_args {
612	struct ostatfs *buf;
613	long bufsize;
614	int flags;
615};
616#endif
617int
618freebsd4_getfsstat(td, uap)
619	struct thread *td;
620	register struct freebsd4_getfsstat_args /* {
621		struct ostatfs *buf;
622		long bufsize;
623		int flags;
624	} */ *uap;
625{
626	struct statfs *buf, *sp;
627	struct ostatfs osb;
628	size_t count, size;
629	int error;
630
631	count = uap->bufsize / sizeof(struct ostatfs);
632	size = count * sizeof(struct statfs);
633	error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
634	if (size > 0) {
635		count = td->td_retval[0];
636		sp = buf;
637		while (count > 0 && error == 0) {
638			cvtstatfs(sp, &osb);
639			error = copyout(&osb, uap->buf, sizeof(osb));
640			sp++;
641			uap->buf++;
642			count--;
643		}
644		free(buf, M_TEMP);
645	}
646	return (error);
647}
648
649/*
650 * Implement fstatfs() for (NFS) file handles.
651 */
652#ifndef _SYS_SYSPROTO_H_
653struct freebsd4_fhstatfs_args {
654	struct fhandle *u_fhp;
655	struct ostatfs *buf;
656};
657#endif
658int
659freebsd4_fhstatfs(td, uap)
660	struct thread *td;
661	struct freebsd4_fhstatfs_args /* {
662		struct fhandle *u_fhp;
663		struct ostatfs *buf;
664	} */ *uap;
665{
666	struct ostatfs osb;
667	struct statfs sf;
668	fhandle_t fh;
669	int error;
670
671	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
672	if (error != 0)
673		return (error);
674	error = kern_fhstatfs(td, fh, &sf);
675	if (error != 0)
676		return (error);
677	cvtstatfs(&sf, &osb);
678	return (copyout(&osb, uap->buf, sizeof(osb)));
679}
680
681/*
682 * Convert a new format statfs structure to an old format statfs structure.
683 */
684static void
685cvtstatfs(nsp, osp)
686	struct statfs *nsp;
687	struct ostatfs *osp;
688{
689
690	statfs_scale_blocks(nsp, LONG_MAX);
691	bzero(osp, sizeof(*osp));
692	osp->f_bsize = nsp->f_bsize;
693	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
694	osp->f_blocks = nsp->f_blocks;
695	osp->f_bfree = nsp->f_bfree;
696	osp->f_bavail = nsp->f_bavail;
697	osp->f_files = MIN(nsp->f_files, LONG_MAX);
698	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
699	osp->f_owner = nsp->f_owner;
700	osp->f_type = nsp->f_type;
701	osp->f_flags = nsp->f_flags;
702	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
703	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
704	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
705	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
706	strlcpy(osp->f_fstypename, nsp->f_fstypename,
707	    MIN(MFSNAMELEN, OMFSNAMELEN));
708	strlcpy(osp->f_mntonname, nsp->f_mntonname,
709	    MIN(MNAMELEN, OMNAMELEN));
710	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
711	    MIN(MNAMELEN, OMNAMELEN));
712	osp->f_fsid = nsp->f_fsid;
713}
714#endif /* COMPAT_FREEBSD4 */
715
716/*
717 * Change current working directory to a given file descriptor.
718 */
719#ifndef _SYS_SYSPROTO_H_
720struct fchdir_args {
721	int	fd;
722};
723#endif
724int
725sys_fchdir(td, uap)
726	struct thread *td;
727	struct fchdir_args /* {
728		int fd;
729	} */ *uap;
730{
731	register struct filedesc *fdp = td->td_proc->p_fd;
732	struct vnode *vp, *tdp, *vpold;
733	struct mount *mp;
734	struct file *fp;
735	cap_rights_t rights;
736	int error;
737
738	AUDIT_ARG_FD(uap->fd);
739	error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
740	    &fp);
741	if (error != 0)
742		return (error);
743	vp = fp->f_vnode;
744	VREF(vp);
745	fdrop(fp, td);
746	vn_lock(vp, LK_SHARED | LK_RETRY);
747	AUDIT_ARG_VNODE1(vp);
748	error = change_dir(vp, td);
749	while (!error && (mp = vp->v_mountedhere) != NULL) {
750		if (vfs_busy(mp, 0))
751			continue;
752		error = VFS_ROOT(mp, LK_SHARED, &tdp);
753		vfs_unbusy(mp);
754		if (error != 0)
755			break;
756		vput(vp);
757		vp = tdp;
758	}
759	if (error != 0) {
760		vput(vp);
761		return (error);
762	}
763	VOP_UNLOCK(vp, 0);
764	FILEDESC_XLOCK(fdp);
765	vpold = fdp->fd_cdir;
766	fdp->fd_cdir = vp;
767	FILEDESC_XUNLOCK(fdp);
768	vrele(vpold);
769	return (0);
770}
771
772/*
773 * Change current working directory (``.'').
774 */
775#ifndef _SYS_SYSPROTO_H_
776struct chdir_args {
777	char	*path;
778};
779#endif
780int
781sys_chdir(td, uap)
782	struct thread *td;
783	struct chdir_args /* {
784		char *path;
785	} */ *uap;
786{
787
788	return (kern_chdir(td, uap->path, UIO_USERSPACE));
789}
790
791int
792kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
793{
794	register struct filedesc *fdp = td->td_proc->p_fd;
795	struct nameidata nd;
796	struct vnode *vp;
797	int error;
798
799	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
800	    pathseg, path, td);
801	if ((error = namei(&nd)) != 0)
802		return (error);
803	if ((error = change_dir(nd.ni_vp, td)) != 0) {
804		vput(nd.ni_vp);
805		NDFREE(&nd, NDF_ONLY_PNBUF);
806		return (error);
807	}
808	VOP_UNLOCK(nd.ni_vp, 0);
809	NDFREE(&nd, NDF_ONLY_PNBUF);
810	FILEDESC_XLOCK(fdp);
811	vp = fdp->fd_cdir;
812	fdp->fd_cdir = nd.ni_vp;
813	FILEDESC_XUNLOCK(fdp);
814	vrele(vp);
815	return (0);
816}
817
818/*
819 * Helper function for raised chroot(2) security function:  Refuse if
820 * any filedescriptors are open directories.
821 */
822static int
823chroot_refuse_vdir_fds(fdp)
824	struct filedesc *fdp;
825{
826	struct vnode *vp;
827	struct file *fp;
828	int fd;
829
830	FILEDESC_LOCK_ASSERT(fdp);
831
832	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
833		fp = fget_locked(fdp, fd);
834		if (fp == NULL)
835			continue;
836		if (fp->f_type == DTYPE_VNODE) {
837			vp = fp->f_vnode;
838			if (vp->v_type == VDIR)
839				return (EPERM);
840		}
841	}
842	return (0);
843}
844
845/*
846 * This sysctl determines if we will allow a process to chroot(2) if it
847 * has a directory open:
848 *	0: disallowed for all processes.
849 *	1: allowed for processes that were not already chroot(2)'ed.
850 *	2: allowed for all processes.
851 */
852
853static int chroot_allow_open_directories = 1;
854
855SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
856     &chroot_allow_open_directories, 0,
857     "Allow a process to chroot(2) if it has a directory open");
858
859/*
860 * Change notion of root (``/'') directory.
861 */
862#ifndef _SYS_SYSPROTO_H_
863struct chroot_args {
864	char	*path;
865};
866#endif
867int
868sys_chroot(td, uap)
869	struct thread *td;
870	struct chroot_args /* {
871		char *path;
872	} */ *uap;
873{
874	struct nameidata nd;
875	int error;
876
877	error = priv_check(td, PRIV_VFS_CHROOT);
878	if (error != 0)
879		return (error);
880	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
881	    UIO_USERSPACE, uap->path, td);
882	error = namei(&nd);
883	if (error != 0)
884		goto error;
885	error = change_dir(nd.ni_vp, td);
886	if (error != 0)
887		goto e_vunlock;
888#ifdef MAC
889	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
890	if (error != 0)
891		goto e_vunlock;
892#endif
893	VOP_UNLOCK(nd.ni_vp, 0);
894	error = change_root(nd.ni_vp, td);
895	vrele(nd.ni_vp);
896	NDFREE(&nd, NDF_ONLY_PNBUF);
897	return (error);
898e_vunlock:
899	vput(nd.ni_vp);
900error:
901	NDFREE(&nd, NDF_ONLY_PNBUF);
902	return (error);
903}
904
905/*
906 * Common routine for chroot and chdir.  Callers must provide a locked vnode
907 * instance.
908 */
909int
910change_dir(vp, td)
911	struct vnode *vp;
912	struct thread *td;
913{
914#ifdef MAC
915	int error;
916#endif
917
918	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
919	if (vp->v_type != VDIR)
920		return (ENOTDIR);
921#ifdef MAC
922	error = mac_vnode_check_chdir(td->td_ucred, vp);
923	if (error != 0)
924		return (error);
925#endif
926	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
927}
928
929/*
930 * Common routine for kern_chroot() and jail_attach().  The caller is
931 * responsible for invoking priv_check() and mac_vnode_check_chroot() to
932 * authorize this operation.
933 */
934int
935change_root(vp, td)
936	struct vnode *vp;
937	struct thread *td;
938{
939	struct filedesc *fdp;
940	struct vnode *oldvp;
941	int error;
942
943	fdp = td->td_proc->p_fd;
944	FILEDESC_XLOCK(fdp);
945	if (chroot_allow_open_directories == 0 ||
946	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
947		error = chroot_refuse_vdir_fds(fdp);
948		if (error != 0) {
949			FILEDESC_XUNLOCK(fdp);
950			return (error);
951		}
952	}
953	oldvp = fdp->fd_rdir;
954	fdp->fd_rdir = vp;
955	VREF(fdp->fd_rdir);
956	if (!fdp->fd_jdir) {
957		fdp->fd_jdir = vp;
958		VREF(fdp->fd_jdir);
959	}
960	FILEDESC_XUNLOCK(fdp);
961	vrele(oldvp);
962	return (0);
963}
964
965static __inline void
966flags_to_rights(int flags, cap_rights_t *rightsp)
967{
968
969	if (flags & O_EXEC) {
970		cap_rights_set(rightsp, CAP_FEXECVE);
971	} else {
972		switch ((flags & O_ACCMODE)) {
973		case O_RDONLY:
974			cap_rights_set(rightsp, CAP_READ);
975			break;
976		case O_RDWR:
977			cap_rights_set(rightsp, CAP_READ);
978			/* FALLTHROUGH */
979		case O_WRONLY:
980			cap_rights_set(rightsp, CAP_WRITE);
981			if (!(flags & (O_APPEND | O_TRUNC)))
982				cap_rights_set(rightsp, CAP_SEEK);
983			break;
984		}
985	}
986
987	if (flags & O_CREAT)
988		cap_rights_set(rightsp, CAP_CREATE);
989
990	if (flags & O_TRUNC)
991		cap_rights_set(rightsp, CAP_FTRUNCATE);
992
993	if (flags & (O_SYNC | O_FSYNC))
994		cap_rights_set(rightsp, CAP_FSYNC);
995
996	if (flags & (O_EXLOCK | O_SHLOCK))
997		cap_rights_set(rightsp, CAP_FLOCK);
998}
999
1000/*
1001 * Check permissions, allocate an open file structure, and call the device
1002 * open routine if any.
1003 */
1004#ifndef _SYS_SYSPROTO_H_
1005struct open_args {
1006	char	*path;
1007	int	flags;
1008	int	mode;
1009};
1010#endif
1011int
1012sys_open(td, uap)
1013	struct thread *td;
1014	register struct open_args /* {
1015		char *path;
1016		int flags;
1017		int mode;
1018	} */ *uap;
1019{
1020
1021	return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
1022}
1023
1024#ifndef _SYS_SYSPROTO_H_
1025struct openat_args {
1026	int	fd;
1027	char	*path;
1028	int	flag;
1029	int	mode;
1030};
1031#endif
1032int
1033sys_openat(struct thread *td, struct openat_args *uap)
1034{
1035
1036	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1037	    uap->mode));
1038}
1039
1040int
1041kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
1042    int mode)
1043{
1044
1045	return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
1046}
1047
1048int
1049kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1050    int flags, int mode)
1051{
1052	struct proc *p = td->td_proc;
1053	struct filedesc *fdp = p->p_fd;
1054	struct file *fp;
1055	struct vnode *vp;
1056	struct nameidata nd;
1057	cap_rights_t rights;
1058	int cmode, error, indx;
1059
1060	indx = -1;
1061
1062	AUDIT_ARG_FFLAGS(flags);
1063	AUDIT_ARG_MODE(mode);
1064	/* XXX: audit dirfd */
1065	cap_rights_init(&rights, CAP_LOOKUP);
1066	flags_to_rights(flags, &rights);
1067	/*
1068	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
1069	 * may be specified.
1070	 */
1071	if (flags & O_EXEC) {
1072		if (flags & O_ACCMODE)
1073			return (EINVAL);
1074	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
1075		return (EINVAL);
1076	} else {
1077		flags = FFLAGS(flags);
1078	}
1079
1080	/*
1081	 * Allocate the file descriptor, but don't install a descriptor yet.
1082	 */
1083	error = falloc_noinstall(td, &fp);
1084	if (error != 0)
1085		return (error);
1086	/*
1087	 * An extra reference on `fp' has been held for us by
1088	 * falloc_noinstall().
1089	 */
1090	/* Set the flags early so the finit in devfs can pick them up. */
1091	fp->f_flag = flags & FMASK;
1092	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1093	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
1094	    &rights, td);
1095	td->td_dupfd = -1;		/* XXX check for fdopen */
1096	error = vn_open(&nd, &flags, cmode, fp);
1097	if (error != 0) {
1098		/*
1099		 * If the vn_open replaced the method vector, something
1100		 * wonderous happened deep below and we just pass it up
1101		 * pretending we know what we do.
1102		 */
1103		if (error == ENXIO && fp->f_ops != &badfileops)
1104			goto success;
1105
1106		/*
1107		 * Handle special fdopen() case. bleh.
1108		 *
1109		 * Don't do this for relative (capability) lookups; we don't
1110		 * understand exactly what would happen, and we don't think
1111		 * that it ever should.
1112		 */
1113		if (nd.ni_strictrelative == 0 &&
1114		    (error == ENODEV || error == ENXIO) &&
1115		    td->td_dupfd >= 0) {
1116			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
1117			    &indx);
1118			if (error == 0)
1119				goto success;
1120		}
1121
1122		goto bad;
1123	}
1124	td->td_dupfd = 0;
1125	NDFREE(&nd, NDF_ONLY_PNBUF);
1126	vp = nd.ni_vp;
1127
1128	/*
1129	 * Store the vnode, for any f_type. Typically, the vnode use
1130	 * count is decremented by direct call to vn_closefile() for
1131	 * files that switched type in the cdevsw fdopen() method.
1132	 */
1133	fp->f_vnode = vp;
1134	/*
1135	 * If the file wasn't claimed by devfs bind it to the normal
1136	 * vnode operations here.
1137	 */
1138	if (fp->f_ops == &badfileops) {
1139		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1140		fp->f_seqcount = 1;
1141		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
1142		    DTYPE_VNODE, vp, &vnops);
1143	}
1144
1145	VOP_UNLOCK(vp, 0);
1146	if (flags & O_TRUNC) {
1147		error = fo_truncate(fp, 0, td->td_ucred, td);
1148		if (error != 0)
1149			goto bad;
1150	}
1151success:
1152	/*
1153	 * If we haven't already installed the FD (for dupfdopen), do so now.
1154	 */
1155	if (indx == -1) {
1156		struct filecaps *fcaps;
1157
1158#ifdef CAPABILITIES
1159		if (nd.ni_strictrelative == 1)
1160			fcaps = &nd.ni_filecaps;
1161		else
1162#endif
1163			fcaps = NULL;
1164		error = finstall(td, fp, &indx, flags, fcaps);
1165		/* On success finstall() consumes fcaps. */
1166		if (error != 0) {
1167			filecaps_free(&nd.ni_filecaps);
1168			goto bad;
1169		}
1170	} else {
1171		filecaps_free(&nd.ni_filecaps);
1172	}
1173
1174	/*
1175	 * Release our private reference, leaving the one associated with
1176	 * the descriptor table intact.
1177	 */
1178	fdrop(fp, td);
1179	td->td_retval[0] = indx;
1180	return (0);
1181bad:
1182	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1183	fdrop(fp, td);
1184	return (error);
1185}
1186
1187#ifdef COMPAT_43
1188/*
1189 * Create a file.
1190 */
1191#ifndef _SYS_SYSPROTO_H_
1192struct ocreat_args {
1193	char	*path;
1194	int	mode;
1195};
1196#endif
1197int
1198ocreat(td, uap)
1199	struct thread *td;
1200	register struct ocreat_args /* {
1201		char *path;
1202		int mode;
1203	} */ *uap;
1204{
1205
1206	return (kern_open(td, uap->path, UIO_USERSPACE,
1207	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
1208}
1209#endif /* COMPAT_43 */
1210
1211/*
1212 * Create a special file.
1213 */
1214#ifndef _SYS_SYSPROTO_H_
1215struct mknod_args {
1216	char	*path;
1217	int	mode;
1218	int	dev;
1219};
1220#endif
1221int
1222sys_mknod(td, uap)
1223	struct thread *td;
1224	register struct mknod_args /* {
1225		char *path;
1226		int mode;
1227		int dev;
1228	} */ *uap;
1229{
1230
1231	return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
1232}
1233
1234#ifndef _SYS_SYSPROTO_H_
1235struct mknodat_args {
1236	int	fd;
1237	char	*path;
1238	mode_t	mode;
1239	dev_t	dev;
1240};
1241#endif
1242int
1243sys_mknodat(struct thread *td, struct mknodat_args *uap)
1244{
1245
1246	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1247	    uap->dev));
1248}
1249
1250int
1251kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
1252    int dev)
1253{
1254
1255	return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
1256}
1257
1258int
1259kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1260    int mode, int dev)
1261{
1262	struct vnode *vp;
1263	struct mount *mp;
1264	struct vattr vattr;
1265	struct nameidata nd;
1266	cap_rights_t rights;
1267	int error, whiteout = 0;
1268
1269	AUDIT_ARG_MODE(mode);
1270	AUDIT_ARG_DEV(dev);
1271	switch (mode & S_IFMT) {
1272	case S_IFCHR:
1273	case S_IFBLK:
1274		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1275		break;
1276	case S_IFMT:
1277		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
1278		break;
1279	case S_IFWHT:
1280		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1281		break;
1282	case S_IFIFO:
1283		if (dev == 0)
1284			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1285		/* FALLTHROUGH */
1286	default:
1287		error = EINVAL;
1288		break;
1289	}
1290	if (error != 0)
1291		return (error);
1292restart:
1293	bwillwrite();
1294	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
1295	    pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT), td);
1296	if ((error = namei(&nd)) != 0)
1297		return (error);
1298	vp = nd.ni_vp;
1299	if (vp != NULL) {
1300		NDFREE(&nd, NDF_ONLY_PNBUF);
1301		if (vp == nd.ni_dvp)
1302			vrele(nd.ni_dvp);
1303		else
1304			vput(nd.ni_dvp);
1305		vrele(vp);
1306		return (EEXIST);
1307	} else {
1308		VATTR_NULL(&vattr);
1309		vattr.va_mode = (mode & ALLPERMS) &
1310		    ~td->td_proc->p_fd->fd_cmask;
1311		vattr.va_rdev = dev;
1312		whiteout = 0;
1313
1314		switch (mode & S_IFMT) {
1315		case S_IFMT:	/* used by badsect to flag bad sectors */
1316			vattr.va_type = VBAD;
1317			break;
1318		case S_IFCHR:
1319			vattr.va_type = VCHR;
1320			break;
1321		case S_IFBLK:
1322			vattr.va_type = VBLK;
1323			break;
1324		case S_IFWHT:
1325			whiteout = 1;
1326			break;
1327		default:
1328			panic("kern_mknod: invalid mode");
1329		}
1330	}
1331	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1332		NDFREE(&nd, NDF_ONLY_PNBUF);
1333		vput(nd.ni_dvp);
1334		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1335			return (error);
1336		goto restart;
1337	}
1338#ifdef MAC
1339	if (error == 0 && !whiteout)
1340		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1341		    &nd.ni_cnd, &vattr);
1342#endif
1343	if (error == 0) {
1344		if (whiteout)
1345			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1346		else {
1347			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1348						&nd.ni_cnd, &vattr);
1349			if (error == 0)
1350				vput(nd.ni_vp);
1351		}
1352	}
1353	NDFREE(&nd, NDF_ONLY_PNBUF);
1354	vput(nd.ni_dvp);
1355	vn_finished_write(mp);
1356	return (error);
1357}
1358
1359/*
1360 * Create a named pipe.
1361 */
1362#ifndef _SYS_SYSPROTO_H_
1363struct mkfifo_args {
1364	char	*path;
1365	int	mode;
1366};
1367#endif
1368int
1369sys_mkfifo(td, uap)
1370	struct thread *td;
1371	register struct mkfifo_args /* {
1372		char *path;
1373		int mode;
1374	} */ *uap;
1375{
1376
1377	return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
1378}
1379
1380#ifndef _SYS_SYSPROTO_H_
1381struct mkfifoat_args {
1382	int	fd;
1383	char	*path;
1384	mode_t	mode;
1385};
1386#endif
1387int
1388sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1389{
1390
1391	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1392	    uap->mode));
1393}
1394
1395int
1396kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
1397{
1398
1399	return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
1400}
1401
1402int
1403kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1404    int mode)
1405{
1406	struct mount *mp;
1407	struct vattr vattr;
1408	struct nameidata nd;
1409	cap_rights_t rights;
1410	int error;
1411
1412	AUDIT_ARG_MODE(mode);
1413restart:
1414	bwillwrite();
1415	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
1416	    pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT), td);
1417	if ((error = namei(&nd)) != 0)
1418		return (error);
1419	if (nd.ni_vp != NULL) {
1420		NDFREE(&nd, NDF_ONLY_PNBUF);
1421		if (nd.ni_vp == nd.ni_dvp)
1422			vrele(nd.ni_dvp);
1423		else
1424			vput(nd.ni_dvp);
1425		vrele(nd.ni_vp);
1426		return (EEXIST);
1427	}
1428	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1429		NDFREE(&nd, NDF_ONLY_PNBUF);
1430		vput(nd.ni_dvp);
1431		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1432			return (error);
1433		goto restart;
1434	}
1435	VATTR_NULL(&vattr);
1436	vattr.va_type = VFIFO;
1437	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1438#ifdef MAC
1439	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1440	    &vattr);
1441	if (error != 0)
1442		goto out;
1443#endif
1444	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1445	if (error == 0)
1446		vput(nd.ni_vp);
1447#ifdef MAC
1448out:
1449#endif
1450	vput(nd.ni_dvp);
1451	vn_finished_write(mp);
1452	NDFREE(&nd, NDF_ONLY_PNBUF);
1453	return (error);
1454}
1455
1456/*
1457 * Make a hard file link.
1458 */
1459#ifndef _SYS_SYSPROTO_H_
1460struct link_args {
1461	char	*path;
1462	char	*link;
1463};
1464#endif
1465int
1466sys_link(td, uap)
1467	struct thread *td;
1468	register struct link_args /* {
1469		char *path;
1470		char *link;
1471	} */ *uap;
1472{
1473
1474	return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
1475}
1476
1477#ifndef _SYS_SYSPROTO_H_
1478struct linkat_args {
1479	int	fd1;
1480	char	*path1;
1481	int	fd2;
1482	char	*path2;
1483	int	flag;
1484};
1485#endif
1486int
1487sys_linkat(struct thread *td, struct linkat_args *uap)
1488{
1489	int flag;
1490
1491	flag = uap->flag;
1492	if (flag & ~AT_SYMLINK_FOLLOW)
1493		return (EINVAL);
1494
1495	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1496	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1497}
1498
1499int hardlink_check_uid = 0;
1500SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1501    &hardlink_check_uid, 0,
1502    "Unprivileged processes cannot create hard links to files owned by other "
1503    "users");
1504static int hardlink_check_gid = 0;
1505SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1506    &hardlink_check_gid, 0,
1507    "Unprivileged processes cannot create hard links to files owned by other "
1508    "groups");
1509
1510static int
1511can_hardlink(struct vnode *vp, struct ucred *cred)
1512{
1513	struct vattr va;
1514	int error;
1515
1516	if (!hardlink_check_uid && !hardlink_check_gid)
1517		return (0);
1518
1519	error = VOP_GETATTR(vp, &va, cred);
1520	if (error != 0)
1521		return (error);
1522
1523	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1524		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1525		if (error != 0)
1526			return (error);
1527	}
1528
1529	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1530		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1531		if (error != 0)
1532			return (error);
1533	}
1534
1535	return (0);
1536}
1537
1538int
1539kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
1540{
1541
1542	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path,link, segflg, FOLLOW));
1543}
1544
1545int
1546kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1547    enum uio_seg segflg, int follow)
1548{
1549	struct vnode *vp;
1550	struct mount *mp;
1551	struct nameidata nd;
1552	cap_rights_t rights;
1553	int error;
1554
1555	bwillwrite();
1556	NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
1557
1558	if ((error = namei(&nd)) != 0)
1559		return (error);
1560	NDFREE(&nd, NDF_ONLY_PNBUF);
1561	vp = nd.ni_vp;
1562	if (vp->v_type == VDIR) {
1563		vrele(vp);
1564		return (EPERM);		/* POSIX */
1565	}
1566	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
1567		vrele(vp);
1568		return (error);
1569	}
1570	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2,
1571	    segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT), td);
1572	if ((error = namei(&nd)) == 0) {
1573		if (nd.ni_vp != NULL) {
1574			if (nd.ni_dvp == nd.ni_vp)
1575				vrele(nd.ni_dvp);
1576			else
1577				vput(nd.ni_dvp);
1578			vrele(nd.ni_vp);
1579			error = EEXIST;
1580		} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
1581		    == 0) {
1582			error = can_hardlink(vp, td->td_ucred);
1583			if (error == 0)
1584#ifdef MAC
1585				error = mac_vnode_check_link(td->td_ucred,
1586				    nd.ni_dvp, vp, &nd.ni_cnd);
1587			if (error == 0)
1588#endif
1589				error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1590			VOP_UNLOCK(vp, 0);
1591			vput(nd.ni_dvp);
1592		}
1593		NDFREE(&nd, NDF_ONLY_PNBUF);
1594	}
1595	vrele(vp);
1596	vn_finished_write(mp);
1597	return (error);
1598}
1599
1600/*
1601 * Make a symbolic link.
1602 */
1603#ifndef _SYS_SYSPROTO_H_
1604struct symlink_args {
1605	char	*path;
1606	char	*link;
1607};
1608#endif
1609int
1610sys_symlink(td, uap)
1611	struct thread *td;
1612	register struct symlink_args /* {
1613		char *path;
1614		char *link;
1615	} */ *uap;
1616{
1617
1618	return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
1619}
1620
1621#ifndef _SYS_SYSPROTO_H_
1622struct symlinkat_args {
1623	char	*path;
1624	int	fd;
1625	char	*path2;
1626};
1627#endif
1628int
1629sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1630{
1631
1632	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1633	    UIO_USERSPACE));
1634}
1635
1636int
1637kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
1638{
1639
1640	return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
1641}
1642
1643int
1644kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1645    enum uio_seg segflg)
1646{
1647	struct mount *mp;
1648	struct vattr vattr;
1649	char *syspath;
1650	struct nameidata nd;
1651	int error;
1652	cap_rights_t rights;
1653
1654	if (segflg == UIO_SYSSPACE) {
1655		syspath = path1;
1656	} else {
1657		syspath = uma_zalloc(namei_zone, M_WAITOK);
1658		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1659			goto out;
1660	}
1661	AUDIT_ARG_TEXT(syspath);
1662restart:
1663	bwillwrite();
1664	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
1665	    segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT), td);
1666	if ((error = namei(&nd)) != 0)
1667		goto out;
1668	if (nd.ni_vp) {
1669		NDFREE(&nd, NDF_ONLY_PNBUF);
1670		if (nd.ni_vp == nd.ni_dvp)
1671			vrele(nd.ni_dvp);
1672		else
1673			vput(nd.ni_dvp);
1674		vrele(nd.ni_vp);
1675		error = EEXIST;
1676		goto out;
1677	}
1678	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1679		NDFREE(&nd, NDF_ONLY_PNBUF);
1680		vput(nd.ni_dvp);
1681		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1682			goto out;
1683		goto restart;
1684	}
1685	VATTR_NULL(&vattr);
1686	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1687#ifdef MAC
1688	vattr.va_type = VLNK;
1689	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1690	    &vattr);
1691	if (error != 0)
1692		goto out2;
1693#endif
1694	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1695	if (error == 0)
1696		vput(nd.ni_vp);
1697#ifdef MAC
1698out2:
1699#endif
1700	NDFREE(&nd, NDF_ONLY_PNBUF);
1701	vput(nd.ni_dvp);
1702	vn_finished_write(mp);
1703out:
1704	if (segflg != UIO_SYSSPACE)
1705		uma_zfree(namei_zone, syspath);
1706	return (error);
1707}
1708
1709/*
1710 * Delete a whiteout from the filesystem.
1711 */
1712int
1713sys_undelete(td, uap)
1714	struct thread *td;
1715	register struct undelete_args /* {
1716		char *path;
1717	} */ *uap;
1718{
1719	struct mount *mp;
1720	struct nameidata nd;
1721	int error;
1722
1723restart:
1724	bwillwrite();
1725	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1726	    UIO_USERSPACE, uap->path, td);
1727	error = namei(&nd);
1728	if (error != 0)
1729		return (error);
1730
1731	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1732		NDFREE(&nd, NDF_ONLY_PNBUF);
1733		if (nd.ni_vp == nd.ni_dvp)
1734			vrele(nd.ni_dvp);
1735		else
1736			vput(nd.ni_dvp);
1737		if (nd.ni_vp)
1738			vrele(nd.ni_vp);
1739		return (EEXIST);
1740	}
1741	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1742		NDFREE(&nd, NDF_ONLY_PNBUF);
1743		vput(nd.ni_dvp);
1744		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1745			return (error);
1746		goto restart;
1747	}
1748	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1749	NDFREE(&nd, NDF_ONLY_PNBUF);
1750	vput(nd.ni_dvp);
1751	vn_finished_write(mp);
1752	return (error);
1753}
1754
1755/*
1756 * Delete a name from the filesystem.
1757 */
1758#ifndef _SYS_SYSPROTO_H_
1759struct unlink_args {
1760	char	*path;
1761};
1762#endif
1763int
1764sys_unlink(td, uap)
1765	struct thread *td;
1766	struct unlink_args /* {
1767		char *path;
1768	} */ *uap;
1769{
1770
1771	return (kern_unlink(td, uap->path, UIO_USERSPACE));
1772}
1773
1774#ifndef _SYS_SYSPROTO_H_
1775struct unlinkat_args {
1776	int	fd;
1777	char	*path;
1778	int	flag;
1779};
1780#endif
1781int
1782sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1783{
1784	int flag = uap->flag;
1785	int fd = uap->fd;
1786	char *path = uap->path;
1787
1788	if (flag & ~AT_REMOVEDIR)
1789		return (EINVAL);
1790
1791	if (flag & AT_REMOVEDIR)
1792		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1793	else
1794		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1795}
1796
1797int
1798kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
1799{
1800
1801	return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
1802}
1803
1804int
1805kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1806    ino_t oldinum)
1807{
1808	struct mount *mp;
1809	struct vnode *vp;
1810	struct nameidata nd;
1811	struct stat sb;
1812	cap_rights_t rights;
1813	int error;
1814
1815restart:
1816	bwillwrite();
1817	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1818	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1819	if ((error = namei(&nd)) != 0)
1820		return (error == EINVAL ? EPERM : error);
1821	vp = nd.ni_vp;
1822	if (vp->v_type == VDIR && oldinum == 0) {
1823		error = EPERM;		/* POSIX */
1824	} else if (oldinum != 0 &&
1825		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1826		  sb.st_ino != oldinum) {
1827			error = EIDRM;	/* Identifier removed */
1828	} else {
1829		/*
1830		 * The root of a mounted filesystem cannot be deleted.
1831		 *
1832		 * XXX: can this only be a VDIR case?
1833		 */
1834		if (vp->v_vflag & VV_ROOT)
1835			error = EBUSY;
1836	}
1837	if (error == 0) {
1838		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1839			NDFREE(&nd, NDF_ONLY_PNBUF);
1840			vput(nd.ni_dvp);
1841			if (vp == nd.ni_dvp)
1842				vrele(vp);
1843			else
1844				vput(vp);
1845			if ((error = vn_start_write(NULL, &mp,
1846			    V_XSLEEP | PCATCH)) != 0)
1847				return (error);
1848			goto restart;
1849		}
1850#ifdef MAC
1851		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1852		    &nd.ni_cnd);
1853		if (error != 0)
1854			goto out;
1855#endif
1856		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1857		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1858#ifdef MAC
1859out:
1860#endif
1861		vn_finished_write(mp);
1862	}
1863	NDFREE(&nd, NDF_ONLY_PNBUF);
1864	vput(nd.ni_dvp);
1865	if (vp == nd.ni_dvp)
1866		vrele(vp);
1867	else
1868		vput(vp);
1869	return (error);
1870}
1871
1872/*
1873 * Reposition read/write file offset.
1874 */
1875#ifndef _SYS_SYSPROTO_H_
1876struct lseek_args {
1877	int	fd;
1878	int	pad;
1879	off_t	offset;
1880	int	whence;
1881};
1882#endif
1883int
1884sys_lseek(td, uap)
1885	struct thread *td;
1886	register struct lseek_args /* {
1887		int fd;
1888		int pad;
1889		off_t offset;
1890		int whence;
1891	} */ *uap;
1892{
1893	struct file *fp;
1894	cap_rights_t rights;
1895	int error;
1896
1897	AUDIT_ARG_FD(uap->fd);
1898	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1899	if (error != 0)
1900		return (error);
1901	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1902	    fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
1903	fdrop(fp, td);
1904	return (error);
1905}
1906
1907#if defined(COMPAT_43)
1908/*
1909 * Reposition read/write file offset.
1910 */
1911#ifndef _SYS_SYSPROTO_H_
1912struct olseek_args {
1913	int	fd;
1914	long	offset;
1915	int	whence;
1916};
1917#endif
1918int
1919olseek(td, uap)
1920	struct thread *td;
1921	register struct olseek_args /* {
1922		int fd;
1923		long offset;
1924		int whence;
1925	} */ *uap;
1926{
1927	struct lseek_args /* {
1928		int fd;
1929		int pad;
1930		off_t offset;
1931		int whence;
1932	} */ nuap;
1933
1934	nuap.fd = uap->fd;
1935	nuap.offset = uap->offset;
1936	nuap.whence = uap->whence;
1937	return (sys_lseek(td, &nuap));
1938}
1939#endif /* COMPAT_43 */
1940
1941/* Version with the 'pad' argument */
1942int
1943freebsd6_lseek(td, uap)
1944	struct thread *td;
1945	register struct freebsd6_lseek_args *uap;
1946{
1947	struct lseek_args ouap;
1948
1949	ouap.fd = uap->fd;
1950	ouap.offset = uap->offset;
1951	ouap.whence = uap->whence;
1952	return (sys_lseek(td, &ouap));
1953}
1954
1955/*
1956 * Check access permissions using passed credentials.
1957 */
1958static int
1959vn_access(vp, user_flags, cred, td)
1960	struct vnode	*vp;
1961	int		user_flags;
1962	struct ucred	*cred;
1963	struct thread	*td;
1964{
1965	accmode_t accmode;
1966	int error;
1967
1968	/* Flags == 0 means only check for existence. */
1969	error = 0;
1970	if (user_flags) {
1971		accmode = 0;
1972		if (user_flags & R_OK)
1973			accmode |= VREAD;
1974		if (user_flags & W_OK)
1975			accmode |= VWRITE;
1976		if (user_flags & X_OK)
1977			accmode |= VEXEC;
1978#ifdef MAC
1979		error = mac_vnode_check_access(cred, vp, accmode);
1980		if (error != 0)
1981			return (error);
1982#endif
1983		if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
1984			error = VOP_ACCESS(vp, accmode, cred, td);
1985	}
1986	return (error);
1987}
1988
1989/*
1990 * Check access permissions using "real" credentials.
1991 */
1992#ifndef _SYS_SYSPROTO_H_
1993struct access_args {
1994	char	*path;
1995	int	amode;
1996};
1997#endif
1998int
1999sys_access(td, uap)
2000	struct thread *td;
2001	register struct access_args /* {
2002		char *path;
2003		int amode;
2004	} */ *uap;
2005{
2006
2007	return (kern_access(td, uap->path, UIO_USERSPACE, uap->amode));
2008}
2009
2010#ifndef _SYS_SYSPROTO_H_
2011struct faccessat_args {
2012	int	dirfd;
2013	char	*path;
2014	int	amode;
2015	int	flag;
2016}
2017#endif
2018int
2019sys_faccessat(struct thread *td, struct faccessat_args *uap)
2020{
2021
2022	if (uap->flag & ~AT_EACCESS)
2023		return (EINVAL);
2024	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
2025	    uap->amode));
2026}
2027
2028int
2029kern_access(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2030{
2031
2032	return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, amode));
2033}
2034
2035int
2036kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2037    int flag, int amode)
2038{
2039	struct ucred *cred, *tmpcred;
2040	struct vnode *vp;
2041	struct nameidata nd;
2042	cap_rights_t rights;
2043	int error;
2044
2045	/*
2046	 * Create and modify a temporary credential instead of one that
2047	 * is potentially shared.
2048	 */
2049	if (!(flag & AT_EACCESS)) {
2050		cred = td->td_ucred;
2051		tmpcred = crdup(cred);
2052		tmpcred->cr_uid = cred->cr_ruid;
2053		tmpcred->cr_groups[0] = cred->cr_rgid;
2054		td->td_ucred = tmpcred;
2055	} else
2056		cred = tmpcred = td->td_ucred;
2057	AUDIT_ARG_VALUE(amode);
2058	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
2059	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
2060	    td);
2061	if ((error = namei(&nd)) != 0)
2062		goto out1;
2063	vp = nd.ni_vp;
2064
2065	error = vn_access(vp, amode, tmpcred, td);
2066	NDFREE(&nd, NDF_ONLY_PNBUF);
2067	vput(vp);
2068out1:
2069	if (!(flag & AT_EACCESS)) {
2070		td->td_ucred = cred;
2071		crfree(tmpcred);
2072	}
2073	return (error);
2074}
2075
2076/*
2077 * Check access permissions using "effective" credentials.
2078 */
2079#ifndef _SYS_SYSPROTO_H_
2080struct eaccess_args {
2081	char	*path;
2082	int	amode;
2083};
2084#endif
2085int
2086sys_eaccess(td, uap)
2087	struct thread *td;
2088	register struct eaccess_args /* {
2089		char *path;
2090		int amode;
2091	} */ *uap;
2092{
2093
2094	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->amode));
2095}
2096
2097int
2098kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int amode)
2099{
2100
2101	return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, amode));
2102}
2103
2104#if defined(COMPAT_43)
2105/*
2106 * Get file status; this version follows links.
2107 */
2108#ifndef _SYS_SYSPROTO_H_
2109struct ostat_args {
2110	char	*path;
2111	struct ostat *ub;
2112};
2113#endif
2114int
2115ostat(td, uap)
2116	struct thread *td;
2117	register struct ostat_args /* {
2118		char *path;
2119		struct ostat *ub;
2120	} */ *uap;
2121{
2122	struct stat sb;
2123	struct ostat osb;
2124	int error;
2125
2126	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2127	if (error != 0)
2128		return (error);
2129	cvtstat(&sb, &osb);
2130	return (copyout(&osb, uap->ub, sizeof (osb)));
2131}
2132
2133/*
2134 * Get file status; this version does not follow links.
2135 */
2136#ifndef _SYS_SYSPROTO_H_
2137struct olstat_args {
2138	char	*path;
2139	struct ostat *ub;
2140};
2141#endif
2142int
2143olstat(td, uap)
2144	struct thread *td;
2145	register struct olstat_args /* {
2146		char *path;
2147		struct ostat *ub;
2148	} */ *uap;
2149{
2150	struct stat sb;
2151	struct ostat osb;
2152	int error;
2153
2154	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2155	if (error != 0)
2156		return (error);
2157	cvtstat(&sb, &osb);
2158	return (copyout(&osb, uap->ub, sizeof (osb)));
2159}
2160
2161/*
2162 * Convert from an old to a new stat structure.
2163 */
2164void
2165cvtstat(st, ost)
2166	struct stat *st;
2167	struct ostat *ost;
2168{
2169
2170	ost->st_dev = st->st_dev;
2171	ost->st_ino = st->st_ino;
2172	ost->st_mode = st->st_mode;
2173	ost->st_nlink = st->st_nlink;
2174	ost->st_uid = st->st_uid;
2175	ost->st_gid = st->st_gid;
2176	ost->st_rdev = st->st_rdev;
2177	if (st->st_size < (quad_t)1 << 32)
2178		ost->st_size = st->st_size;
2179	else
2180		ost->st_size = -2;
2181	ost->st_atim = st->st_atim;
2182	ost->st_mtim = st->st_mtim;
2183	ost->st_ctim = st->st_ctim;
2184	ost->st_blksize = st->st_blksize;
2185	ost->st_blocks = st->st_blocks;
2186	ost->st_flags = st->st_flags;
2187	ost->st_gen = st->st_gen;
2188}
2189#endif /* COMPAT_43 */
2190
2191/*
2192 * Get file status; this version follows links.
2193 */
2194#ifndef _SYS_SYSPROTO_H_
2195struct stat_args {
2196	char	*path;
2197	struct stat *ub;
2198};
2199#endif
2200int
2201sys_stat(td, uap)
2202	struct thread *td;
2203	register struct stat_args /* {
2204		char *path;
2205		struct stat *ub;
2206	} */ *uap;
2207{
2208	struct stat sb;
2209	int error;
2210
2211	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2212	if (error == 0)
2213		error = copyout(&sb, uap->ub, sizeof (sb));
2214	return (error);
2215}
2216
2217#ifndef _SYS_SYSPROTO_H_
2218struct fstatat_args {
2219	int	fd;
2220	char	*path;
2221	struct stat	*buf;
2222	int	flag;
2223}
2224#endif
2225int
2226sys_fstatat(struct thread *td, struct fstatat_args *uap)
2227{
2228	struct stat sb;
2229	int error;
2230
2231	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2232	    UIO_USERSPACE, &sb);
2233	if (error == 0)
2234		error = copyout(&sb, uap->buf, sizeof (sb));
2235	return (error);
2236}
2237
2238int
2239kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2240{
2241
2242	return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
2243}
2244
2245int
2246kern_statat(struct thread *td, int flag, int fd, char *path,
2247    enum uio_seg pathseg, struct stat *sbp)
2248{
2249
2250	return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
2251}
2252
2253int
2254kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
2255    enum uio_seg pathseg, struct stat *sbp,
2256    void (*hook)(struct vnode *vp, struct stat *sbp))
2257{
2258	struct nameidata nd;
2259	struct stat sb;
2260	cap_rights_t rights;
2261	int error;
2262
2263	if (flag & ~AT_SYMLINK_NOFOLLOW)
2264		return (EINVAL);
2265
2266	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2267	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2268	    cap_rights_init(&rights, CAP_FSTAT), td);
2269
2270	if ((error = namei(&nd)) != 0)
2271		return (error);
2272	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2273	if (error == 0) {
2274		SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
2275		if (S_ISREG(sb.st_mode))
2276			SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
2277		if (__predict_false(hook != NULL))
2278			hook(nd.ni_vp, &sb);
2279	}
2280	NDFREE(&nd, NDF_ONLY_PNBUF);
2281	vput(nd.ni_vp);
2282	if (error != 0)
2283		return (error);
2284	*sbp = sb;
2285#ifdef KTRACE
2286	if (KTRPOINT(td, KTR_STRUCT))
2287		ktrstat(&sb);
2288#endif
2289	return (0);
2290}
2291
2292/*
2293 * Get file status; this version does not follow links.
2294 */
2295#ifndef _SYS_SYSPROTO_H_
2296struct lstat_args {
2297	char	*path;
2298	struct stat *ub;
2299};
2300#endif
2301int
2302sys_lstat(td, uap)
2303	struct thread *td;
2304	register struct lstat_args /* {
2305		char *path;
2306		struct stat *ub;
2307	} */ *uap;
2308{
2309	struct stat sb;
2310	int error;
2311
2312	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2313	if (error == 0)
2314		error = copyout(&sb, uap->ub, sizeof (sb));
2315	return (error);
2316}
2317
2318int
2319kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
2320{
2321
2322	return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
2323	    sbp));
2324}
2325
2326/*
2327 * Implementation of the NetBSD [l]stat() functions.
2328 */
2329void
2330cvtnstat(sb, nsb)
2331	struct stat *sb;
2332	struct nstat *nsb;
2333{
2334
2335	bzero(nsb, sizeof *nsb);
2336	nsb->st_dev = sb->st_dev;
2337	nsb->st_ino = sb->st_ino;
2338	nsb->st_mode = sb->st_mode;
2339	nsb->st_nlink = sb->st_nlink;
2340	nsb->st_uid = sb->st_uid;
2341	nsb->st_gid = sb->st_gid;
2342	nsb->st_rdev = sb->st_rdev;
2343	nsb->st_atim = sb->st_atim;
2344	nsb->st_mtim = sb->st_mtim;
2345	nsb->st_ctim = sb->st_ctim;
2346	nsb->st_size = sb->st_size;
2347	nsb->st_blocks = sb->st_blocks;
2348	nsb->st_blksize = sb->st_blksize;
2349	nsb->st_flags = sb->st_flags;
2350	nsb->st_gen = sb->st_gen;
2351	nsb->st_birthtim = sb->st_birthtim;
2352}
2353
2354#ifndef _SYS_SYSPROTO_H_
2355struct nstat_args {
2356	char	*path;
2357	struct nstat *ub;
2358};
2359#endif
2360int
2361sys_nstat(td, uap)
2362	struct thread *td;
2363	register struct nstat_args /* {
2364		char *path;
2365		struct nstat *ub;
2366	} */ *uap;
2367{
2368	struct stat sb;
2369	struct nstat nsb;
2370	int error;
2371
2372	error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
2373	if (error != 0)
2374		return (error);
2375	cvtnstat(&sb, &nsb);
2376	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2377}
2378
2379/*
2380 * NetBSD lstat.  Get file status; this version does not follow links.
2381 */
2382#ifndef _SYS_SYSPROTO_H_
2383struct lstat_args {
2384	char	*path;
2385	struct stat *ub;
2386};
2387#endif
2388int
2389sys_nlstat(td, uap)
2390	struct thread *td;
2391	register struct nlstat_args /* {
2392		char *path;
2393		struct nstat *ub;
2394	} */ *uap;
2395{
2396	struct stat sb;
2397	struct nstat nsb;
2398	int error;
2399
2400	error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
2401	if (error != 0)
2402		return (error);
2403	cvtnstat(&sb, &nsb);
2404	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2405}
2406
2407/*
2408 * Get configurable pathname variables.
2409 */
2410#ifndef _SYS_SYSPROTO_H_
2411struct pathconf_args {
2412	char	*path;
2413	int	name;
2414};
2415#endif
2416int
2417sys_pathconf(td, uap)
2418	struct thread *td;
2419	register struct pathconf_args /* {
2420		char *path;
2421		int name;
2422	} */ *uap;
2423{
2424
2425	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2426}
2427
2428#ifndef _SYS_SYSPROTO_H_
2429struct lpathconf_args {
2430	char	*path;
2431	int	name;
2432};
2433#endif
2434int
2435sys_lpathconf(td, uap)
2436	struct thread *td;
2437	register struct lpathconf_args /* {
2438		char *path;
2439		int name;
2440	} */ *uap;
2441{
2442
2443	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2444	    NOFOLLOW));
2445}
2446
2447int
2448kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2449    u_long flags)
2450{
2451	struct nameidata nd;
2452	int error;
2453
2454	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2455	    pathseg, path, td);
2456	if ((error = namei(&nd)) != 0)
2457		return (error);
2458	NDFREE(&nd, NDF_ONLY_PNBUF);
2459
2460	/* If asynchronous I/O is available, it works for all files. */
2461	if (name == _PC_ASYNC_IO)
2462		td->td_retval[0] = async_io_version;
2463	else
2464		error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2465	vput(nd.ni_vp);
2466	return (error);
2467}
2468
2469/*
2470 * Return target name of a symbolic link.
2471 */
2472#ifndef _SYS_SYSPROTO_H_
2473struct readlink_args {
2474	char	*path;
2475	char	*buf;
2476	size_t	count;
2477};
2478#endif
2479int
2480sys_readlink(td, uap)
2481	struct thread *td;
2482	register struct readlink_args /* {
2483		char *path;
2484		char *buf;
2485		size_t count;
2486	} */ *uap;
2487{
2488
2489	return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
2490	    UIO_USERSPACE, uap->count));
2491}
2492#ifndef _SYS_SYSPROTO_H_
2493struct readlinkat_args {
2494	int	fd;
2495	char	*path;
2496	char	*buf;
2497	size_t	bufsize;
2498};
2499#endif
2500int
2501sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2502{
2503
2504	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2505	    uap->buf, UIO_USERSPACE, uap->bufsize));
2506}
2507
2508int
2509kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
2510    enum uio_seg bufseg, size_t count)
2511{
2512
2513	return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
2514	    count));
2515}
2516
2517int
2518kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2519    char *buf, enum uio_seg bufseg, size_t count)
2520{
2521	struct vnode *vp;
2522	struct iovec aiov;
2523	struct uio auio;
2524	struct nameidata nd;
2525	int error;
2526
2527	if (count > IOSIZE_MAX)
2528		return (EINVAL);
2529
2530	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2531	    pathseg, path, fd, td);
2532
2533	if ((error = namei(&nd)) != 0)
2534		return (error);
2535	NDFREE(&nd, NDF_ONLY_PNBUF);
2536	vp = nd.ni_vp;
2537#ifdef MAC
2538	error = mac_vnode_check_readlink(td->td_ucred, vp);
2539	if (error != 0) {
2540		vput(vp);
2541		return (error);
2542	}
2543#endif
2544	if (vp->v_type != VLNK)
2545		error = EINVAL;
2546	else {
2547		aiov.iov_base = buf;
2548		aiov.iov_len = count;
2549		auio.uio_iov = &aiov;
2550		auio.uio_iovcnt = 1;
2551		auio.uio_offset = 0;
2552		auio.uio_rw = UIO_READ;
2553		auio.uio_segflg = bufseg;
2554		auio.uio_td = td;
2555		auio.uio_resid = count;
2556		error = VOP_READLINK(vp, &auio, td->td_ucred);
2557	}
2558	vput(vp);
2559	td->td_retval[0] = count - auio.uio_resid;
2560	return (error);
2561}
2562
2563/*
2564 * Common implementation code for chflags() and fchflags().
2565 */
2566static int
2567setfflags(td, vp, flags)
2568	struct thread *td;
2569	struct vnode *vp;
2570	u_long flags;
2571{
2572	struct mount *mp;
2573	struct vattr vattr;
2574	int error;
2575
2576	/* We can't support the value matching VNOVAL. */
2577	if (flags == VNOVAL)
2578		return (EOPNOTSUPP);
2579
2580	/*
2581	 * Prevent non-root users from setting flags on devices.  When
2582	 * a device is reused, users can retain ownership of the device
2583	 * if they are allowed to set flags and programs assume that
2584	 * chown can't fail when done as root.
2585	 */
2586	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2587		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2588		if (error != 0)
2589			return (error);
2590	}
2591
2592	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2593		return (error);
2594	VATTR_NULL(&vattr);
2595	vattr.va_flags = flags;
2596	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2597#ifdef MAC
2598	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2599	if (error == 0)
2600#endif
2601		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2602	VOP_UNLOCK(vp, 0);
2603	vn_finished_write(mp);
2604	return (error);
2605}
2606
2607/*
2608 * Change flags of a file given a path name.
2609 */
2610#ifndef _SYS_SYSPROTO_H_
2611struct chflags_args {
2612	const char *path;
2613	u_long	flags;
2614};
2615#endif
2616int
2617sys_chflags(td, uap)
2618	struct thread *td;
2619	register struct chflags_args /* {
2620		const char *path;
2621		u_long flags;
2622	} */ *uap;
2623{
2624
2625	return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
2626}
2627
2628#ifndef _SYS_SYSPROTO_H_
2629struct chflagsat_args {
2630	int	fd;
2631	const char *path;
2632	u_long	flags;
2633	int	atflag;
2634}
2635#endif
2636int
2637sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2638{
2639	int fd = uap->fd;
2640	const char *path = uap->path;
2641	u_long flags = uap->flags;
2642	int atflag = uap->atflag;
2643
2644	if (atflag & ~AT_SYMLINK_NOFOLLOW)
2645		return (EINVAL);
2646
2647	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2648}
2649
2650static int
2651kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
2652    u_long flags)
2653{
2654
2655	return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
2656}
2657
2658/*
2659 * Same as chflags() but doesn't follow symlinks.
2660 */
2661int
2662sys_lchflags(td, uap)
2663	struct thread *td;
2664	register struct lchflags_args /* {
2665		const char *path;
2666		u_long flags;
2667	} */ *uap;
2668{
2669
2670	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2671	    uap->flags, AT_SYMLINK_NOFOLLOW));
2672}
2673
2674static int
2675kern_chflagsat(struct thread *td, int fd, const char *path,
2676    enum uio_seg pathseg, u_long flags, int atflag)
2677{
2678	struct nameidata nd;
2679	cap_rights_t rights;
2680	int error, follow;
2681
2682	AUDIT_ARG_FFLAGS(flags);
2683	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2684	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2685	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
2686	if ((error = namei(&nd)) != 0)
2687		return (error);
2688	NDFREE(&nd, NDF_ONLY_PNBUF);
2689	error = setfflags(td, nd.ni_vp, flags);
2690	vrele(nd.ni_vp);
2691	return (error);
2692}
2693
2694/*
2695 * Change flags of a file given a file descriptor.
2696 */
2697#ifndef _SYS_SYSPROTO_H_
2698struct fchflags_args {
2699	int	fd;
2700	u_long	flags;
2701};
2702#endif
2703int
2704sys_fchflags(td, uap)
2705	struct thread *td;
2706	register struct fchflags_args /* {
2707		int fd;
2708		u_long flags;
2709	} */ *uap;
2710{
2711	struct file *fp;
2712	cap_rights_t rights;
2713	int error;
2714
2715	AUDIT_ARG_FD(uap->fd);
2716	AUDIT_ARG_FFLAGS(uap->flags);
2717	error = getvnode(td->td_proc->p_fd, uap->fd,
2718	    cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
2719	if (error != 0)
2720		return (error);
2721#ifdef AUDIT
2722	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2723	AUDIT_ARG_VNODE1(fp->f_vnode);
2724	VOP_UNLOCK(fp->f_vnode, 0);
2725#endif
2726	error = setfflags(td, fp->f_vnode, uap->flags);
2727	fdrop(fp, td);
2728	return (error);
2729}
2730
2731/*
2732 * Common implementation code for chmod(), lchmod() and fchmod().
2733 */
2734int
2735setfmode(td, cred, vp, mode)
2736	struct thread *td;
2737	struct ucred *cred;
2738	struct vnode *vp;
2739	int mode;
2740{
2741	struct mount *mp;
2742	struct vattr vattr;
2743	int error;
2744
2745	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2746		return (error);
2747	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2748	VATTR_NULL(&vattr);
2749	vattr.va_mode = mode & ALLPERMS;
2750#ifdef MAC
2751	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2752	if (error == 0)
2753#endif
2754		error = VOP_SETATTR(vp, &vattr, cred);
2755	VOP_UNLOCK(vp, 0);
2756	vn_finished_write(mp);
2757	return (error);
2758}
2759
2760/*
2761 * Change mode of a file given path name.
2762 */
2763#ifndef _SYS_SYSPROTO_H_
2764struct chmod_args {
2765	char	*path;
2766	int	mode;
2767};
2768#endif
2769int
2770sys_chmod(td, uap)
2771	struct thread *td;
2772	register struct chmod_args /* {
2773		char *path;
2774		int mode;
2775	} */ *uap;
2776{
2777
2778	return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
2779}
2780
2781#ifndef _SYS_SYSPROTO_H_
2782struct fchmodat_args {
2783	int	dirfd;
2784	char	*path;
2785	mode_t	mode;
2786	int	flag;
2787}
2788#endif
2789int
2790sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2791{
2792	int flag = uap->flag;
2793	int fd = uap->fd;
2794	char *path = uap->path;
2795	mode_t mode = uap->mode;
2796
2797	if (flag & ~AT_SYMLINK_NOFOLLOW)
2798		return (EINVAL);
2799
2800	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2801}
2802
2803int
2804kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
2805{
2806
2807	return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
2808}
2809
2810/*
2811 * Change mode of a file given path name (don't follow links.)
2812 */
2813#ifndef _SYS_SYSPROTO_H_
2814struct lchmod_args {
2815	char	*path;
2816	int	mode;
2817};
2818#endif
2819int
2820sys_lchmod(td, uap)
2821	struct thread *td;
2822	register struct lchmod_args /* {
2823		char *path;
2824		int mode;
2825	} */ *uap;
2826{
2827
2828	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2829	    uap->mode, AT_SYMLINK_NOFOLLOW));
2830}
2831
2832int
2833kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2834    mode_t mode, int flag)
2835{
2836	struct nameidata nd;
2837	cap_rights_t rights;
2838	int error, follow;
2839
2840	AUDIT_ARG_MODE(mode);
2841	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2842	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2843	    cap_rights_init(&rights, CAP_FCHMOD), td);
2844	if ((error = namei(&nd)) != 0)
2845		return (error);
2846	NDFREE(&nd, NDF_ONLY_PNBUF);
2847	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2848	vrele(nd.ni_vp);
2849	return (error);
2850}
2851
2852/*
2853 * Change mode of a file given a file descriptor.
2854 */
2855#ifndef _SYS_SYSPROTO_H_
2856struct fchmod_args {
2857	int	fd;
2858	int	mode;
2859};
2860#endif
2861int
2862sys_fchmod(struct thread *td, struct fchmod_args *uap)
2863{
2864	struct file *fp;
2865	cap_rights_t rights;
2866	int error;
2867
2868	AUDIT_ARG_FD(uap->fd);
2869	AUDIT_ARG_MODE(uap->mode);
2870
2871	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2872	if (error != 0)
2873		return (error);
2874	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2875	fdrop(fp, td);
2876	return (error);
2877}
2878
2879/*
2880 * Common implementation for chown(), lchown(), and fchown()
2881 */
2882int
2883setfown(td, cred, vp, uid, gid)
2884	struct thread *td;
2885	struct ucred *cred;
2886	struct vnode *vp;
2887	uid_t uid;
2888	gid_t gid;
2889{
2890	struct mount *mp;
2891	struct vattr vattr;
2892	int error;
2893
2894	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2895		return (error);
2896	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2897	VATTR_NULL(&vattr);
2898	vattr.va_uid = uid;
2899	vattr.va_gid = gid;
2900#ifdef MAC
2901	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2902	    vattr.va_gid);
2903	if (error == 0)
2904#endif
2905		error = VOP_SETATTR(vp, &vattr, cred);
2906	VOP_UNLOCK(vp, 0);
2907	vn_finished_write(mp);
2908	return (error);
2909}
2910
2911/*
2912 * Set ownership given a path name.
2913 */
2914#ifndef _SYS_SYSPROTO_H_
2915struct chown_args {
2916	char	*path;
2917	int	uid;
2918	int	gid;
2919};
2920#endif
2921int
2922sys_chown(td, uap)
2923	struct thread *td;
2924	register struct chown_args /* {
2925		char *path;
2926		int uid;
2927		int gid;
2928	} */ *uap;
2929{
2930
2931	return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
2932}
2933
2934#ifndef _SYS_SYSPROTO_H_
2935struct fchownat_args {
2936	int fd;
2937	const char * path;
2938	uid_t uid;
2939	gid_t gid;
2940	int flag;
2941};
2942#endif
2943int
2944sys_fchownat(struct thread *td, struct fchownat_args *uap)
2945{
2946	int flag;
2947
2948	flag = uap->flag;
2949	if (flag & ~AT_SYMLINK_NOFOLLOW)
2950		return (EINVAL);
2951
2952	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2953	    uap->gid, uap->flag));
2954}
2955
2956int
2957kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
2958    int gid)
2959{
2960
2961	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
2962}
2963
2964int
2965kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2966    int uid, int gid, int flag)
2967{
2968	struct nameidata nd;
2969	cap_rights_t rights;
2970	int error, follow;
2971
2972	AUDIT_ARG_OWNER(uid, gid);
2973	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2974	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2975	    cap_rights_init(&rights, CAP_FCHOWN), td);
2976
2977	if ((error = namei(&nd)) != 0)
2978		return (error);
2979	NDFREE(&nd, NDF_ONLY_PNBUF);
2980	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
2981	vrele(nd.ni_vp);
2982	return (error);
2983}
2984
2985/*
2986 * Set ownership given a path name, do not cross symlinks.
2987 */
2988#ifndef _SYS_SYSPROTO_H_
2989struct lchown_args {
2990	char	*path;
2991	int	uid;
2992	int	gid;
2993};
2994#endif
2995int
2996sys_lchown(td, uap)
2997	struct thread *td;
2998	register struct lchown_args /* {
2999		char *path;
3000		int uid;
3001		int gid;
3002	} */ *uap;
3003{
3004
3005	return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
3006}
3007
3008int
3009kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
3010    int gid)
3011{
3012
3013	return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
3014	    AT_SYMLINK_NOFOLLOW));
3015}
3016
3017/*
3018 * Set ownership given a file descriptor.
3019 */
3020#ifndef _SYS_SYSPROTO_H_
3021struct fchown_args {
3022	int	fd;
3023	int	uid;
3024	int	gid;
3025};
3026#endif
3027int
3028sys_fchown(td, uap)
3029	struct thread *td;
3030	register struct fchown_args /* {
3031		int fd;
3032		int uid;
3033		int gid;
3034	} */ *uap;
3035{
3036	struct file *fp;
3037	cap_rights_t rights;
3038	int error;
3039
3040	AUDIT_ARG_FD(uap->fd);
3041	AUDIT_ARG_OWNER(uap->uid, uap->gid);
3042	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
3043	if (error != 0)
3044		return (error);
3045	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
3046	fdrop(fp, td);
3047	return (error);
3048}
3049
3050/*
3051 * Common implementation code for utimes(), lutimes(), and futimes().
3052 */
3053static int
3054getutimes(usrtvp, tvpseg, tsp)
3055	const struct timeval *usrtvp;
3056	enum uio_seg tvpseg;
3057	struct timespec *tsp;
3058{
3059	struct timeval tv[2];
3060	const struct timeval *tvp;
3061	int error;
3062
3063	if (usrtvp == NULL) {
3064		vfs_timestamp(&tsp[0]);
3065		tsp[1] = tsp[0];
3066	} else {
3067		if (tvpseg == UIO_SYSSPACE) {
3068			tvp = usrtvp;
3069		} else {
3070			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
3071				return (error);
3072			tvp = tv;
3073		}
3074
3075		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
3076		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
3077			return (EINVAL);
3078		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
3079		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
3080	}
3081	return (0);
3082}
3083
3084/*
3085 * Common implementation code for utimes(), lutimes(), and futimes().
3086 */
3087static int
3088setutimes(td, vp, ts, numtimes, nullflag)
3089	struct thread *td;
3090	struct vnode *vp;
3091	const struct timespec *ts;
3092	int numtimes;
3093	int nullflag;
3094{
3095	struct mount *mp;
3096	struct vattr vattr;
3097	int error, setbirthtime;
3098
3099	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
3100		return (error);
3101	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3102	setbirthtime = 0;
3103	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
3104	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
3105		setbirthtime = 1;
3106	VATTR_NULL(&vattr);
3107	vattr.va_atime = ts[0];
3108	vattr.va_mtime = ts[1];
3109	if (setbirthtime)
3110		vattr.va_birthtime = ts[1];
3111	if (numtimes > 2)
3112		vattr.va_birthtime = ts[2];
3113	if (nullflag)
3114		vattr.va_vaflags |= VA_UTIMES_NULL;
3115#ifdef MAC
3116	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
3117	    vattr.va_mtime);
3118#endif
3119	if (error == 0)
3120		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3121	VOP_UNLOCK(vp, 0);
3122	vn_finished_write(mp);
3123	return (error);
3124}
3125
3126/*
3127 * Set the access and modification times of a file.
3128 */
3129#ifndef _SYS_SYSPROTO_H_
3130struct utimes_args {
3131	char	*path;
3132	struct	timeval *tptr;
3133};
3134#endif
3135int
3136sys_utimes(td, uap)
3137	struct thread *td;
3138	register struct utimes_args /* {
3139		char *path;
3140		struct timeval *tptr;
3141	} */ *uap;
3142{
3143
3144	return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3145	    UIO_USERSPACE));
3146}
3147
3148#ifndef _SYS_SYSPROTO_H_
3149struct futimesat_args {
3150	int fd;
3151	const char * path;
3152	const struct timeval * times;
3153};
3154#endif
3155int
3156sys_futimesat(struct thread *td, struct futimesat_args *uap)
3157{
3158
3159	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3160	    uap->times, UIO_USERSPACE));
3161}
3162
3163int
3164kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
3165    struct timeval *tptr, enum uio_seg tptrseg)
3166{
3167
3168	return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
3169}
3170
3171int
3172kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3173    struct timeval *tptr, enum uio_seg tptrseg)
3174{
3175	struct nameidata nd;
3176	struct timespec ts[2];
3177	cap_rights_t rights;
3178	int error;
3179
3180	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3181		return (error);
3182	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3183	    cap_rights_init(&rights, CAP_FUTIMES), td);
3184
3185	if ((error = namei(&nd)) != 0)
3186		return (error);
3187	NDFREE(&nd, NDF_ONLY_PNBUF);
3188	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3189	vrele(nd.ni_vp);
3190	return (error);
3191}
3192
3193/*
3194 * Set the access and modification times of a file.
3195 */
3196#ifndef _SYS_SYSPROTO_H_
3197struct lutimes_args {
3198	char	*path;
3199	struct	timeval *tptr;
3200};
3201#endif
3202int
3203sys_lutimes(td, uap)
3204	struct thread *td;
3205	register struct lutimes_args /* {
3206		char *path;
3207		struct timeval *tptr;
3208	} */ *uap;
3209{
3210
3211	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3212	    UIO_USERSPACE));
3213}
3214
3215int
3216kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3217    struct timeval *tptr, enum uio_seg tptrseg)
3218{
3219	struct timespec ts[2];
3220	struct nameidata nd;
3221	int error;
3222
3223	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3224		return (error);
3225	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3226	if ((error = namei(&nd)) != 0)
3227		return (error);
3228	NDFREE(&nd, NDF_ONLY_PNBUF);
3229	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3230	vrele(nd.ni_vp);
3231	return (error);
3232}
3233
3234/*
3235 * Set the access and modification times of a file.
3236 */
3237#ifndef _SYS_SYSPROTO_H_
3238struct futimes_args {
3239	int	fd;
3240	struct	timeval *tptr;
3241};
3242#endif
3243int
3244sys_futimes(td, uap)
3245	struct thread *td;
3246	register struct futimes_args /* {
3247		int  fd;
3248		struct timeval *tptr;
3249	} */ *uap;
3250{
3251
3252	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3253}
3254
3255int
3256kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3257    enum uio_seg tptrseg)
3258{
3259	struct timespec ts[2];
3260	struct file *fp;
3261	cap_rights_t rights;
3262	int error;
3263
3264	AUDIT_ARG_FD(fd);
3265	error = getutimes(tptr, tptrseg, ts);
3266	if (error != 0)
3267		return (error);
3268	error = getvnode(td->td_proc->p_fd, fd,
3269	    cap_rights_init(&rights, CAP_FUTIMES), &fp);
3270	if (error != 0)
3271		return (error);
3272#ifdef AUDIT
3273	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3274	AUDIT_ARG_VNODE1(fp->f_vnode);
3275	VOP_UNLOCK(fp->f_vnode, 0);
3276#endif
3277	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3278	fdrop(fp, td);
3279	return (error);
3280}
3281
3282/*
3283 * Truncate a file given its path name.
3284 */
3285#ifndef _SYS_SYSPROTO_H_
3286struct truncate_args {
3287	char	*path;
3288	int	pad;
3289	off_t	length;
3290};
3291#endif
3292int
3293sys_truncate(td, uap)
3294	struct thread *td;
3295	register struct truncate_args /* {
3296		char *path;
3297		int pad;
3298		off_t length;
3299	} */ *uap;
3300{
3301
3302	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3303}
3304
3305int
3306kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
3307{
3308	struct mount *mp;
3309	struct vnode *vp;
3310	void *rl_cookie;
3311	struct vattr vattr;
3312	struct nameidata nd;
3313	int error;
3314
3315	if (length < 0)
3316		return(EINVAL);
3317	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
3318	if ((error = namei(&nd)) != 0)
3319		return (error);
3320	vp = nd.ni_vp;
3321	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
3322	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
3323		vn_rangelock_unlock(vp, rl_cookie);
3324		vrele(vp);
3325		return (error);
3326	}
3327	NDFREE(&nd, NDF_ONLY_PNBUF);
3328	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3329	if (vp->v_type == VDIR)
3330		error = EISDIR;
3331#ifdef MAC
3332	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
3333	}
3334#endif
3335	else if ((error = vn_writechk(vp)) == 0 &&
3336	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
3337		VATTR_NULL(&vattr);
3338		vattr.va_size = length;
3339		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3340	}
3341	VOP_UNLOCK(vp, 0);
3342	vn_finished_write(mp);
3343	vn_rangelock_unlock(vp, rl_cookie);
3344	vrele(vp);
3345	return (error);
3346}
3347
#if defined(COMPAT_43)
/*
 * Old (COMPAT_43) truncate entry point: the length is passed as a long.
 * The arguments are repacked into a truncate_args and handed to
 * sys_truncate().
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	struct truncate_args targs;

	targs.path = uap->path;
	targs.length = uap->length;
	return (sys_truncate(td, &targs));
}
#endif /* COMPAT_43 */
3377
3378/* Versions with the pad argument */
3379int
3380freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3381{
3382	struct truncate_args ouap;
3383
3384	ouap.path = uap->path;
3385	ouap.length = uap->length;
3386	return (sys_truncate(td, &ouap));
3387}
3388
3389int
3390freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3391{
3392	struct ftruncate_args ouap;
3393
3394	ouap.fd = uap->fd;
3395	ouap.length = uap->length;
3396	return (sys_ftruncate(td, &ouap));
3397}
3398
3399/*
3400 * Sync an open file.
3401 */
3402#ifndef _SYS_SYSPROTO_H_
3403struct fsync_args {
3404	int	fd;
3405};
3406#endif
3407int
3408sys_fsync(td, uap)
3409	struct thread *td;
3410	struct fsync_args /* {
3411		int fd;
3412	} */ *uap;
3413{
3414	struct vnode *vp;
3415	struct mount *mp;
3416	struct file *fp;
3417	cap_rights_t rights;
3418	int error, lock_flags;
3419
3420	AUDIT_ARG_FD(uap->fd);
3421	error = getvnode(td->td_proc->p_fd, uap->fd,
3422	    cap_rights_init(&rights, CAP_FSYNC), &fp);
3423	if (error != 0)
3424		return (error);
3425	vp = fp->f_vnode;
3426	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3427	if (error != 0)
3428		goto drop;
3429	if (MNT_SHARED_WRITES(mp) ||
3430	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3431		lock_flags = LK_SHARED;
3432	} else {
3433		lock_flags = LK_EXCLUSIVE;
3434	}
3435	vn_lock(vp, lock_flags | LK_RETRY);
3436	AUDIT_ARG_VNODE1(vp);
3437	if (vp->v_object != NULL) {
3438		VM_OBJECT_WLOCK(vp->v_object);
3439		vm_object_page_clean(vp->v_object, 0, 0, 0);
3440		VM_OBJECT_WUNLOCK(vp->v_object);
3441	}
3442	error = VOP_FSYNC(vp, MNT_WAIT, td);
3443
3444	VOP_UNLOCK(vp, 0);
3445	vn_finished_write(mp);
3446drop:
3447	fdrop(fp, td);
3448	return (error);
3449}
3450
3451/*
3452 * Rename files.  Source and destination must either both be directories, or
3453 * both not be directories.  If target is a directory, it must be empty.
3454 */
3455#ifndef _SYS_SYSPROTO_H_
3456struct rename_args {
3457	char	*from;
3458	char	*to;
3459};
3460#endif
3461int
3462sys_rename(td, uap)
3463	struct thread *td;
3464	register struct rename_args /* {
3465		char *from;
3466		char *to;
3467	} */ *uap;
3468{
3469
3470	return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
3471}
3472
3473#ifndef _SYS_SYSPROTO_H_
3474struct renameat_args {
3475	int	oldfd;
3476	char	*old;
3477	int	newfd;
3478	char	*new;
3479};
3480#endif
3481int
3482sys_renameat(struct thread *td, struct renameat_args *uap)
3483{
3484
3485	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3486	    UIO_USERSPACE));
3487}
3488
3489int
3490kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
3491{
3492
3493	return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
3494}
3495
/*
 * Rename the file named by old (looked up relative to oldfd) to new
 * (looked up relative to newfd).  Both paths live in address space pathseg.
 *
 * The source lookup requires CAP_RENAMEAT on oldfd and the target lookup
 * requires CAP_LINKAT on newfd; with CAPABILITIES, replacing an existing
 * target through a descriptor additionally requires CAP_UNLINKAT.
 *
 * Internally error == -1 means "source and target are the same vnode,
 * nothing to do"; it takes the cleanup path and is reported to the caller
 * as success (0).
 */
int
kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
    enum uio_seg pathseg)
{
	struct mount *mp = NULL;
	struct vnode *tvp, *fvp, *tdvp;
	struct nameidata fromnd, tond;
	cap_rights_t rights;
	int error;

	bwillwrite();
	/*
	 * Look up the source.  With MAC the parent and leaf are returned
	 * locked so the policy check below can inspect them; without MAC
	 * only references are requested (WANTPARENT).
	 */
#ifdef MAC
	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
	    AUDITVNODE1, pathseg, old, oldfd,
	    cap_rights_init(&rights, CAP_RENAMEAT), td);
#else
	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
	    pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
#endif

	if ((error = namei(&fromnd)) != 0)
		return (error);
#ifdef MAC
	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
	    fromnd.ni_vp, &fromnd.ni_cnd);
	/* Drop the locks taken for the MAC check; the references remain. */
	VOP_UNLOCK(fromnd.ni_dvp, 0);
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_vp, 0);
#endif
	fvp = fromnd.ni_vp;
	if (error == 0)
		error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
	if (error != 0) {
		/* MAC denial or vn_start_write() failure: undo the lookup. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	/* Look up the target with its parent (and leaf, if any) locked. */
	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
	    cap_rights_init(&rights, CAP_LINKAT), td);
	if (fromnd.ni_vp->v_type == VDIR)
		tond.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&tond)) != 0) {
		/* Translate error code for rename("dir1", "dir2/."). */
		if (error == EISDIR && fvp->v_type == VDIR)
			error = EINVAL;
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		vn_finished_write(mp);
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;
	if (tvp != NULL) {
		/* Source and existing target must agree on being a directory. */
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
#ifdef CAPABILITIES
		if (newfd != AT_FDCWD) {
			/*
			 * If the target already exists we require CAP_UNLINKAT
			 * from 'newfd'.
			 */
			error = cap_check(&tond.ni_filecaps.fc_rights,
			    cap_rights_init(&rights, CAP_UNLINKAT));
			if (error != 0)
				goto out;
		}
#endif
	}
	/* The source vnode itself may not be the target's parent directory. */
	if (fvp == tdvp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * If the source is the same as the destination (that is, if they
	 * are links to the same vnode), then there is nothing to do.
	 */
	if (fvp == tvp)
		error = -1;
#ifdef MAC
	else
		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
	if (error == 0) {
		/*
		 * NOTE(review): no vrele()/vput() of the four vnodes follows
		 * on this path, so VOP_RENAME() is presumably responsible
		 * for releasing them -- confirm against VOP_RENAME(9).
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
	} else {
		/*
		 * Failure or nothing to do: release the target vnodes (which
		 * were looked up locked) and the source references here.
		 */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
		if (tvp != NULL)
			vput(tvp);
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
	vrele(tond.ni_startdir);
	vn_finished_write(mp);
out1:
	if (fromnd.ni_startdir)
		vrele(fromnd.ni_startdir);
	/* Map the internal "nothing to do" marker to success. */
	if (error == -1)
		return (0);
	return (error);
}
3614
3615/*
3616 * Make a directory file.
3617 */
3618#ifndef _SYS_SYSPROTO_H_
3619struct mkdir_args {
3620	char	*path;
3621	int	mode;
3622};
3623#endif
3624int
3625sys_mkdir(td, uap)
3626	struct thread *td;
3627	register struct mkdir_args /* {
3628		char *path;
3629		int mode;
3630	} */ *uap;
3631{
3632
3633	return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
3634}
3635
3636#ifndef _SYS_SYSPROTO_H_
3637struct mkdirat_args {
3638	int	fd;
3639	char	*path;
3640	mode_t	mode;
3641};
3642#endif
3643int
3644sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3645{
3646
3647	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3648}
3649
3650int
3651kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
3652{
3653
3654	return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
3655}
3656
/*
 * Create a directory at path, looked up relative to fd (which must carry
 * CAP_MKDIRAT).  The new directory's mode is (mode & ACCESSPERMS) with the
 * bits in the process's fd_cmask cleared.
 *
 * Returns EEXIST if the name already exists, otherwise the error from the
 * lookup, the wait for write access, the MAC check or VOP_MKDIR().
 */
int
kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
    int mode)
{
	struct mount *mp;
	struct vnode *vp;
	struct vattr vattr;
	struct nameidata nd;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_MODE(mode);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1,
	    segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT), td);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* The name already exists: release everything and fail. */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		/*
		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
		 * the strange behaviour of leaving the vnode unlocked
		 * if the target is the same vnode as the parent.
		 */
		if (vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		return (EEXIST);
	}
	/*
	 * Try to start the write without sleeping while the parent is
	 * locked.  If that fails, drop the parent, wait with V_XSLEEP and
	 * redo the lookup from scratch.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VDIR;
	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error != 0)
		goto out;
#endif
	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	/* On success the new directory vnode in nd.ni_vp is released too. */
	if (error == 0)
		vput(nd.ni_vp);
	vn_finished_write(mp);
	return (error);
}
3718
3719/*
3720 * Remove a directory file.
3721 */
3722#ifndef _SYS_SYSPROTO_H_
3723struct rmdir_args {
3724	char	*path;
3725};
3726#endif
3727int
3728sys_rmdir(td, uap)
3729	struct thread *td;
3730	struct rmdir_args /* {
3731		char *path;
3732	} */ *uap;
3733{
3734
3735	return (kern_rmdir(td, uap->path, UIO_USERSPACE));
3736}
3737
3738int
3739kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
3740{
3741
3742	return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
3743}
3744
/*
 * Remove the directory named by path, looked up relative to fd (which must
 * carry CAP_UNLINKAT).
 *
 * Returns ENOTDIR if the vnode is not a directory, EINVAL if it is the
 * same vnode as its parent (rmdir "."), EBUSY if it is the root of a
 * mounted filesystem, otherwise the error from the lookup, the MAC check,
 * the wait for write access or VOP_RMDIR().
 */
int
kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
{
	struct mount *mp;
	struct vnode *vp;
	struct nameidata nd;
	cap_rights_t rights;
	int error;

restart:
	bwillwrite();
	/* Both the parent and the directory to remove are requested locked. */
	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}
	/*
	 * No rmdir "." please.
	 */
	if (nd.ni_dvp == vp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * The root of a mounted filesystem cannot be deleted.
	 */
	if (vp->v_vflag & VV_ROOT) {
		error = EBUSY;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
	    &nd.ni_cnd);
	if (error != 0)
		goto out;
#endif
	/*
	 * Try to start the write without sleeping while the vnodes are
	 * locked.  If that fails, release both vnodes, wait with V_XSLEEP
	 * and redo the lookup from scratch.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(vp);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
	vn_finished_write(mp);
out:
	/* Common exit: release the directory and its parent. */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(vp);
	if (nd.ni_dvp == vp)
		vrele(nd.ni_dvp);
	else
		vput(nd.ni_dvp);
	return (error);
}
3808
3809#ifdef COMPAT_43
3810/*
3811 * Read a block of directory entries in a filesystem independent format.
3812 */
3813#ifndef _SYS_SYSPROTO_H_
3814struct ogetdirentries_args {
3815	int	fd;
3816	char	*buf;
3817	u_int	count;
3818	long	*basep;
3819};
3820#endif
3821int
3822ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3823{
3824	long loff;
3825	int error;
3826
3827	error = kern_ogetdirentries(td, uap, &loff);
3828	if (error == 0)
3829		error = copyout(&loff, uap->basep, sizeof(long));
3830	return (error);
3831}
3832
3833int
3834kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
3835    long *ploff)
3836{
3837	struct vnode *vp;
3838	struct file *fp;
3839	struct uio auio, kuio;
3840	struct iovec aiov, kiov;
3841	struct dirent *dp, *edp;
3842	cap_rights_t rights;
3843	caddr_t dirbuf;
3844	int error, eofflag, readcnt;
3845	long loff;
3846	off_t foffset;
3847
3848	/* XXX arbitrary sanity limit on `count'. */
3849	if (uap->count > 64 * 1024)
3850		return (EINVAL);
3851	error = getvnode(td->td_proc->p_fd, uap->fd,
3852	    cap_rights_init(&rights, CAP_READ), &fp);
3853	if (error != 0)
3854		return (error);
3855	if ((fp->f_flag & FREAD) == 0) {
3856		fdrop(fp, td);
3857		return (EBADF);
3858	}
3859	vp = fp->f_vnode;
3860	foffset = foffset_lock(fp, 0);
3861unionread:
3862	if (vp->v_type != VDIR) {
3863		foffset_unlock(fp, foffset, 0);
3864		fdrop(fp, td);
3865		return (EINVAL);
3866	}
3867	aiov.iov_base = uap->buf;
3868	aiov.iov_len = uap->count;
3869	auio.uio_iov = &aiov;
3870	auio.uio_iovcnt = 1;
3871	auio.uio_rw = UIO_READ;
3872	auio.uio_segflg = UIO_USERSPACE;
3873	auio.uio_td = td;
3874	auio.uio_resid = uap->count;
3875	vn_lock(vp, LK_SHARED | LK_RETRY);
3876	loff = auio.uio_offset = foffset;
3877#ifdef MAC
3878	error = mac_vnode_check_readdir(td->td_ucred, vp);
3879	if (error != 0) {
3880		VOP_UNLOCK(vp, 0);
3881		foffset_unlock(fp, foffset, FOF_NOUPDATE);
3882		fdrop(fp, td);
3883		return (error);
3884	}
3885#endif
3886#	if (BYTE_ORDER != LITTLE_ENDIAN)
3887		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
3888			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
3889			    NULL, NULL);
3890			foffset = auio.uio_offset;
3891		} else
3892#	endif
3893	{
3894		kuio = auio;
3895		kuio.uio_iov = &kiov;
3896		kuio.uio_segflg = UIO_SYSSPACE;
3897		kiov.iov_len = uap->count;
3898		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
3899		kiov.iov_base = dirbuf;
3900		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
3901			    NULL, NULL);
3902		foffset = kuio.uio_offset;
3903		if (error == 0) {
3904			readcnt = uap->count - kuio.uio_resid;
3905			edp = (struct dirent *)&dirbuf[readcnt];
3906			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
3907#				if (BYTE_ORDER == LITTLE_ENDIAN)
3908					/*
3909					 * The expected low byte of
3910					 * dp->d_namlen is our dp->d_type.
3911					 * The high MBZ byte of dp->d_namlen
3912					 * is our dp->d_namlen.
3913					 */
3914					dp->d_type = dp->d_namlen;
3915					dp->d_namlen = 0;
3916#				else
3917					/*
3918					 * The dp->d_type is the high byte
3919					 * of the expected dp->d_namlen,
3920					 * so must be zero'ed.
3921					 */
3922					dp->d_type = 0;
3923#				endif
3924				if (dp->d_reclen > 0) {
3925					dp = (struct dirent *)
3926					    ((char *)dp + dp->d_reclen);
3927				} else {
3928					error = EIO;
3929					break;
3930				}
3931			}
3932			if (dp >= edp)
3933				error = uiomove(dirbuf, readcnt, &auio);
3934		}
3935		free(dirbuf, M_TEMP);
3936	}
3937	if (error != 0) {
3938		VOP_UNLOCK(vp, 0);
3939		foffset_unlock(fp, foffset, 0);
3940		fdrop(fp, td);
3941		return (error);
3942	}
3943	if (uap->count == auio.uio_resid &&
3944	    (vp->v_vflag & VV_ROOT) &&
3945	    (vp->v_mount->mnt_flag & MNT_UNION)) {
3946		struct vnode *tvp = vp;
3947		vp = vp->v_mount->mnt_vnodecovered;
3948		VREF(vp);
3949		fp->f_vnode = vp;
3950		fp->f_data = vp;
3951		foffset = 0;
3952		vput(tvp);
3953		goto unionread;
3954	}
3955	VOP_UNLOCK(vp, 0);
3956	foffset_unlock(fp, foffset, 0);
3957	fdrop(fp, td);
3958	td->td_retval[0] = uap->count - auio.uio_resid;
3959	if (error == 0)
3960		*ploff = loff;
3961	return (error);
3962}
3963#endif /* COMPAT_43 */
3964
3965/*
3966 * Read a block of directory entries in a filesystem independent format.
3967 */
3968#ifndef _SYS_SYSPROTO_H_
3969struct getdirentries_args {
3970	int	fd;
3971	char	*buf;
3972	u_int	count;
3973	long	*basep;
3974};
3975#endif
3976int
3977sys_getdirentries(td, uap)
3978	struct thread *td;
3979	register struct getdirentries_args /* {
3980		int fd;
3981		char *buf;
3982		u_int count;
3983		long *basep;
3984	} */ *uap;
3985{
3986	long base;
3987	int error;
3988
3989	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
3990	    NULL, UIO_USERSPACE);
3991	if (error != 0)
3992		return (error);
3993	if (uap->basep != NULL)
3994		error = copyout(&base, uap->basep, sizeof(long));
3995	return (error);
3996}
3997
3998int
3999kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
4000    long *basep, ssize_t *residp, enum uio_seg bufseg)
4001{
4002	struct vnode *vp;
4003	struct file *fp;
4004	struct uio auio;
4005	struct iovec aiov;
4006	cap_rights_t rights;
4007	long loff;
4008	int error, eofflag;
4009	off_t foffset;
4010
4011	AUDIT_ARG_FD(fd);
4012	if (count > IOSIZE_MAX)
4013		return (EINVAL);
4014	auio.uio_resid = count;
4015	error = getvnode(td->td_proc->p_fd, fd,
4016	    cap_rights_init(&rights, CAP_READ), &fp);
4017	if (error != 0)
4018		return (error);
4019	if ((fp->f_flag & FREAD) == 0) {
4020		fdrop(fp, td);
4021		return (EBADF);
4022	}
4023	vp = fp->f_vnode;
4024	foffset = foffset_lock(fp, 0);
4025unionread:
4026	if (vp->v_type != VDIR) {
4027		error = EINVAL;
4028		goto fail;
4029	}
4030	aiov.iov_base = buf;
4031	aiov.iov_len = count;
4032	auio.uio_iov = &aiov;
4033	auio.uio_iovcnt = 1;
4034	auio.uio_rw = UIO_READ;
4035	auio.uio_segflg = bufseg;
4036	auio.uio_td = td;
4037	vn_lock(vp, LK_SHARED | LK_RETRY);
4038	AUDIT_ARG_VNODE1(vp);
4039	loff = auio.uio_offset = foffset;
4040#ifdef MAC
4041	error = mac_vnode_check_readdir(td->td_ucred, vp);
4042	if (error == 0)
4043#endif
4044		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
4045		    NULL);
4046	foffset = auio.uio_offset;
4047	if (error != 0) {
4048		VOP_UNLOCK(vp, 0);
4049		goto fail;
4050	}
4051	if (count == auio.uio_resid &&
4052	    (vp->v_vflag & VV_ROOT) &&
4053	    (vp->v_mount->mnt_flag & MNT_UNION)) {
4054		struct vnode *tvp = vp;
4055
4056		vp = vp->v_mount->mnt_vnodecovered;
4057		VREF(vp);
4058		fp->f_vnode = vp;
4059		fp->f_data = vp;
4060		foffset = 0;
4061		vput(tvp);
4062		goto unionread;
4063	}
4064	VOP_UNLOCK(vp, 0);
4065	*basep = loff;
4066	if (residp != NULL)
4067		*residp = auio.uio_resid;
4068	td->td_retval[0] = count - auio.uio_resid;
4069fail:
4070	foffset_unlock(fp, foffset, 0);
4071	fdrop(fp, td);
4072	return (error);
4073}
4074
4075#ifndef _SYS_SYSPROTO_H_
4076struct getdents_args {
4077	int fd;
4078	char *buf;
4079	size_t count;
4080};
4081#endif
4082int
4083sys_getdents(td, uap)
4084	struct thread *td;
4085	register struct getdents_args /* {
4086		int fd;
4087		char *buf;
4088		u_int count;
4089	} */ *uap;
4090{
4091	struct getdirentries_args ap;
4092
4093	ap.fd = uap->fd;
4094	ap.buf = uap->buf;
4095	ap.count = uap->count;
4096	ap.basep = NULL;
4097	return (sys_getdirentries(td, &ap));
4098}
4099
4100/*
4101 * Set the mode mask for creation of filesystem nodes.
4102 */
4103#ifndef _SYS_SYSPROTO_H_
4104struct umask_args {
4105	int	newmask;
4106};
4107#endif
4108int
4109sys_umask(td, uap)
4110	struct thread *td;
4111	struct umask_args /* {
4112		int newmask;
4113	} */ *uap;
4114{
4115	register struct filedesc *fdp;
4116
4117	FILEDESC_XLOCK(td->td_proc->p_fd);
4118	fdp = td->td_proc->p_fd;
4119	td->td_retval[0] = fdp->fd_cmask;
4120	fdp->fd_cmask = uap->newmask & ALLPERMS;
4121	FILEDESC_XUNLOCK(td->td_proc->p_fd);
4122	return (0);
4123}
4124
4125/*
4126 * Void all references to file by ripping underlying filesystem away from
4127 * vnode.
4128 */
4129#ifndef _SYS_SYSPROTO_H_
4130struct revoke_args {
4131	char	*path;
4132};
4133#endif
4134int
4135sys_revoke(td, uap)
4136	struct thread *td;
4137	register struct revoke_args /* {
4138		char *path;
4139	} */ *uap;
4140{
4141	struct vnode *vp;
4142	struct vattr vattr;
4143	struct nameidata nd;
4144	int error;
4145
4146	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4147	    uap->path, td);
4148	if ((error = namei(&nd)) != 0)
4149		return (error);
4150	vp = nd.ni_vp;
4151	NDFREE(&nd, NDF_ONLY_PNBUF);
4152	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4153		error = EINVAL;
4154		goto out;
4155	}
4156#ifdef MAC
4157	error = mac_vnode_check_revoke(td->td_ucred, vp);
4158	if (error != 0)
4159		goto out;
4160#endif
4161	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4162	if (error != 0)
4163		goto out;
4164	if (td->td_ucred->cr_uid != vattr.va_uid) {
4165		error = priv_check(td, PRIV_VFS_ADMIN);
4166		if (error != 0)
4167			goto out;
4168	}
4169	if (vcount(vp) > 1)
4170		VOP_REVOKE(vp, REVOKEALL);
4171out:
4172	vput(vp);
4173	return (error);
4174}
4175
4176/*
4177 * Convert a user file descriptor to a kernel file entry and check that, if it
4178 * is a capability, the correct rights are present. A reference on the file
4179 * entry is held upon returning.
4180 */
4181int
4182getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
4183{
4184	struct file *fp;
4185	int error;
4186
4187	error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
4188	if (error != 0)
4189		return (error);
4190
4191	/*
4192	 * The file could be not of the vnode type, or it may be not
4193	 * yet fully initialized, in which case the f_vnode pointer
4194	 * may be set, but f_ops is still badfileops.  E.g.,
4195	 * devfs_open() transiently create such situation to
4196	 * facilitate csw d_fdopen().
4197	 *
4198	 * Dupfdopen() handling in kern_openat() installs the
4199	 * half-baked file into the process descriptor table, allowing
4200	 * other thread to dereference it. Guard against the race by
4201	 * checking f_ops.
4202	 */
4203	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4204		fdrop(fp, curthread);
4205		return (EINVAL);
4206	}
4207	*fpp = fp;
4208	return (0);
4209}
4210
4211
4212/*
4213 * Get an (NFS) file handle.
4214 */
4215#ifndef _SYS_SYSPROTO_H_
4216struct lgetfh_args {
4217	char	*fname;
4218	fhandle_t *fhp;
4219};
4220#endif
4221int
4222sys_lgetfh(td, uap)
4223	struct thread *td;
4224	register struct lgetfh_args *uap;
4225{
4226	struct nameidata nd;
4227	fhandle_t fh;
4228	register struct vnode *vp;
4229	int error;
4230
4231	error = priv_check(td, PRIV_VFS_GETFH);
4232	if (error != 0)
4233		return (error);
4234	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4235	    uap->fname, td);
4236	error = namei(&nd);
4237	if (error != 0)
4238		return (error);
4239	NDFREE(&nd, NDF_ONLY_PNBUF);
4240	vp = nd.ni_vp;
4241	bzero(&fh, sizeof(fh));
4242	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4243	error = VOP_VPTOFH(vp, &fh.fh_fid);
4244	vput(vp);
4245	if (error == 0)
4246		error = copyout(&fh, uap->fhp, sizeof (fh));
4247	return (error);
4248}
4249
4250#ifndef _SYS_SYSPROTO_H_
4251struct getfh_args {
4252	char	*fname;
4253	fhandle_t *fhp;
4254};
4255#endif
4256int
4257sys_getfh(td, uap)
4258	struct thread *td;
4259	register struct getfh_args *uap;
4260{
4261	struct nameidata nd;
4262	fhandle_t fh;
4263	register struct vnode *vp;
4264	int error;
4265
4266	error = priv_check(td, PRIV_VFS_GETFH);
4267	if (error != 0)
4268		return (error);
4269	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4270	    uap->fname, td);
4271	error = namei(&nd);
4272	if (error != 0)
4273		return (error);
4274	NDFREE(&nd, NDF_ONLY_PNBUF);
4275	vp = nd.ni_vp;
4276	bzero(&fh, sizeof(fh));
4277	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4278	error = VOP_VPTOFH(vp, &fh.fh_fid);
4279	vput(vp);
4280	if (error == 0)
4281		error = copyout(&fh, uap->fhp, sizeof (fh));
4282	return (error);
4283}
4284
4285/*
4286 * syscall for the rpc.lockd to use to translate a NFS file handle into an
4287 * open descriptor.
4288 *
4289 * warning: do not remove the priv_check() call or this becomes one giant
4290 * security hole.
4291 */
4292#ifndef _SYS_SYSPROTO_H_
4293struct fhopen_args {
4294	const struct fhandle *u_fhp;
4295	int flags;
4296};
4297#endif
4298int
4299sys_fhopen(td, uap)
4300	struct thread *td;
4301	struct fhopen_args /* {
4302		const struct fhandle *u_fhp;
4303		int flags;
4304	} */ *uap;
4305{
4306	struct mount *mp;
4307	struct vnode *vp;
4308	struct fhandle fhp;
4309	struct file *fp;
4310	int fmode, error;
4311	int indx;
4312
4313	error = priv_check(td, PRIV_VFS_FHOPEN);
4314	if (error != 0)
4315		return (error);
4316	indx = -1;
4317	fmode = FFLAGS(uap->flags);
4318	/* why not allow a non-read/write open for our lockd? */
4319	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4320		return (EINVAL);
4321	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4322	if (error != 0)
4323		return(error);
4324	/* find the mount point */
4325	mp = vfs_busyfs(&fhp.fh_fsid);
4326	if (mp == NULL)
4327		return (ESTALE);
4328	/* now give me my vnode, it gets returned to me locked */
4329	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4330	vfs_unbusy(mp);
4331	if (error != 0)
4332		return (error);
4333
4334	error = falloc_noinstall(td, &fp);
4335	if (error != 0) {
4336		vput(vp);
4337		return (error);
4338	}
4339	/*
4340	 * An extra reference on `fp' has been held for us by
4341	 * falloc_noinstall().
4342	 */
4343
4344#ifdef INVARIANTS
4345	td->td_dupfd = -1;
4346#endif
4347	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4348	if (error != 0) {
4349		KASSERT(fp->f_ops == &badfileops,
4350		    ("VOP_OPEN in fhopen() set f_ops"));
4351		KASSERT(td->td_dupfd < 0,
4352		    ("fhopen() encountered fdopen()"));
4353
4354		vput(vp);
4355		goto bad;
4356	}
4357#ifdef INVARIANTS
4358	td->td_dupfd = 0;
4359#endif
4360	fp->f_vnode = vp;
4361	fp->f_seqcount = 1;
4362	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4363	    &vnops);
4364	VOP_UNLOCK(vp, 0);
4365	if ((fmode & O_TRUNC) != 0) {
4366		error = fo_truncate(fp, 0, td->td_ucred, td);
4367		if (error != 0)
4368			goto bad;
4369	}
4370
4371	error = finstall(td, fp, &indx, fmode, NULL);
4372bad:
4373	fdrop(fp, td);
4374	td->td_retval[0] = indx;
4375	return (error);
4376}
4377
4378/*
4379 * Stat an (NFS) file handle.
4380 */
4381#ifndef _SYS_SYSPROTO_H_
4382struct fhstat_args {
4383	struct fhandle *u_fhp;
4384	struct stat *sb;
4385};
4386#endif
4387int
4388sys_fhstat(td, uap)
4389	struct thread *td;
4390	register struct fhstat_args /* {
4391		struct fhandle *u_fhp;
4392		struct stat *sb;
4393	} */ *uap;
4394{
4395	struct stat sb;
4396	struct fhandle fh;
4397	int error;
4398
4399	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4400	if (error != 0)
4401		return (error);
4402	error = kern_fhstat(td, fh, &sb);
4403	if (error == 0)
4404		error = copyout(&sb, uap->sb, sizeof(sb));
4405	return (error);
4406}
4407
4408int
4409kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4410{
4411	struct mount *mp;
4412	struct vnode *vp;
4413	int error;
4414
4415	error = priv_check(td, PRIV_VFS_FHSTAT);
4416	if (error != 0)
4417		return (error);
4418	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4419		return (ESTALE);
4420	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4421	vfs_unbusy(mp);
4422	if (error != 0)
4423		return (error);
4424	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4425	vput(vp);
4426	return (error);
4427}
4428
4429/*
4430 * Implement fstatfs() for (NFS) file handles.
4431 */
4432#ifndef _SYS_SYSPROTO_H_
4433struct fhstatfs_args {
4434	struct fhandle *u_fhp;
4435	struct statfs *buf;
4436};
4437#endif
4438int
4439sys_fhstatfs(td, uap)
4440	struct thread *td;
4441	struct fhstatfs_args /* {
4442		struct fhandle *u_fhp;
4443		struct statfs *buf;
4444	} */ *uap;
4445{
4446	struct statfs sf;
4447	fhandle_t fh;
4448	int error;
4449
4450	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4451	if (error != 0)
4452		return (error);
4453	error = kern_fhstatfs(td, fh, &sf);
4454	if (error != 0)
4455		return (error);
4456	return (copyout(&sf, uap->buf, sizeof(sf)));
4457}
4458
4459int
4460kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4461{
4462	struct statfs *sp;
4463	struct mount *mp;
4464	struct vnode *vp;
4465	int error;
4466
4467	error = priv_check(td, PRIV_VFS_FHSTATFS);
4468	if (error != 0)
4469		return (error);
4470	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4471		return (ESTALE);
4472	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4473	if (error != 0) {
4474		vfs_unbusy(mp);
4475		return (error);
4476	}
4477	vput(vp);
4478	error = prison_canseemount(td->td_ucred, mp);
4479	if (error != 0)
4480		goto out;
4481#ifdef MAC
4482	error = mac_mount_check_stat(td->td_ucred, mp);
4483	if (error != 0)
4484		goto out;
4485#endif
4486	/*
4487	 * Set these in case the underlying filesystem fails to do so.
4488	 */
4489	sp = &mp->mnt_stat;
4490	sp->f_version = STATFS_VERSION;
4491	sp->f_namemax = NAME_MAX;
4492	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4493	error = VFS_STATFS(mp, sp);
4494	if (error == 0)
4495		*buf = *sp;
4496out:
4497	vfs_unbusy(mp);
4498	return (error);
4499}
4500
4501int
4502kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4503{
4504	struct file *fp;
4505	struct mount *mp;
4506	struct vnode *vp;
4507	cap_rights_t rights;
4508	off_t olen, ooffset;
4509	int error;
4510
4511	fp = NULL;
4512	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4513	if (error != 0)
4514		goto out;
4515
4516	switch (fp->f_type) {
4517	case DTYPE_VNODE:
4518		break;
4519	case DTYPE_PIPE:
4520	case DTYPE_FIFO:
4521		error = ESPIPE;
4522		goto out;
4523	default:
4524		error = ENODEV;
4525		goto out;
4526	}
4527	if ((fp->f_flag & FWRITE) == 0) {
4528		error = EBADF;
4529		goto out;
4530	}
4531	vp = fp->f_vnode;
4532	if (vp->v_type != VREG) {
4533		error = ENODEV;
4534		goto out;
4535	}
4536	if (offset < 0 || len <= 0) {
4537		error = EINVAL;
4538		goto out;
4539	}
4540	/* Check for wrap. */
4541	if (offset > OFF_MAX - len) {
4542		error = EFBIG;
4543		goto out;
4544	}
4545
4546	/* Allocating blocks may take a long time, so iterate. */
4547	for (;;) {
4548		olen = len;
4549		ooffset = offset;
4550
4551		bwillwrite();
4552		mp = NULL;
4553		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4554		if (error != 0)
4555			break;
4556		error = vn_lock(vp, LK_EXCLUSIVE);
4557		if (error != 0) {
4558			vn_finished_write(mp);
4559			break;
4560		}
4561#ifdef MAC
4562		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4563		if (error == 0)
4564#endif
4565			error = VOP_ALLOCATE(vp, &offset, &len);
4566		VOP_UNLOCK(vp, 0);
4567		vn_finished_write(mp);
4568
4569		if (olen + ooffset != offset + len) {
4570			panic("offset + len changed from %jx/%jx to %jx/%jx",
4571			    ooffset, olen, offset, len);
4572		}
4573		if (error != 0 || len == 0)
4574			break;
4575		KASSERT(olen > len, ("Iteration did not make progress?"));
4576		maybe_yield();
4577	}
4578 out:
4579	if (fp != NULL)
4580		fdrop(fp, td);
4581	return (error);
4582}
4583
4584int
4585sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4586{
4587
4588	td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
4589	    uap->len);
4590	return (0);
4591}
4592
4593/*
4594 * Unlike madvise(2), we do not make a best effort to remember every
4595 * possible caching hint.  Instead, we remember the last setting with
4596 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4597 * region of any current setting.
4598 */
4599int
4600kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4601    int advice)
4602{
4603	struct fadvise_info *fa, *new;
4604	struct file *fp;
4605	struct vnode *vp;
4606	cap_rights_t rights;
4607	off_t end;
4608	int error;
4609
4610	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4611		return (EINVAL);
4612	switch (advice) {
4613	case POSIX_FADV_SEQUENTIAL:
4614	case POSIX_FADV_RANDOM:
4615	case POSIX_FADV_NOREUSE:
4616		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4617		break;
4618	case POSIX_FADV_NORMAL:
4619	case POSIX_FADV_WILLNEED:
4620	case POSIX_FADV_DONTNEED:
4621		new = NULL;
4622		break;
4623	default:
4624		return (EINVAL);
4625	}
4626	/* XXX: CAP_POSIX_FADVISE? */
4627	error = fget(td, fd, cap_rights_init(&rights), &fp);
4628	if (error != 0)
4629		goto out;
4630
4631	switch (fp->f_type) {
4632	case DTYPE_VNODE:
4633		break;
4634	case DTYPE_PIPE:
4635	case DTYPE_FIFO:
4636		error = ESPIPE;
4637		goto out;
4638	default:
4639		error = ENODEV;
4640		goto out;
4641	}
4642	vp = fp->f_vnode;
4643	if (vp->v_type != VREG) {
4644		error = ENODEV;
4645		goto out;
4646	}
4647	if (len == 0)
4648		end = OFF_MAX;
4649	else
4650		end = offset + len - 1;
4651	switch (advice) {
4652	case POSIX_FADV_SEQUENTIAL:
4653	case POSIX_FADV_RANDOM:
4654	case POSIX_FADV_NOREUSE:
4655		/*
4656		 * Try to merge any existing non-standard region with
4657		 * this new region if possible, otherwise create a new
4658		 * non-standard region for this request.
4659		 */
4660		mtx_pool_lock(mtxpool_sleep, fp);
4661		fa = fp->f_advice;
4662		if (fa != NULL && fa->fa_advice == advice &&
4663		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4664		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4665		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4666			if (offset < fa->fa_start)
4667				fa->fa_start = offset;
4668			if (end > fa->fa_end)
4669				fa->fa_end = end;
4670		} else {
4671			new->fa_advice = advice;
4672			new->fa_start = offset;
4673			new->fa_end = end;
4674			new->fa_prevstart = 0;
4675			new->fa_prevend = 0;
4676			fp->f_advice = new;
4677			new = fa;
4678		}
4679		mtx_pool_unlock(mtxpool_sleep, fp);
4680		break;
4681	case POSIX_FADV_NORMAL:
4682		/*
4683		 * If a the "normal" region overlaps with an existing
4684		 * non-standard region, trim or remove the
4685		 * non-standard region.
4686		 */
4687		mtx_pool_lock(mtxpool_sleep, fp);
4688		fa = fp->f_advice;
4689		if (fa != NULL) {
4690			if (offset <= fa->fa_start && end >= fa->fa_end) {
4691				new = fa;
4692				fp->f_advice = NULL;
4693			} else if (offset <= fa->fa_start &&
4694			    end >= fa->fa_start)
4695				fa->fa_start = end + 1;
4696			else if (offset <= fa->fa_end && end >= fa->fa_end)
4697				fa->fa_end = offset - 1;
4698			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4699				/*
4700				 * If the "normal" region is a middle
4701				 * portion of the existing
4702				 * non-standard region, just remove
4703				 * the whole thing rather than picking
4704				 * one side or the other to
4705				 * preserve.
4706				 */
4707				new = fa;
4708				fp->f_advice = NULL;
4709			}
4710		}
4711		mtx_pool_unlock(mtxpool_sleep, fp);
4712		break;
4713	case POSIX_FADV_WILLNEED:
4714	case POSIX_FADV_DONTNEED:
4715		error = VOP_ADVISE(vp, offset, end, advice);
4716		break;
4717	}
4718out:
4719	if (fp != NULL)
4720		fdrop(fp, td);
4721	free(new, M_FADVISE);
4722	return (error);
4723}
4724
4725int
4726sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4727{
4728
4729	td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
4730	    uap->len, uap->advice);
4731	return (0);
4732}
4733