vfs_syscalls.c revision 330897
1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD: stable/11/sys/kern/vfs_syscalls.c 330897 2018-03-14 03:19:51Z eadler $");
41
42#include "opt_capsicum.h"
43#include "opt_compat.h"
44#include "opt_ktrace.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/bio.h>
49#include <sys/buf.h>
50#include <sys/capsicum.h>
51#include <sys/disk.h>
52#include <sys/sysent.h>
53#include <sys/malloc.h>
54#include <sys/mount.h>
55#include <sys/mutex.h>
56#include <sys/sysproto.h>
57#include <sys/namei.h>
58#include <sys/filedesc.h>
59#include <sys/kernel.h>
60#include <sys/fcntl.h>
61#include <sys/file.h>
62#include <sys/filio.h>
63#include <sys/limits.h>
64#include <sys/linker.h>
65#include <sys/rwlock.h>
66#include <sys/sdt.h>
67#include <sys/stat.h>
68#include <sys/sx.h>
69#include <sys/unistd.h>
70#include <sys/vnode.h>
71#include <sys/priv.h>
72#include <sys/proc.h>
73#include <sys/dirent.h>
74#include <sys/jail.h>
75#include <sys/syscallsubr.h>
76#include <sys/sysctl.h>
77#ifdef KTRACE
78#include <sys/ktrace.h>
79#endif
80
81#include <machine/stdarg.h>
82
83#include <security/audit/audit.h>
84#include <security/mac/mac_framework.h>
85
86#include <vm/vm.h>
87#include <vm/vm_object.h>
88#include <vm/vm_page.h>
89#include <vm/uma.h>
90
91#include <ufs/ufs/quota.h>
92
93MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
94
95SDT_PROVIDER_DEFINE(vfs);
96SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
97SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
98
99static int kern_chflagsat(struct thread *td, int fd, const char *path,
100    enum uio_seg pathseg, u_long flags, int atflag);
101static int setfflags(struct thread *td, struct vnode *, u_long);
102static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
103static int getutimens(const struct timespec *, enum uio_seg,
104    struct timespec *, int *);
105static int setutimes(struct thread *td, struct vnode *,
106    const struct timespec *, int, int);
107static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
108    struct thread *td);
109
110/*
111 * Sync each mounted filesystem.
112 */
113#ifndef _SYS_SYSPROTO_H_
114struct sync_args {
115	int     dummy;
116};
117#endif
118/* ARGSUSED */
119int
120sys_sync(td, uap)
121	struct thread *td;
122	struct sync_args *uap;
123{
124	struct mount *mp, *nmp;
125	int save;
126
127	mtx_lock(&mountlist_mtx);
128	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
129		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
130			nmp = TAILQ_NEXT(mp, mnt_list);
131			continue;
132		}
133		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
134		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
135			save = curthread_pflags_set(TDP_SYNCIO);
136			vfs_msync(mp, MNT_NOWAIT);
137			VFS_SYNC(mp, MNT_NOWAIT);
138			curthread_pflags_restore(save);
139			vn_finished_write(mp);
140		}
141		mtx_lock(&mountlist_mtx);
142		nmp = TAILQ_NEXT(mp, mnt_list);
143		vfs_unbusy(mp);
144	}
145	mtx_unlock(&mountlist_mtx);
146	return (0);
147}
148
149/*
150 * Change filesystem quotas.
151 */
152#ifndef _SYS_SYSPROTO_H_
153struct quotactl_args {
154	char *path;
155	int cmd;
156	int uid;
157	caddr_t arg;
158};
159#endif
160int
161sys_quotactl(td, uap)
162	struct thread *td;
163	register struct quotactl_args /* {
164		char *path;
165		int cmd;
166		int uid;
167		caddr_t arg;
168	} */ *uap;
169{
170	struct mount *mp;
171	struct nameidata nd;
172	int error;
173
174	AUDIT_ARG_CMD(uap->cmd);
175	AUDIT_ARG_UID(uap->uid);
176	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
177		return (EPERM);
178	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
179	    uap->path, td);
180	if ((error = namei(&nd)) != 0)
181		return (error);
182	NDFREE(&nd, NDF_ONLY_PNBUF);
183	mp = nd.ni_vp->v_mount;
184	vfs_ref(mp);
185	vput(nd.ni_vp);
186	error = vfs_busy(mp, 0);
187	vfs_rel(mp);
188	if (error != 0)
189		return (error);
190	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
191
192	/*
193	 * Since quota on operation typically needs to open quota
194	 * file, the Q_QUOTAON handler needs to unbusy the mount point
195	 * before calling into namei.  Otherwise, unmount might be
196	 * started between two vfs_busy() invocations (first is our,
197	 * second is from mount point cross-walk code in lookup()),
198	 * causing deadlock.
199	 *
200	 * Require that Q_QUOTAON handles the vfs_busy() reference on
201	 * its own, always returning with ubusied mount point.
202	 */
203	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
204		vfs_unbusy(mp);
205	return (error);
206}
207
208/*
209 * Used by statfs conversion routines to scale the block size up if
210 * necessary so that all of the block counts are <= 'max_size'.  Note
211 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
212 * value of 'n'.
213 */
214void
215statfs_scale_blocks(struct statfs *sf, long max_size)
216{
217	uint64_t count;
218	int shift;
219
220	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
221
222	/*
223	 * Attempt to scale the block counts to give a more accurate
224	 * overview to userland of the ratio of free space to used
225	 * space.  To do this, find the largest block count and compute
226	 * a divisor that lets it fit into a signed integer <= max_size.
227	 */
228	if (sf->f_bavail < 0)
229		count = -sf->f_bavail;
230	else
231		count = sf->f_bavail;
232	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
233	if (count <= max_size)
234		return;
235
236	count >>= flsl(max_size);
237	shift = 0;
238	while (count > 0) {
239		shift++;
240		count >>=1;
241	}
242
243	sf->f_bsize <<= shift;
244	sf->f_blocks >>= shift;
245	sf->f_bfree >>= shift;
246	sf->f_bavail >>= shift;
247}
248
249static int
250kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
251{
252	struct statfs *sp;
253	int error;
254
255	if (mp == NULL)
256		return (EBADF);
257	error = vfs_busy(mp, 0);
258	vfs_rel(mp);
259	if (error != 0)
260		return (error);
261#ifdef MAC
262	error = mac_mount_check_stat(td->td_ucred, mp);
263	if (error != 0)
264		goto out;
265#endif
266	/*
267	 * Set these in case the underlying filesystem fails to do so.
268	 */
269	sp = &mp->mnt_stat;
270	sp->f_version = STATFS_VERSION;
271	sp->f_namemax = NAME_MAX;
272	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
273	error = VFS_STATFS(mp, sp);
274	if (error != 0)
275		goto out;
276	*buf = *sp;
277	if (priv_check(td, PRIV_VFS_GENERATION)) {
278		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
279		prison_enforce_statfs(td->td_ucred, mp, buf);
280	}
281out:
282	vfs_unbusy(mp);
283	return (error);
284}
285
286/*
287 * Get filesystem statistics.
288 */
289#ifndef _SYS_SYSPROTO_H_
290struct statfs_args {
291	char *path;
292	struct statfs *buf;
293};
294#endif
295int
296sys_statfs(td, uap)
297	struct thread *td;
298	register struct statfs_args /* {
299		char *path;
300		struct statfs *buf;
301	} */ *uap;
302{
303	struct statfs *sfp;
304	int error;
305
306	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
307	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
308	if (error == 0)
309		error = copyout(sfp, uap->buf, sizeof(struct statfs));
310	free(sfp, M_STATFS);
311	return (error);
312}
313
314int
315kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
316    struct statfs *buf)
317{
318	struct mount *mp;
319	struct nameidata nd;
320	int error;
321
322	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
323	    pathseg, path, td);
324	error = namei(&nd);
325	if (error != 0)
326		return (error);
327	mp = nd.ni_vp->v_mount;
328	vfs_ref(mp);
329	NDFREE(&nd, NDF_ONLY_PNBUF);
330	vput(nd.ni_vp);
331	return (kern_do_statfs(td, mp, buf));
332}
333
334/*
335 * Get filesystem statistics.
336 */
337#ifndef _SYS_SYSPROTO_H_
338struct fstatfs_args {
339	int fd;
340	struct statfs *buf;
341};
342#endif
343int
344sys_fstatfs(td, uap)
345	struct thread *td;
346	register struct fstatfs_args /* {
347		int fd;
348		struct statfs *buf;
349	} */ *uap;
350{
351	struct statfs *sfp;
352	int error;
353
354	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
355	error = kern_fstatfs(td, uap->fd, sfp);
356	if (error == 0)
357		error = copyout(sfp, uap->buf, sizeof(struct statfs));
358	free(sfp, M_STATFS);
359	return (error);
360}
361
362int
363kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
364{
365	struct file *fp;
366	struct mount *mp;
367	struct vnode *vp;
368	cap_rights_t rights;
369	int error;
370
371	AUDIT_ARG_FD(fd);
372	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSTATFS), &fp);
373	if (error != 0)
374		return (error);
375	vp = fp->f_vnode;
376	vn_lock(vp, LK_SHARED | LK_RETRY);
377#ifdef AUDIT
378	AUDIT_ARG_VNODE1(vp);
379#endif
380	mp = vp->v_mount;
381	if (mp != NULL)
382		vfs_ref(mp);
383	VOP_UNLOCK(vp, 0);
384	fdrop(fp, td);
385	return (kern_do_statfs(td, mp, buf));
386}
387
388/*
389 * Get statistics on all filesystems.
390 */
391#ifndef _SYS_SYSPROTO_H_
392struct getfsstat_args {
393	struct statfs *buf;
394	long bufsize;
395	int mode;
396};
397#endif
398int
399sys_getfsstat(td, uap)
400	struct thread *td;
401	register struct getfsstat_args /* {
402		struct statfs *buf;
403		long bufsize;
404		int mode;
405	} */ *uap;
406{
407	size_t count;
408	int error;
409
410	if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
411		return (EINVAL);
412	error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
413	    UIO_USERSPACE, uap->mode);
414	if (error == 0)
415		td->td_retval[0] = count;
416	return (error);
417}
418
419/*
420 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
421 *	The caller is responsible for freeing memory which will be allocated
422 *	in '*buf'.
423 */
424int
425kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
426    size_t *countp, enum uio_seg bufseg, int mode)
427{
428	struct mount *mp, *nmp;
429	struct statfs *sfsp, *sp, *sptmp, *tofree;
430	size_t count, maxcount;
431	int error;
432
433	switch (mode) {
434	case MNT_WAIT:
435	case MNT_NOWAIT:
436		break;
437	default:
438		return (EINVAL);
439	}
440restart:
441	maxcount = bufsize / sizeof(struct statfs);
442	if (bufsize == 0) {
443		sfsp = NULL;
444		tofree = NULL;
445	} else if (bufseg == UIO_USERSPACE) {
446		sfsp = *buf;
447		tofree = NULL;
448	} else /* if (bufseg == UIO_SYSSPACE) */ {
449		count = 0;
450		mtx_lock(&mountlist_mtx);
451		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
452			count++;
453		}
454		mtx_unlock(&mountlist_mtx);
455		if (maxcount > count)
456			maxcount = count;
457		tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
458		    M_STATFS, M_WAITOK);
459	}
460	count = 0;
461	mtx_lock(&mountlist_mtx);
462	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
463		if (prison_canseemount(td->td_ucred, mp) != 0) {
464			nmp = TAILQ_NEXT(mp, mnt_list);
465			continue;
466		}
467#ifdef MAC
468		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
469			nmp = TAILQ_NEXT(mp, mnt_list);
470			continue;
471		}
472#endif
473		if (mode == MNT_WAIT) {
474			if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
475				/*
476				 * If vfs_busy() failed, and MBF_NOWAIT
477				 * wasn't passed, then the mp is gone.
478				 * Furthermore, because of MBF_MNTLSTLOCK,
479				 * the mountlist_mtx was dropped.  We have
480				 * no other choice than to start over.
481				 */
482				mtx_unlock(&mountlist_mtx);
483				free(tofree, M_STATFS);
484				goto restart;
485			}
486		} else {
487			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
488				nmp = TAILQ_NEXT(mp, mnt_list);
489				continue;
490			}
491		}
492		if (sfsp != NULL && count < maxcount) {
493			sp = &mp->mnt_stat;
494			/*
495			 * Set these in case the underlying filesystem
496			 * fails to do so.
497			 */
498			sp->f_version = STATFS_VERSION;
499			sp->f_namemax = NAME_MAX;
500			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
501			/*
502			 * If MNT_NOWAIT is specified, do not refresh
503			 * the fsstat cache.
504			 */
505			if (mode != MNT_NOWAIT) {
506				error = VFS_STATFS(mp, sp);
507				if (error != 0) {
508					mtx_lock(&mountlist_mtx);
509					nmp = TAILQ_NEXT(mp, mnt_list);
510					vfs_unbusy(mp);
511					continue;
512				}
513			}
514			if (priv_check(td, PRIV_VFS_GENERATION)) {
515				sptmp = malloc(sizeof(struct statfs), M_STATFS,
516				    M_WAITOK);
517				*sptmp = *sp;
518				sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0;
519				prison_enforce_statfs(td->td_ucred, mp, sptmp);
520				sp = sptmp;
521			} else
522				sptmp = NULL;
523			if (bufseg == UIO_SYSSPACE) {
524				bcopy(sp, sfsp, sizeof(*sp));
525				free(sptmp, M_STATFS);
526			} else /* if (bufseg == UIO_USERSPACE) */ {
527				error = copyout(sp, sfsp, sizeof(*sp));
528				free(sptmp, M_STATFS);
529				if (error != 0) {
530					vfs_unbusy(mp);
531					return (error);
532				}
533			}
534			sfsp++;
535		}
536		count++;
537		mtx_lock(&mountlist_mtx);
538		nmp = TAILQ_NEXT(mp, mnt_list);
539		vfs_unbusy(mp);
540	}
541	mtx_unlock(&mountlist_mtx);
542	if (sfsp != NULL && count > maxcount)
543		*countp = maxcount;
544	else
545		*countp = count;
546	return (0);
547}
548
549#ifdef COMPAT_FREEBSD4
550/*
551 * Get old format filesystem statistics.
552 */
553static void cvtstatfs(struct statfs *, struct ostatfs *);
554
555#ifndef _SYS_SYSPROTO_H_
556struct freebsd4_statfs_args {
557	char *path;
558	struct ostatfs *buf;
559};
560#endif
561int
562freebsd4_statfs(td, uap)
563	struct thread *td;
564	struct freebsd4_statfs_args /* {
565		char *path;
566		struct ostatfs *buf;
567	} */ *uap;
568{
569	struct ostatfs osb;
570	struct statfs *sfp;
571	int error;
572
573	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
574	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
575	if (error == 0) {
576		cvtstatfs(sfp, &osb);
577		error = copyout(&osb, uap->buf, sizeof(osb));
578	}
579	free(sfp, M_STATFS);
580	return (error);
581}
582
583/*
584 * Get filesystem statistics.
585 */
586#ifndef _SYS_SYSPROTO_H_
587struct freebsd4_fstatfs_args {
588	int fd;
589	struct ostatfs *buf;
590};
591#endif
592int
593freebsd4_fstatfs(td, uap)
594	struct thread *td;
595	struct freebsd4_fstatfs_args /* {
596		int fd;
597		struct ostatfs *buf;
598	} */ *uap;
599{
600	struct ostatfs osb;
601	struct statfs *sfp;
602	int error;
603
604	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
605	error = kern_fstatfs(td, uap->fd, sfp);
606	if (error == 0) {
607		cvtstatfs(sfp, &osb);
608		error = copyout(&osb, uap->buf, sizeof(osb));
609	}
610	free(sfp, M_STATFS);
611	return (error);
612}
613
614/*
615 * Get statistics on all filesystems.
616 */
617#ifndef _SYS_SYSPROTO_H_
618struct freebsd4_getfsstat_args {
619	struct ostatfs *buf;
620	long bufsize;
621	int mode;
622};
623#endif
624int
625freebsd4_getfsstat(td, uap)
626	struct thread *td;
627	register struct freebsd4_getfsstat_args /* {
628		struct ostatfs *buf;
629		long bufsize;
630		int mode;
631	} */ *uap;
632{
633	struct statfs *buf, *sp;
634	struct ostatfs osb;
635	size_t count, size;
636	int error;
637
638	if (uap->bufsize < 0)
639		return (EINVAL);
640	count = uap->bufsize / sizeof(struct ostatfs);
641	if (count > SIZE_MAX / sizeof(struct statfs))
642		return (EINVAL);
643	size = count * sizeof(struct statfs);
644	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
645	    uap->mode);
646	td->td_retval[0] = count;
647	if (size != 0) {
648		sp = buf;
649		while (count != 0 && error == 0) {
650			cvtstatfs(sp, &osb);
651			error = copyout(&osb, uap->buf, sizeof(osb));
652			sp++;
653			uap->buf++;
654			count--;
655		}
656		free(buf, M_STATFS);
657	}
658	return (error);
659}
660
661/*
662 * Implement fstatfs() for (NFS) file handles.
663 */
664#ifndef _SYS_SYSPROTO_H_
665struct freebsd4_fhstatfs_args {
666	struct fhandle *u_fhp;
667	struct ostatfs *buf;
668};
669#endif
670int
671freebsd4_fhstatfs(td, uap)
672	struct thread *td;
673	struct freebsd4_fhstatfs_args /* {
674		struct fhandle *u_fhp;
675		struct ostatfs *buf;
676	} */ *uap;
677{
678	struct ostatfs osb;
679	struct statfs *sfp;
680	fhandle_t fh;
681	int error;
682
683	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
684	if (error != 0)
685		return (error);
686	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
687	error = kern_fhstatfs(td, fh, sfp);
688	if (error == 0) {
689		cvtstatfs(sfp, &osb);
690		error = copyout(&osb, uap->buf, sizeof(osb));
691	}
692	free(sfp, M_STATFS);
693	return (error);
694}
695
696/*
697 * Convert a new format statfs structure to an old format statfs structure.
698 */
699static void
700cvtstatfs(nsp, osp)
701	struct statfs *nsp;
702	struct ostatfs *osp;
703{
704
705	statfs_scale_blocks(nsp, LONG_MAX);
706	bzero(osp, sizeof(*osp));
707	osp->f_bsize = nsp->f_bsize;
708	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
709	osp->f_blocks = nsp->f_blocks;
710	osp->f_bfree = nsp->f_bfree;
711	osp->f_bavail = nsp->f_bavail;
712	osp->f_files = MIN(nsp->f_files, LONG_MAX);
713	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
714	osp->f_owner = nsp->f_owner;
715	osp->f_type = nsp->f_type;
716	osp->f_flags = nsp->f_flags;
717	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
718	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
719	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
720	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
721	strlcpy(osp->f_fstypename, nsp->f_fstypename,
722	    MIN(MFSNAMELEN, OMFSNAMELEN));
723	strlcpy(osp->f_mntonname, nsp->f_mntonname,
724	    MIN(MNAMELEN, OMNAMELEN));
725	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
726	    MIN(MNAMELEN, OMNAMELEN));
727	osp->f_fsid = nsp->f_fsid;
728}
729#endif /* COMPAT_FREEBSD4 */
730
731/*
732 * Change current working directory to a given file descriptor.
733 */
734#ifndef _SYS_SYSPROTO_H_
735struct fchdir_args {
736	int	fd;
737};
738#endif
739int
740sys_fchdir(td, uap)
741	struct thread *td;
742	struct fchdir_args /* {
743		int fd;
744	} */ *uap;
745{
746	struct vnode *vp, *tdp;
747	struct mount *mp;
748	struct file *fp;
749	cap_rights_t rights;
750	int error;
751
752	AUDIT_ARG_FD(uap->fd);
753	error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
754	    &fp);
755	if (error != 0)
756		return (error);
757	vp = fp->f_vnode;
758	vrefact(vp);
759	fdrop(fp, td);
760	vn_lock(vp, LK_SHARED | LK_RETRY);
761	AUDIT_ARG_VNODE1(vp);
762	error = change_dir(vp, td);
763	while (!error && (mp = vp->v_mountedhere) != NULL) {
764		if (vfs_busy(mp, 0))
765			continue;
766		error = VFS_ROOT(mp, LK_SHARED, &tdp);
767		vfs_unbusy(mp);
768		if (error != 0)
769			break;
770		vput(vp);
771		vp = tdp;
772	}
773	if (error != 0) {
774		vput(vp);
775		return (error);
776	}
777	VOP_UNLOCK(vp, 0);
778	pwd_chdir(td, vp);
779	return (0);
780}
781
782/*
783 * Change current working directory (``.'').
784 */
785#ifndef _SYS_SYSPROTO_H_
786struct chdir_args {
787	char	*path;
788};
789#endif
790int
791sys_chdir(td, uap)
792	struct thread *td;
793	struct chdir_args /* {
794		char *path;
795	} */ *uap;
796{
797
798	return (kern_chdir(td, uap->path, UIO_USERSPACE));
799}
800
801int
802kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
803{
804	struct nameidata nd;
805	int error;
806
807	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
808	    pathseg, path, td);
809	if ((error = namei(&nd)) != 0)
810		return (error);
811	if ((error = change_dir(nd.ni_vp, td)) != 0) {
812		vput(nd.ni_vp);
813		NDFREE(&nd, NDF_ONLY_PNBUF);
814		return (error);
815	}
816	VOP_UNLOCK(nd.ni_vp, 0);
817	NDFREE(&nd, NDF_ONLY_PNBUF);
818	pwd_chdir(td, nd.ni_vp);
819	return (0);
820}
821
822/*
823 * Change notion of root (``/'') directory.
824 */
825#ifndef _SYS_SYSPROTO_H_
826struct chroot_args {
827	char	*path;
828};
829#endif
830int
831sys_chroot(td, uap)
832	struct thread *td;
833	struct chroot_args /* {
834		char *path;
835	} */ *uap;
836{
837	struct nameidata nd;
838	int error;
839
840	error = priv_check(td, PRIV_VFS_CHROOT);
841	if (error != 0)
842		return (error);
843	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
844	    UIO_USERSPACE, uap->path, td);
845	error = namei(&nd);
846	if (error != 0)
847		goto error;
848	error = change_dir(nd.ni_vp, td);
849	if (error != 0)
850		goto e_vunlock;
851#ifdef MAC
852	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
853	if (error != 0)
854		goto e_vunlock;
855#endif
856	VOP_UNLOCK(nd.ni_vp, 0);
857	error = pwd_chroot(td, nd.ni_vp);
858	vrele(nd.ni_vp);
859	NDFREE(&nd, NDF_ONLY_PNBUF);
860	return (error);
861e_vunlock:
862	vput(nd.ni_vp);
863error:
864	NDFREE(&nd, NDF_ONLY_PNBUF);
865	return (error);
866}
867
868/*
869 * Common routine for chroot and chdir.  Callers must provide a locked vnode
870 * instance.
871 */
872int
873change_dir(vp, td)
874	struct vnode *vp;
875	struct thread *td;
876{
877#ifdef MAC
878	int error;
879#endif
880
881	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
882	if (vp->v_type != VDIR)
883		return (ENOTDIR);
884#ifdef MAC
885	error = mac_vnode_check_chdir(td->td_ucred, vp);
886	if (error != 0)
887		return (error);
888#endif
889	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
890}
891
892static __inline void
893flags_to_rights(int flags, cap_rights_t *rightsp)
894{
895
896	if (flags & O_EXEC) {
897		cap_rights_set(rightsp, CAP_FEXECVE);
898	} else {
899		switch ((flags & O_ACCMODE)) {
900		case O_RDONLY:
901			cap_rights_set(rightsp, CAP_READ);
902			break;
903		case O_RDWR:
904			cap_rights_set(rightsp, CAP_READ);
905			/* FALLTHROUGH */
906		case O_WRONLY:
907			cap_rights_set(rightsp, CAP_WRITE);
908			if (!(flags & (O_APPEND | O_TRUNC)))
909				cap_rights_set(rightsp, CAP_SEEK);
910			break;
911		}
912	}
913
914	if (flags & O_CREAT)
915		cap_rights_set(rightsp, CAP_CREATE);
916
917	if (flags & O_TRUNC)
918		cap_rights_set(rightsp, CAP_FTRUNCATE);
919
920	if (flags & (O_SYNC | O_FSYNC))
921		cap_rights_set(rightsp, CAP_FSYNC);
922
923	if (flags & (O_EXLOCK | O_SHLOCK))
924		cap_rights_set(rightsp, CAP_FLOCK);
925}
926
927/*
928 * Check permissions, allocate an open file structure, and call the device
929 * open routine if any.
930 */
931#ifndef _SYS_SYSPROTO_H_
932struct open_args {
933	char	*path;
934	int	flags;
935	int	mode;
936};
937#endif
938int
939sys_open(td, uap)
940	struct thread *td;
941	register struct open_args /* {
942		char *path;
943		int flags;
944		int mode;
945	} */ *uap;
946{
947
948	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
949	    uap->flags, uap->mode));
950}
951
952#ifndef _SYS_SYSPROTO_H_
953struct openat_args {
954	int	fd;
955	char	*path;
956	int	flag;
957	int	mode;
958};
959#endif
960int
961sys_openat(struct thread *td, struct openat_args *uap)
962{
963
964	AUDIT_ARG_FD(uap->fd);
965	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
966	    uap->mode));
967}
968
969int
970kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
971    int flags, int mode)
972{
973	struct proc *p = td->td_proc;
974	struct filedesc *fdp = p->p_fd;
975	struct file *fp;
976	struct vnode *vp;
977	struct nameidata nd;
978	cap_rights_t rights;
979	int cmode, error, indx;
980
981	indx = -1;
982
983	AUDIT_ARG_FFLAGS(flags);
984	AUDIT_ARG_MODE(mode);
985	cap_rights_init(&rights, CAP_LOOKUP);
986	flags_to_rights(flags, &rights);
987	/*
988	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
989	 * may be specified.
990	 */
991	if (flags & O_EXEC) {
992		if (flags & O_ACCMODE)
993			return (EINVAL);
994	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
995		return (EINVAL);
996	} else {
997		flags = FFLAGS(flags);
998	}
999
1000	/*
1001	 * Allocate a file structure. The descriptor to reference it
1002	 * is allocated and set by finstall() below.
1003	 */
1004	error = falloc_noinstall(td, &fp);
1005	if (error != 0)
1006		return (error);
1007	/*
1008	 * An extra reference on `fp' has been held for us by
1009	 * falloc_noinstall().
1010	 */
1011	/* Set the flags early so the finit in devfs can pick them up. */
1012	fp->f_flag = flags & FMASK;
1013	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
1014	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
1015	    &rights, td);
1016	td->td_dupfd = -1;		/* XXX check for fdopen */
1017	error = vn_open(&nd, &flags, cmode, fp);
1018	if (error != 0) {
1019		/*
1020		 * If the vn_open replaced the method vector, something
1021		 * wonderous happened deep below and we just pass it up
1022		 * pretending we know what we do.
1023		 */
1024		if (error == ENXIO && fp->f_ops != &badfileops)
1025			goto success;
1026
1027		/*
1028		 * Handle special fdopen() case. bleh.
1029		 *
1030		 * Don't do this for relative (capability) lookups; we don't
1031		 * understand exactly what would happen, and we don't think
1032		 * that it ever should.
1033		 */
1034		if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
1035		    (error == ENODEV || error == ENXIO) &&
1036		    td->td_dupfd >= 0) {
1037			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
1038			    &indx);
1039			if (error == 0)
1040				goto success;
1041		}
1042
1043		goto bad;
1044	}
1045	td->td_dupfd = 0;
1046	NDFREE(&nd, NDF_ONLY_PNBUF);
1047	vp = nd.ni_vp;
1048
1049	/*
1050	 * Store the vnode, for any f_type. Typically, the vnode use
1051	 * count is decremented by direct call to vn_closefile() for
1052	 * files that switched type in the cdevsw fdopen() method.
1053	 */
1054	fp->f_vnode = vp;
1055	/*
1056	 * If the file wasn't claimed by devfs bind it to the normal
1057	 * vnode operations here.
1058	 */
1059	if (fp->f_ops == &badfileops) {
1060		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
1061		fp->f_seqcount = 1;
1062		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
1063		    DTYPE_VNODE, vp, &vnops);
1064	}
1065
1066	VOP_UNLOCK(vp, 0);
1067	if (flags & O_TRUNC) {
1068		error = fo_truncate(fp, 0, td->td_ucred, td);
1069		if (error != 0)
1070			goto bad;
1071	}
1072success:
1073	/*
1074	 * If we haven't already installed the FD (for dupfdopen), do so now.
1075	 */
1076	if (indx == -1) {
1077		struct filecaps *fcaps;
1078
1079#ifdef CAPABILITIES
1080		if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) != 0)
1081			fcaps = &nd.ni_filecaps;
1082		else
1083#endif
1084			fcaps = NULL;
1085		error = finstall(td, fp, &indx, flags, fcaps);
1086		/* On success finstall() consumes fcaps. */
1087		if (error != 0) {
1088			filecaps_free(&nd.ni_filecaps);
1089			goto bad;
1090		}
1091	} else {
1092		filecaps_free(&nd.ni_filecaps);
1093	}
1094
1095	/*
1096	 * Release our private reference, leaving the one associated with
1097	 * the descriptor table intact.
1098	 */
1099	fdrop(fp, td);
1100	td->td_retval[0] = indx;
1101	return (0);
1102bad:
1103	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1104	fdrop(fp, td);
1105	return (error);
1106}
1107
#ifdef COMPAT_43
/*
 * Old creat(2): create a file.  Equivalent to opening 'path' relative to
 * the current directory with O_WRONLY | O_CREAT | O_TRUNC.
 */
#ifndef _SYS_SYSPROTO_H_
struct ocreat_args {
	char	*path;
	int	mode;
};
#endif
int
ocreat(struct thread *td, struct ocreat_args *uap)
{
	int error;

	error = kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode);
	return (error);
}
#endif /* COMPAT_43 */
1131
1132/*
1133 * Create a special file.
1134 */
1135#ifndef _SYS_SYSPROTO_H_
1136struct mknod_args {
1137	char	*path;
1138	int	mode;
1139	int	dev;
1140};
1141#endif
1142int
1143sys_mknod(td, uap)
1144	struct thread *td;
1145	register struct mknod_args /* {
1146		char *path;
1147		int mode;
1148		int dev;
1149	} */ *uap;
1150{
1151
1152	return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1153	    uap->mode, uap->dev));
1154}
1155
1156#ifndef _SYS_SYSPROTO_H_
1157struct mknodat_args {
1158	int	fd;
1159	char	*path;
1160	mode_t	mode;
1161	dev_t	dev;
1162};
1163#endif
1164int
1165sys_mknodat(struct thread *td, struct mknodat_args *uap)
1166{
1167
1168	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1169	    uap->dev));
1170}
1171
1172int
1173kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1174    int mode, int dev)
1175{
1176	struct vnode *vp;
1177	struct mount *mp;
1178	struct vattr vattr;
1179	struct nameidata nd;
1180	cap_rights_t rights;
1181	int error, whiteout = 0;
1182
1183	AUDIT_ARG_MODE(mode);
1184	AUDIT_ARG_DEV(dev);
1185	switch (mode & S_IFMT) {
1186	case S_IFCHR:
1187	case S_IFBLK:
1188		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1189		if (error == 0 && dev == VNOVAL)
1190			error = EINVAL;
1191		break;
1192	case S_IFWHT:
1193		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1194		break;
1195	case S_IFIFO:
1196		if (dev == 0)
1197			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1198		/* FALLTHROUGH */
1199	default:
1200		error = EINVAL;
1201		break;
1202	}
1203	if (error != 0)
1204		return (error);
1205restart:
1206	bwillwrite();
1207	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1208	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
1209	    td);
1210	if ((error = namei(&nd)) != 0)
1211		return (error);
1212	vp = nd.ni_vp;
1213	if (vp != NULL) {
1214		NDFREE(&nd, NDF_ONLY_PNBUF);
1215		if (vp == nd.ni_dvp)
1216			vrele(nd.ni_dvp);
1217		else
1218			vput(nd.ni_dvp);
1219		vrele(vp);
1220		return (EEXIST);
1221	} else {
1222		VATTR_NULL(&vattr);
1223		vattr.va_mode = (mode & ALLPERMS) &
1224		    ~td->td_proc->p_fd->fd_cmask;
1225		vattr.va_rdev = dev;
1226		whiteout = 0;
1227
1228		switch (mode & S_IFMT) {
1229		case S_IFCHR:
1230			vattr.va_type = VCHR;
1231			break;
1232		case S_IFBLK:
1233			vattr.va_type = VBLK;
1234			break;
1235		case S_IFWHT:
1236			whiteout = 1;
1237			break;
1238		default:
1239			panic("kern_mknod: invalid mode");
1240		}
1241	}
1242	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1243		NDFREE(&nd, NDF_ONLY_PNBUF);
1244		vput(nd.ni_dvp);
1245		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1246			return (error);
1247		goto restart;
1248	}
1249#ifdef MAC
1250	if (error == 0 && !whiteout)
1251		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1252		    &nd.ni_cnd, &vattr);
1253#endif
1254	if (error == 0) {
1255		if (whiteout)
1256			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1257		else {
1258			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1259						&nd.ni_cnd, &vattr);
1260			if (error == 0)
1261				vput(nd.ni_vp);
1262		}
1263	}
1264	NDFREE(&nd, NDF_ONLY_PNBUF);
1265	vput(nd.ni_dvp);
1266	vn_finished_write(mp);
1267	return (error);
1268}
1269
1270/*
1271 * Create a named pipe.
1272 */
1273#ifndef _SYS_SYSPROTO_H_
1274struct mkfifo_args {
1275	char	*path;
1276	int	mode;
1277};
1278#endif
1279int
1280sys_mkfifo(td, uap)
1281	struct thread *td;
1282	register struct mkfifo_args /* {
1283		char *path;
1284		int mode;
1285	} */ *uap;
1286{
1287
1288	return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1289	    uap->mode));
1290}
1291
1292#ifndef _SYS_SYSPROTO_H_
1293struct mkfifoat_args {
1294	int	fd;
1295	char	*path;
1296	mode_t	mode;
1297};
1298#endif
1299int
1300sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1301{
1302
1303	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1304	    uap->mode));
1305}
1306
1307int
1308kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1309    int mode)
1310{
1311	struct mount *mp;
1312	struct vattr vattr;
1313	struct nameidata nd;
1314	cap_rights_t rights;
1315	int error;
1316
1317	AUDIT_ARG_MODE(mode);
1318restart:
1319	bwillwrite();
1320	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1321	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
1322	    td);
1323	if ((error = namei(&nd)) != 0)
1324		return (error);
1325	if (nd.ni_vp != NULL) {
1326		NDFREE(&nd, NDF_ONLY_PNBUF);
1327		if (nd.ni_vp == nd.ni_dvp)
1328			vrele(nd.ni_dvp);
1329		else
1330			vput(nd.ni_dvp);
1331		vrele(nd.ni_vp);
1332		return (EEXIST);
1333	}
1334	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1335		NDFREE(&nd, NDF_ONLY_PNBUF);
1336		vput(nd.ni_dvp);
1337		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1338			return (error);
1339		goto restart;
1340	}
1341	VATTR_NULL(&vattr);
1342	vattr.va_type = VFIFO;
1343	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1344#ifdef MAC
1345	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1346	    &vattr);
1347	if (error != 0)
1348		goto out;
1349#endif
1350	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1351	if (error == 0)
1352		vput(nd.ni_vp);
1353#ifdef MAC
1354out:
1355#endif
1356	vput(nd.ni_dvp);
1357	vn_finished_write(mp);
1358	NDFREE(&nd, NDF_ONLY_PNBUF);
1359	return (error);
1360}
1361
1362/*
1363 * Make a hard file link.
1364 */
1365#ifndef _SYS_SYSPROTO_H_
1366struct link_args {
1367	char	*path;
1368	char	*link;
1369};
1370#endif
1371int
1372sys_link(td, uap)
1373	struct thread *td;
1374	register struct link_args /* {
1375		char *path;
1376		char *link;
1377	} */ *uap;
1378{
1379
1380	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
1381	    UIO_USERSPACE, FOLLOW));
1382}
1383
1384#ifndef _SYS_SYSPROTO_H_
1385struct linkat_args {
1386	int	fd1;
1387	char	*path1;
1388	int	fd2;
1389	char	*path2;
1390	int	flag;
1391};
1392#endif
1393int
1394sys_linkat(struct thread *td, struct linkat_args *uap)
1395{
1396	int flag;
1397
1398	flag = uap->flag;
1399	if (flag & ~AT_SYMLINK_FOLLOW)
1400		return (EINVAL);
1401
1402	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1403	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1404}
1405
1406int hardlink_check_uid = 0;
1407SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1408    &hardlink_check_uid, 0,
1409    "Unprivileged processes cannot create hard links to files owned by other "
1410    "users");
1411static int hardlink_check_gid = 0;
1412SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1413    &hardlink_check_gid, 0,
1414    "Unprivileged processes cannot create hard links to files owned by other "
1415    "groups");
1416
1417static int
1418can_hardlink(struct vnode *vp, struct ucred *cred)
1419{
1420	struct vattr va;
1421	int error;
1422
1423	if (!hardlink_check_uid && !hardlink_check_gid)
1424		return (0);
1425
1426	error = VOP_GETATTR(vp, &va, cred);
1427	if (error != 0)
1428		return (error);
1429
1430	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1431		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1432		if (error != 0)
1433			return (error);
1434	}
1435
1436	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1437		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1438		if (error != 0)
1439			return (error);
1440	}
1441
1442	return (0);
1443}
1444
1445int
1446kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1447    enum uio_seg segflg, int follow)
1448{
1449	struct vnode *vp;
1450	struct mount *mp;
1451	struct nameidata nd;
1452	cap_rights_t rights;
1453	int error;
1454
1455again:
1456	bwillwrite();
1457	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1,
1458	    cap_rights_init(&rights, CAP_LINKAT_SOURCE), td);
1459
1460	if ((error = namei(&nd)) != 0)
1461		return (error);
1462	NDFREE(&nd, NDF_ONLY_PNBUF);
1463	vp = nd.ni_vp;
1464	if (vp->v_type == VDIR) {
1465		vrele(vp);
1466		return (EPERM);		/* POSIX */
1467	}
1468	NDINIT_ATRIGHTS(&nd, CREATE,
1469	    LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflg, path2, fd2,
1470	    cap_rights_init(&rights, CAP_LINKAT_TARGET), td);
1471	if ((error = namei(&nd)) == 0) {
1472		if (nd.ni_vp != NULL) {
1473			NDFREE(&nd, NDF_ONLY_PNBUF);
1474			if (nd.ni_dvp == nd.ni_vp)
1475				vrele(nd.ni_dvp);
1476			else
1477				vput(nd.ni_dvp);
1478			vrele(nd.ni_vp);
1479			vrele(vp);
1480			return (EEXIST);
1481		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
1482			/*
1483			 * Cross-device link.  No need to recheck
1484			 * vp->v_type, since it cannot change, except
1485			 * to VBAD.
1486			 */
1487			NDFREE(&nd, NDF_ONLY_PNBUF);
1488			vput(nd.ni_dvp);
1489			vrele(vp);
1490			return (EXDEV);
1491		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
1492			error = can_hardlink(vp, td->td_ucred);
1493#ifdef MAC
1494			if (error == 0)
1495				error = mac_vnode_check_link(td->td_ucred,
1496				    nd.ni_dvp, vp, &nd.ni_cnd);
1497#endif
1498			if (error != 0) {
1499				vput(vp);
1500				vput(nd.ni_dvp);
1501				NDFREE(&nd, NDF_ONLY_PNBUF);
1502				return (error);
1503			}
1504			error = vn_start_write(vp, &mp, V_NOWAIT);
1505			if (error != 0) {
1506				vput(vp);
1507				vput(nd.ni_dvp);
1508				NDFREE(&nd, NDF_ONLY_PNBUF);
1509				error = vn_start_write(NULL, &mp,
1510				    V_XSLEEP | PCATCH);
1511				if (error != 0)
1512					return (error);
1513				goto again;
1514			}
1515			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1516			VOP_UNLOCK(vp, 0);
1517			vput(nd.ni_dvp);
1518			vn_finished_write(mp);
1519			NDFREE(&nd, NDF_ONLY_PNBUF);
1520		} else {
1521			vput(nd.ni_dvp);
1522			NDFREE(&nd, NDF_ONLY_PNBUF);
1523			vrele(vp);
1524			goto again;
1525		}
1526	}
1527	vrele(vp);
1528	return (error);
1529}
1530
1531/*
1532 * Make a symbolic link.
1533 */
1534#ifndef _SYS_SYSPROTO_H_
1535struct symlink_args {
1536	char	*path;
1537	char	*link;
1538};
1539#endif
1540int
1541sys_symlink(td, uap)
1542	struct thread *td;
1543	register struct symlink_args /* {
1544		char *path;
1545		char *link;
1546	} */ *uap;
1547{
1548
1549	return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
1550	    UIO_USERSPACE));
1551}
1552
1553#ifndef _SYS_SYSPROTO_H_
1554struct symlinkat_args {
1555	char	*path;
1556	int	fd;
1557	char	*path2;
1558};
1559#endif
1560int
1561sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1562{
1563
1564	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1565	    UIO_USERSPACE));
1566}
1567
1568int
1569kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1570    enum uio_seg segflg)
1571{
1572	struct mount *mp;
1573	struct vattr vattr;
1574	char *syspath;
1575	struct nameidata nd;
1576	int error;
1577	cap_rights_t rights;
1578
1579	if (segflg == UIO_SYSSPACE) {
1580		syspath = path1;
1581	} else {
1582		syspath = uma_zalloc(namei_zone, M_WAITOK);
1583		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1584			goto out;
1585	}
1586	AUDIT_ARG_TEXT(syspath);
1587restart:
1588	bwillwrite();
1589	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1590	    NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
1591	    td);
1592	if ((error = namei(&nd)) != 0)
1593		goto out;
1594	if (nd.ni_vp) {
1595		NDFREE(&nd, NDF_ONLY_PNBUF);
1596		if (nd.ni_vp == nd.ni_dvp)
1597			vrele(nd.ni_dvp);
1598		else
1599			vput(nd.ni_dvp);
1600		vrele(nd.ni_vp);
1601		error = EEXIST;
1602		goto out;
1603	}
1604	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1605		NDFREE(&nd, NDF_ONLY_PNBUF);
1606		vput(nd.ni_dvp);
1607		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1608			goto out;
1609		goto restart;
1610	}
1611	VATTR_NULL(&vattr);
1612	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1613#ifdef MAC
1614	vattr.va_type = VLNK;
1615	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1616	    &vattr);
1617	if (error != 0)
1618		goto out2;
1619#endif
1620	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1621	if (error == 0)
1622		vput(nd.ni_vp);
1623#ifdef MAC
1624out2:
1625#endif
1626	NDFREE(&nd, NDF_ONLY_PNBUF);
1627	vput(nd.ni_dvp);
1628	vn_finished_write(mp);
1629out:
1630	if (segflg != UIO_SYSSPACE)
1631		uma_zfree(namei_zone, syspath);
1632	return (error);
1633}
1634
1635/*
1636 * Delete a whiteout from the filesystem.
1637 */
1638int
1639sys_undelete(td, uap)
1640	struct thread *td;
1641	register struct undelete_args /* {
1642		char *path;
1643	} */ *uap;
1644{
1645	struct mount *mp;
1646	struct nameidata nd;
1647	int error;
1648
1649restart:
1650	bwillwrite();
1651	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1652	    UIO_USERSPACE, uap->path, td);
1653	error = namei(&nd);
1654	if (error != 0)
1655		return (error);
1656
1657	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1658		NDFREE(&nd, NDF_ONLY_PNBUF);
1659		if (nd.ni_vp == nd.ni_dvp)
1660			vrele(nd.ni_dvp);
1661		else
1662			vput(nd.ni_dvp);
1663		if (nd.ni_vp)
1664			vrele(nd.ni_vp);
1665		return (EEXIST);
1666	}
1667	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1668		NDFREE(&nd, NDF_ONLY_PNBUF);
1669		vput(nd.ni_dvp);
1670		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1671			return (error);
1672		goto restart;
1673	}
1674	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1675	NDFREE(&nd, NDF_ONLY_PNBUF);
1676	vput(nd.ni_dvp);
1677	vn_finished_write(mp);
1678	return (error);
1679}
1680
1681/*
1682 * Delete a name from the filesystem.
1683 */
1684#ifndef _SYS_SYSPROTO_H_
1685struct unlink_args {
1686	char	*path;
1687};
1688#endif
1689int
1690sys_unlink(td, uap)
1691	struct thread *td;
1692	struct unlink_args /* {
1693		char *path;
1694	} */ *uap;
1695{
1696
1697	return (kern_unlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0));
1698}
1699
1700#ifndef _SYS_SYSPROTO_H_
1701struct unlinkat_args {
1702	int	fd;
1703	char	*path;
1704	int	flag;
1705};
1706#endif
1707int
1708sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1709{
1710	int flag = uap->flag;
1711	int fd = uap->fd;
1712	char *path = uap->path;
1713
1714	if (flag & ~AT_REMOVEDIR)
1715		return (EINVAL);
1716
1717	if (flag & AT_REMOVEDIR)
1718		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1719	else
1720		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1721}
1722
1723int
1724kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1725    ino_t oldinum)
1726{
1727	struct mount *mp;
1728	struct vnode *vp;
1729	struct nameidata nd;
1730	struct stat sb;
1731	cap_rights_t rights;
1732	int error;
1733
1734restart:
1735	bwillwrite();
1736	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1737	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1738	if ((error = namei(&nd)) != 0)
1739		return (error == EINVAL ? EPERM : error);
1740	vp = nd.ni_vp;
1741	if (vp->v_type == VDIR && oldinum == 0) {
1742		error = EPERM;		/* POSIX */
1743	} else if (oldinum != 0 &&
1744		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1745		  sb.st_ino != oldinum) {
1746			error = EIDRM;	/* Identifier removed */
1747	} else {
1748		/*
1749		 * The root of a mounted filesystem cannot be deleted.
1750		 *
1751		 * XXX: can this only be a VDIR case?
1752		 */
1753		if (vp->v_vflag & VV_ROOT)
1754			error = EBUSY;
1755	}
1756	if (error == 0) {
1757		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1758			NDFREE(&nd, NDF_ONLY_PNBUF);
1759			vput(nd.ni_dvp);
1760			if (vp == nd.ni_dvp)
1761				vrele(vp);
1762			else
1763				vput(vp);
1764			if ((error = vn_start_write(NULL, &mp,
1765			    V_XSLEEP | PCATCH)) != 0)
1766				return (error);
1767			goto restart;
1768		}
1769#ifdef MAC
1770		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1771		    &nd.ni_cnd);
1772		if (error != 0)
1773			goto out;
1774#endif
1775		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1776		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1777#ifdef MAC
1778out:
1779#endif
1780		vn_finished_write(mp);
1781	}
1782	NDFREE(&nd, NDF_ONLY_PNBUF);
1783	vput(nd.ni_dvp);
1784	if (vp == nd.ni_dvp)
1785		vrele(vp);
1786	else
1787		vput(vp);
1788	return (error);
1789}
1790
1791/*
1792 * Reposition read/write file offset.
1793 */
1794#ifndef _SYS_SYSPROTO_H_
1795struct lseek_args {
1796	int	fd;
1797	int	pad;
1798	off_t	offset;
1799	int	whence;
1800};
1801#endif
1802int
1803sys_lseek(struct thread *td, struct lseek_args *uap)
1804{
1805
1806	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
1807}
1808
1809int
1810kern_lseek(struct thread *td, int fd, off_t offset, int whence)
1811{
1812	struct file *fp;
1813	cap_rights_t rights;
1814	int error;
1815
1816	AUDIT_ARG_FD(fd);
1817	error = fget(td, fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1818	if (error != 0)
1819		return (error);
1820	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1821	    fo_seek(fp, offset, whence, td) : ESPIPE;
1822	fdrop(fp, td);
1823	return (error);
1824}
1825
1826#if defined(COMPAT_43)
1827/*
1828 * Reposition read/write file offset.
1829 */
#ifndef _SYS_SYSPROTO_H_
struct olseek_args {
	int	fd;
	long	offset;
	int	whence;
};
#endif
/* COMPAT_43 lseek entry point; the offset argument is a long. */
int
olseek(struct thread *td, struct olseek_args *uap)
{
	int error;

	error = kern_lseek(td, uap->fd, uap->offset, uap->whence);
	return (error);
}
1843#endif /* COMPAT_43 */
1844
1845#if defined(COMPAT_FREEBSD6)
1846/* Version with the 'pad' argument */
1847int
1848freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
1849{
1850
1851	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
1852}
1853#endif
1854
1855/*
1856 * Check access permissions using passed credentials.
1857 */
1858static int
1859vn_access(vp, user_flags, cred, td)
1860	struct vnode	*vp;
1861	int		user_flags;
1862	struct ucred	*cred;
1863	struct thread	*td;
1864{
1865	accmode_t accmode;
1866	int error;
1867
1868	/* Flags == 0 means only check for existence. */
1869	if (user_flags == 0)
1870		return (0);
1871
1872	accmode = 0;
1873	if (user_flags & R_OK)
1874		accmode |= VREAD;
1875	if (user_flags & W_OK)
1876		accmode |= VWRITE;
1877	if (user_flags & X_OK)
1878		accmode |= VEXEC;
1879#ifdef MAC
1880	error = mac_vnode_check_access(cred, vp, accmode);
1881	if (error != 0)
1882		return (error);
1883#endif
1884	if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
1885		error = VOP_ACCESS(vp, accmode, cred, td);
1886	return (error);
1887}
1888
1889/*
1890 * Check access permissions using "real" credentials.
1891 */
1892#ifndef _SYS_SYSPROTO_H_
1893struct access_args {
1894	char	*path;
1895	int	amode;
1896};
1897#endif
1898int
1899sys_access(td, uap)
1900	struct thread *td;
1901	register struct access_args /* {
1902		char *path;
1903		int amode;
1904	} */ *uap;
1905{
1906
1907	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1908	    0, uap->amode));
1909}
1910
1911#ifndef _SYS_SYSPROTO_H_
1912struct faccessat_args {
1913	int	dirfd;
1914	char	*path;
1915	int	amode;
1916	int	flag;
1917}
1918#endif
1919int
1920sys_faccessat(struct thread *td, struct faccessat_args *uap)
1921{
1922
1923	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1924	    uap->amode));
1925}
1926
1927int
1928kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1929    int flag, int amode)
1930{
1931	struct ucred *cred, *usecred;
1932	struct vnode *vp;
1933	struct nameidata nd;
1934	cap_rights_t rights;
1935	int error;
1936
1937	if (flag & ~AT_EACCESS)
1938		return (EINVAL);
1939	if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
1940		return (EINVAL);
1941
1942	/*
1943	 * Create and modify a temporary credential instead of one that
1944	 * is potentially shared (if we need one).
1945	 */
1946	cred = td->td_ucred;
1947	if ((flag & AT_EACCESS) == 0 &&
1948	    ((cred->cr_uid != cred->cr_ruid ||
1949	    cred->cr_rgid != cred->cr_groups[0]))) {
1950		usecred = crdup(cred);
1951		usecred->cr_uid = cred->cr_ruid;
1952		usecred->cr_groups[0] = cred->cr_rgid;
1953		td->td_ucred = usecred;
1954	} else
1955		usecred = cred;
1956	AUDIT_ARG_VALUE(amode);
1957	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
1958	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
1959	    td);
1960	if ((error = namei(&nd)) != 0)
1961		goto out;
1962	vp = nd.ni_vp;
1963
1964	error = vn_access(vp, amode, usecred, td);
1965	NDFREE(&nd, NDF_ONLY_PNBUF);
1966	vput(vp);
1967out:
1968	if (usecred != cred) {
1969		td->td_ucred = cred;
1970		crfree(usecred);
1971	}
1972	return (error);
1973}
1974
1975/*
1976 * Check access permissions using "effective" credentials.
1977 */
1978#ifndef _SYS_SYSPROTO_H_
1979struct eaccess_args {
1980	char	*path;
1981	int	amode;
1982};
1983#endif
1984int
1985sys_eaccess(td, uap)
1986	struct thread *td;
1987	register struct eaccess_args /* {
1988		char *path;
1989		int amode;
1990	} */ *uap;
1991{
1992
1993	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1994	    AT_EACCESS, uap->amode));
1995}
1996
1997#if defined(COMPAT_43)
1998/*
1999 * Get file status; this version follows links.
2000 */
2001#ifndef _SYS_SYSPROTO_H_
2002struct ostat_args {
2003	char	*path;
2004	struct ostat *ub;
2005};
2006#endif
2007int
2008ostat(td, uap)
2009	struct thread *td;
2010	register struct ostat_args /* {
2011		char *path;
2012		struct ostat *ub;
2013	} */ *uap;
2014{
2015	struct stat sb;
2016	struct ostat osb;
2017	int error;
2018
2019	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2020	    &sb, NULL);
2021	if (error != 0)
2022		return (error);
2023	cvtstat(&sb, &osb);
2024	return (copyout(&osb, uap->ub, sizeof (osb)));
2025}
2026
2027/*
2028 * Get file status; this version does not follow links.
2029 */
2030#ifndef _SYS_SYSPROTO_H_
2031struct olstat_args {
2032	char	*path;
2033	struct ostat *ub;
2034};
2035#endif
2036int
2037olstat(td, uap)
2038	struct thread *td;
2039	register struct olstat_args /* {
2040		char *path;
2041		struct ostat *ub;
2042	} */ *uap;
2043{
2044	struct stat sb;
2045	struct ostat osb;
2046	int error;
2047
2048	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2049	    UIO_USERSPACE, &sb, NULL);
2050	if (error != 0)
2051		return (error);
2052	cvtstat(&sb, &osb);
2053	return (copyout(&osb, uap->ub, sizeof (osb)));
2054}
2055
2056/*
 * Convert from a new (struct stat) to an old (struct ostat) stat structure.
2058 */
2059void
2060cvtstat(st, ost)
2061	struct stat *st;
2062	struct ostat *ost;
2063{
2064
2065	bzero(ost, sizeof(*ost));
2066	ost->st_dev = st->st_dev;
2067	ost->st_ino = st->st_ino;
2068	ost->st_mode = st->st_mode;
2069	ost->st_nlink = st->st_nlink;
2070	ost->st_uid = st->st_uid;
2071	ost->st_gid = st->st_gid;
2072	ost->st_rdev = st->st_rdev;
2073	if (st->st_size < (quad_t)1 << 32)
2074		ost->st_size = st->st_size;
2075	else
2076		ost->st_size = -2;
2077	ost->st_atim = st->st_atim;
2078	ost->st_mtim = st->st_mtim;
2079	ost->st_ctim = st->st_ctim;
2080	ost->st_blksize = st->st_blksize;
2081	ost->st_blocks = st->st_blocks;
2082	ost->st_flags = st->st_flags;
2083	ost->st_gen = st->st_gen;
2084}
2085#endif /* COMPAT_43 */
2086
2087/*
2088 * Get file status; this version follows links.
2089 */
2090#ifndef _SYS_SYSPROTO_H_
2091struct stat_args {
2092	char	*path;
2093	struct stat *ub;
2094};
2095#endif
2096int
2097sys_stat(td, uap)
2098	struct thread *td;
2099	register struct stat_args /* {
2100		char *path;
2101		struct stat *ub;
2102	} */ *uap;
2103{
2104	struct stat sb;
2105	int error;
2106
2107	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2108	    &sb, NULL);
2109	if (error == 0)
2110		error = copyout(&sb, uap->ub, sizeof (sb));
2111	return (error);
2112}
2113
2114#ifndef _SYS_SYSPROTO_H_
2115struct fstatat_args {
2116	int	fd;
2117	char	*path;
2118	struct stat	*buf;
2119	int	flag;
2120}
2121#endif
2122int
2123sys_fstatat(struct thread *td, struct fstatat_args *uap)
2124{
2125	struct stat sb;
2126	int error;
2127
2128	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2129	    UIO_USERSPACE, &sb, NULL);
2130	if (error == 0)
2131		error = copyout(&sb, uap->buf, sizeof (sb));
2132	return (error);
2133}
2134
2135int
2136kern_statat(struct thread *td, int flag, int fd, char *path,
2137    enum uio_seg pathseg, struct stat *sbp,
2138    void (*hook)(struct vnode *vp, struct stat *sbp))
2139{
2140	struct nameidata nd;
2141	struct stat sb;
2142	cap_rights_t rights;
2143	int error;
2144
2145	if (flag & ~AT_SYMLINK_NOFOLLOW)
2146		return (EINVAL);
2147
2148	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2149	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2150	    cap_rights_init(&rights, CAP_FSTAT), td);
2151
2152	if ((error = namei(&nd)) != 0)
2153		return (error);
2154	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2155	if (error == 0) {
2156		SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
2157		if (S_ISREG(sb.st_mode))
2158			SDT_PROBE2(vfs, , stat, reg, path, pathseg);
2159		if (__predict_false(hook != NULL))
2160			hook(nd.ni_vp, &sb);
2161	}
2162	NDFREE(&nd, NDF_ONLY_PNBUF);
2163	vput(nd.ni_vp);
2164	if (error != 0)
2165		return (error);
2166	*sbp = sb;
2167#ifdef KTRACE
2168	if (KTRPOINT(td, KTR_STRUCT))
2169		ktrstat(&sb);
2170#endif
2171	return (0);
2172}
2173
2174/*
2175 * Get file status; this version does not follow links.
2176 */
2177#ifndef _SYS_SYSPROTO_H_
2178struct lstat_args {
2179	char	*path;
2180	struct stat *ub;
2181};
2182#endif
2183int
2184sys_lstat(td, uap)
2185	struct thread *td;
2186	register struct lstat_args /* {
2187		char *path;
2188		struct stat *ub;
2189	} */ *uap;
2190{
2191	struct stat sb;
2192	int error;
2193
2194	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2195	    UIO_USERSPACE, &sb, NULL);
2196	if (error == 0)
2197		error = copyout(&sb, uap->ub, sizeof (sb));
2198	return (error);
2199}
2200
2201/*
2202 * Implementation of the NetBSD [l]stat() functions.
2203 */
2204void
2205cvtnstat(sb, nsb)
2206	struct stat *sb;
2207	struct nstat *nsb;
2208{
2209
2210	bzero(nsb, sizeof *nsb);
2211	nsb->st_dev = sb->st_dev;
2212	nsb->st_ino = sb->st_ino;
2213	nsb->st_mode = sb->st_mode;
2214	nsb->st_nlink = sb->st_nlink;
2215	nsb->st_uid = sb->st_uid;
2216	nsb->st_gid = sb->st_gid;
2217	nsb->st_rdev = sb->st_rdev;
2218	nsb->st_atim = sb->st_atim;
2219	nsb->st_mtim = sb->st_mtim;
2220	nsb->st_ctim = sb->st_ctim;
2221	nsb->st_size = sb->st_size;
2222	nsb->st_blocks = sb->st_blocks;
2223	nsb->st_blksize = sb->st_blksize;
2224	nsb->st_flags = sb->st_flags;
2225	nsb->st_gen = sb->st_gen;
2226	nsb->st_birthtim = sb->st_birthtim;
2227}
2228
2229#ifndef _SYS_SYSPROTO_H_
2230struct nstat_args {
2231	char	*path;
2232	struct nstat *ub;
2233};
2234#endif
2235int
2236sys_nstat(td, uap)
2237	struct thread *td;
2238	register struct nstat_args /* {
2239		char *path;
2240		struct nstat *ub;
2241	} */ *uap;
2242{
2243	struct stat sb;
2244	struct nstat nsb;
2245	int error;
2246
2247	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2248	    &sb, NULL);
2249	if (error != 0)
2250		return (error);
2251	cvtnstat(&sb, &nsb);
2252	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2253}
2254
2255/*
2256 * NetBSD lstat.  Get file status; this version does not follow links.
2257 */
2258#ifndef _SYS_SYSPROTO_H_
2259struct lstat_args {
2260	char	*path;
2261	struct stat *ub;
2262};
2263#endif
2264int
2265sys_nlstat(td, uap)
2266	struct thread *td;
2267	register struct nlstat_args /* {
2268		char *path;
2269		struct nstat *ub;
2270	} */ *uap;
2271{
2272	struct stat sb;
2273	struct nstat nsb;
2274	int error;
2275
2276	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2277	    UIO_USERSPACE, &sb, NULL);
2278	if (error != 0)
2279		return (error);
2280	cvtnstat(&sb, &nsb);
2281	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2282}
2283
2284/*
2285 * Get configurable pathname variables.
2286 */
2287#ifndef _SYS_SYSPROTO_H_
2288struct pathconf_args {
2289	char	*path;
2290	int	name;
2291};
2292#endif
2293int
2294sys_pathconf(td, uap)
2295	struct thread *td;
2296	register struct pathconf_args /* {
2297		char *path;
2298		int name;
2299	} */ *uap;
2300{
2301
2302	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2303}
2304
2305#ifndef _SYS_SYSPROTO_H_
2306struct lpathconf_args {
2307	char	*path;
2308	int	name;
2309};
2310#endif
2311int
2312sys_lpathconf(td, uap)
2313	struct thread *td;
2314	register struct lpathconf_args /* {
2315		char *path;
2316		int name;
2317	} */ *uap;
2318{
2319
2320	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2321	    NOFOLLOW));
2322}
2323
2324int
2325kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2326    u_long flags)
2327{
2328	struct nameidata nd;
2329	int error;
2330
2331	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2332	    pathseg, path, td);
2333	if ((error = namei(&nd)) != 0)
2334		return (error);
2335	NDFREE(&nd, NDF_ONLY_PNBUF);
2336
2337	error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2338	vput(nd.ni_vp);
2339	return (error);
2340}
2341
2342/*
2343 * Return target name of a symbolic link.
2344 */
2345#ifndef _SYS_SYSPROTO_H_
2346struct readlink_args {
2347	char	*path;
2348	char	*buf;
2349	size_t	count;
2350};
2351#endif
2352int
2353sys_readlink(td, uap)
2354	struct thread *td;
2355	register struct readlink_args /* {
2356		char *path;
2357		char *buf;
2358		size_t count;
2359	} */ *uap;
2360{
2361
2362	return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2363	    uap->buf, UIO_USERSPACE, uap->count));
2364}
2365#ifndef _SYS_SYSPROTO_H_
2366struct readlinkat_args {
2367	int	fd;
2368	char	*path;
2369	char	*buf;
2370	size_t	bufsize;
2371};
2372#endif
2373int
2374sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2375{
2376
2377	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2378	    uap->buf, UIO_USERSPACE, uap->bufsize));
2379}
2380
2381int
2382kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2383    char *buf, enum uio_seg bufseg, size_t count)
2384{
2385	struct vnode *vp;
2386	struct iovec aiov;
2387	struct uio auio;
2388	struct nameidata nd;
2389	int error;
2390
2391	if (count > IOSIZE_MAX)
2392		return (EINVAL);
2393
2394	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2395	    pathseg, path, fd, td);
2396
2397	if ((error = namei(&nd)) != 0)
2398		return (error);
2399	NDFREE(&nd, NDF_ONLY_PNBUF);
2400	vp = nd.ni_vp;
2401#ifdef MAC
2402	error = mac_vnode_check_readlink(td->td_ucred, vp);
2403	if (error != 0) {
2404		vput(vp);
2405		return (error);
2406	}
2407#endif
2408	if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0)
2409		error = EINVAL;
2410	else {
2411		aiov.iov_base = buf;
2412		aiov.iov_len = count;
2413		auio.uio_iov = &aiov;
2414		auio.uio_iovcnt = 1;
2415		auio.uio_offset = 0;
2416		auio.uio_rw = UIO_READ;
2417		auio.uio_segflg = bufseg;
2418		auio.uio_td = td;
2419		auio.uio_resid = count;
2420		error = VOP_READLINK(vp, &auio, td->td_ucred);
2421		td->td_retval[0] = count - auio.uio_resid;
2422	}
2423	vput(vp);
2424	return (error);
2425}
2426
2427/*
2428 * Common implementation code for chflags() and fchflags().
2429 */
2430static int
2431setfflags(td, vp, flags)
2432	struct thread *td;
2433	struct vnode *vp;
2434	u_long flags;
2435{
2436	struct mount *mp;
2437	struct vattr vattr;
2438	int error;
2439
2440	/* We can't support the value matching VNOVAL. */
2441	if (flags == VNOVAL)
2442		return (EOPNOTSUPP);
2443
2444	/*
2445	 * Prevent non-root users from setting flags on devices.  When
2446	 * a device is reused, users can retain ownership of the device
2447	 * if they are allowed to set flags and programs assume that
2448	 * chown can't fail when done as root.
2449	 */
2450	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2451		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2452		if (error != 0)
2453			return (error);
2454	}
2455
2456	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2457		return (error);
2458	VATTR_NULL(&vattr);
2459	vattr.va_flags = flags;
2460	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2461#ifdef MAC
2462	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2463	if (error == 0)
2464#endif
2465		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2466	VOP_UNLOCK(vp, 0);
2467	vn_finished_write(mp);
2468	return (error);
2469}
2470
2471/*
2472 * Change flags of a file given a path name.
2473 */
2474#ifndef _SYS_SYSPROTO_H_
2475struct chflags_args {
2476	const char *path;
2477	u_long	flags;
2478};
2479#endif
2480int
2481sys_chflags(td, uap)
2482	struct thread *td;
2483	register struct chflags_args /* {
2484		const char *path;
2485		u_long flags;
2486	} */ *uap;
2487{
2488
2489	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2490	    uap->flags, 0));
2491}
2492
2493#ifndef _SYS_SYSPROTO_H_
2494struct chflagsat_args {
2495	int	fd;
2496	const char *path;
2497	u_long	flags;
2498	int	atflag;
2499}
2500#endif
2501int
2502sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2503{
2504	int fd = uap->fd;
2505	const char *path = uap->path;
2506	u_long flags = uap->flags;
2507	int atflag = uap->atflag;
2508
2509	if (atflag & ~AT_SYMLINK_NOFOLLOW)
2510		return (EINVAL);
2511
2512	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2513}
2514
2515/*
2516 * Same as chflags() but doesn't follow symlinks.
2517 */
2518int
2519sys_lchflags(td, uap)
2520	struct thread *td;
2521	register struct lchflags_args /* {
2522		const char *path;
2523		u_long flags;
2524	} */ *uap;
2525{
2526
2527	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2528	    uap->flags, AT_SYMLINK_NOFOLLOW));
2529}
2530
2531static int
2532kern_chflagsat(struct thread *td, int fd, const char *path,
2533    enum uio_seg pathseg, u_long flags, int atflag)
2534{
2535	struct nameidata nd;
2536	cap_rights_t rights;
2537	int error, follow;
2538
2539	AUDIT_ARG_FFLAGS(flags);
2540	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2541	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2542	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
2543	if ((error = namei(&nd)) != 0)
2544		return (error);
2545	NDFREE(&nd, NDF_ONLY_PNBUF);
2546	error = setfflags(td, nd.ni_vp, flags);
2547	vrele(nd.ni_vp);
2548	return (error);
2549}
2550
2551/*
2552 * Change flags of a file given a file descriptor.
2553 */
2554#ifndef _SYS_SYSPROTO_H_
2555struct fchflags_args {
2556	int	fd;
2557	u_long	flags;
2558};
2559#endif
2560int
2561sys_fchflags(td, uap)
2562	struct thread *td;
2563	register struct fchflags_args /* {
2564		int fd;
2565		u_long flags;
2566	} */ *uap;
2567{
2568	struct file *fp;
2569	cap_rights_t rights;
2570	int error;
2571
2572	AUDIT_ARG_FD(uap->fd);
2573	AUDIT_ARG_FFLAGS(uap->flags);
2574	error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHFLAGS),
2575	    &fp);
2576	if (error != 0)
2577		return (error);
2578#ifdef AUDIT
2579	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2580	AUDIT_ARG_VNODE1(fp->f_vnode);
2581	VOP_UNLOCK(fp->f_vnode, 0);
2582#endif
2583	error = setfflags(td, fp->f_vnode, uap->flags);
2584	fdrop(fp, td);
2585	return (error);
2586}
2587
2588/*
2589 * Common implementation code for chmod(), lchmod() and fchmod().
2590 */
2591int
2592setfmode(td, cred, vp, mode)
2593	struct thread *td;
2594	struct ucred *cred;
2595	struct vnode *vp;
2596	int mode;
2597{
2598	struct mount *mp;
2599	struct vattr vattr;
2600	int error;
2601
2602	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2603		return (error);
2604	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2605	VATTR_NULL(&vattr);
2606	vattr.va_mode = mode & ALLPERMS;
2607#ifdef MAC
2608	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2609	if (error == 0)
2610#endif
2611		error = VOP_SETATTR(vp, &vattr, cred);
2612	VOP_UNLOCK(vp, 0);
2613	vn_finished_write(mp);
2614	return (error);
2615}
2616
2617/*
2618 * Change mode of a file given path name.
2619 */
2620#ifndef _SYS_SYSPROTO_H_
2621struct chmod_args {
2622	char	*path;
2623	int	mode;
2624};
2625#endif
2626int
2627sys_chmod(td, uap)
2628	struct thread *td;
2629	register struct chmod_args /* {
2630		char *path;
2631		int mode;
2632	} */ *uap;
2633{
2634
2635	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2636	    uap->mode, 0));
2637}
2638
2639#ifndef _SYS_SYSPROTO_H_
2640struct fchmodat_args {
2641	int	dirfd;
2642	char	*path;
2643	mode_t	mode;
2644	int	flag;
2645}
2646#endif
2647int
2648sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2649{
2650	int flag = uap->flag;
2651	int fd = uap->fd;
2652	char *path = uap->path;
2653	mode_t mode = uap->mode;
2654
2655	if (flag & ~AT_SYMLINK_NOFOLLOW)
2656		return (EINVAL);
2657
2658	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2659}
2660
2661/*
2662 * Change mode of a file given path name (don't follow links.)
2663 */
2664#ifndef _SYS_SYSPROTO_H_
2665struct lchmod_args {
2666	char	*path;
2667	int	mode;
2668};
2669#endif
2670int
2671sys_lchmod(td, uap)
2672	struct thread *td;
2673	register struct lchmod_args /* {
2674		char *path;
2675		int mode;
2676	} */ *uap;
2677{
2678
2679	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2680	    uap->mode, AT_SYMLINK_NOFOLLOW));
2681}
2682
2683int
2684kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2685    mode_t mode, int flag)
2686{
2687	struct nameidata nd;
2688	cap_rights_t rights;
2689	int error, follow;
2690
2691	AUDIT_ARG_MODE(mode);
2692	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2693	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2694	    cap_rights_init(&rights, CAP_FCHMOD), td);
2695	if ((error = namei(&nd)) != 0)
2696		return (error);
2697	NDFREE(&nd, NDF_ONLY_PNBUF);
2698	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2699	vrele(nd.ni_vp);
2700	return (error);
2701}
2702
2703/*
2704 * Change mode of a file given a file descriptor.
2705 */
2706#ifndef _SYS_SYSPROTO_H_
2707struct fchmod_args {
2708	int	fd;
2709	int	mode;
2710};
2711#endif
2712int
2713sys_fchmod(struct thread *td, struct fchmod_args *uap)
2714{
2715	struct file *fp;
2716	cap_rights_t rights;
2717	int error;
2718
2719	AUDIT_ARG_FD(uap->fd);
2720	AUDIT_ARG_MODE(uap->mode);
2721
2722	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2723	if (error != 0)
2724		return (error);
2725	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2726	fdrop(fp, td);
2727	return (error);
2728}
2729
2730/*
2731 * Common implementation for chown(), lchown(), and fchown()
2732 */
2733int
2734setfown(td, cred, vp, uid, gid)
2735	struct thread *td;
2736	struct ucred *cred;
2737	struct vnode *vp;
2738	uid_t uid;
2739	gid_t gid;
2740{
2741	struct mount *mp;
2742	struct vattr vattr;
2743	int error;
2744
2745	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2746		return (error);
2747	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2748	VATTR_NULL(&vattr);
2749	vattr.va_uid = uid;
2750	vattr.va_gid = gid;
2751#ifdef MAC
2752	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2753	    vattr.va_gid);
2754	if (error == 0)
2755#endif
2756		error = VOP_SETATTR(vp, &vattr, cred);
2757	VOP_UNLOCK(vp, 0);
2758	vn_finished_write(mp);
2759	return (error);
2760}
2761
2762/*
2763 * Set ownership given a path name.
2764 */
2765#ifndef _SYS_SYSPROTO_H_
2766struct chown_args {
2767	char	*path;
2768	int	uid;
2769	int	gid;
2770};
2771#endif
2772int
2773sys_chown(td, uap)
2774	struct thread *td;
2775	register struct chown_args /* {
2776		char *path;
2777		int uid;
2778		int gid;
2779	} */ *uap;
2780{
2781
2782	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
2783	    uap->gid, 0));
2784}
2785
2786#ifndef _SYS_SYSPROTO_H_
2787struct fchownat_args {
2788	int fd;
2789	const char * path;
2790	uid_t uid;
2791	gid_t gid;
2792	int flag;
2793};
2794#endif
2795int
2796sys_fchownat(struct thread *td, struct fchownat_args *uap)
2797{
2798	int flag;
2799
2800	flag = uap->flag;
2801	if (flag & ~AT_SYMLINK_NOFOLLOW)
2802		return (EINVAL);
2803
2804	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2805	    uap->gid, uap->flag));
2806}
2807
2808int
2809kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2810    int uid, int gid, int flag)
2811{
2812	struct nameidata nd;
2813	cap_rights_t rights;
2814	int error, follow;
2815
2816	AUDIT_ARG_OWNER(uid, gid);
2817	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2818	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2819	    cap_rights_init(&rights, CAP_FCHOWN), td);
2820
2821	if ((error = namei(&nd)) != 0)
2822		return (error);
2823	NDFREE(&nd, NDF_ONLY_PNBUF);
2824	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
2825	vrele(nd.ni_vp);
2826	return (error);
2827}
2828
2829/*
2830 * Set ownership given a path name, do not cross symlinks.
2831 */
2832#ifndef _SYS_SYSPROTO_H_
2833struct lchown_args {
2834	char	*path;
2835	int	uid;
2836	int	gid;
2837};
2838#endif
2839int
2840sys_lchown(td, uap)
2841	struct thread *td;
2842	register struct lchown_args /* {
2843		char *path;
2844		int uid;
2845		int gid;
2846	} */ *uap;
2847{
2848
2849	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2850	    uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
2851}
2852
2853/*
2854 * Set ownership given a file descriptor.
2855 */
2856#ifndef _SYS_SYSPROTO_H_
2857struct fchown_args {
2858	int	fd;
2859	int	uid;
2860	int	gid;
2861};
2862#endif
2863int
2864sys_fchown(td, uap)
2865	struct thread *td;
2866	register struct fchown_args /* {
2867		int fd;
2868		int uid;
2869		int gid;
2870	} */ *uap;
2871{
2872	struct file *fp;
2873	cap_rights_t rights;
2874	int error;
2875
2876	AUDIT_ARG_FD(uap->fd);
2877	AUDIT_ARG_OWNER(uap->uid, uap->gid);
2878	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
2879	if (error != 0)
2880		return (error);
2881	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
2882	fdrop(fp, td);
2883	return (error);
2884}
2885
2886/*
2887 * Common implementation code for utimes(), lutimes(), and futimes().
2888 */
2889static int
2890getutimes(usrtvp, tvpseg, tsp)
2891	const struct timeval *usrtvp;
2892	enum uio_seg tvpseg;
2893	struct timespec *tsp;
2894{
2895	struct timeval tv[2];
2896	const struct timeval *tvp;
2897	int error;
2898
2899	if (usrtvp == NULL) {
2900		vfs_timestamp(&tsp[0]);
2901		tsp[1] = tsp[0];
2902	} else {
2903		if (tvpseg == UIO_SYSSPACE) {
2904			tvp = usrtvp;
2905		} else {
2906			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
2907				return (error);
2908			tvp = tv;
2909		}
2910
2911		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
2912		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
2913			return (EINVAL);
2914		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
2915		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
2916	}
2917	return (0);
2918}
2919
2920/*
2921 * Common implementation code for futimens(), utimensat().
2922 */
2923#define	UTIMENS_NULL	0x1
2924#define	UTIMENS_EXIT	0x2
2925static int
2926getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
2927    struct timespec *tsp, int *retflags)
2928{
2929	struct timespec tsnow;
2930	int error;
2931
2932	vfs_timestamp(&tsnow);
2933	*retflags = 0;
2934	if (usrtsp == NULL) {
2935		tsp[0] = tsnow;
2936		tsp[1] = tsnow;
2937		*retflags |= UTIMENS_NULL;
2938		return (0);
2939	}
2940	if (tspseg == UIO_SYSSPACE) {
2941		tsp[0] = usrtsp[0];
2942		tsp[1] = usrtsp[1];
2943	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
2944		return (error);
2945	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
2946		*retflags |= UTIMENS_EXIT;
2947	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
2948		*retflags |= UTIMENS_NULL;
2949	if (tsp[0].tv_nsec == UTIME_OMIT)
2950		tsp[0].tv_sec = VNOVAL;
2951	else if (tsp[0].tv_nsec == UTIME_NOW)
2952		tsp[0] = tsnow;
2953	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
2954		return (EINVAL);
2955	if (tsp[1].tv_nsec == UTIME_OMIT)
2956		tsp[1].tv_sec = VNOVAL;
2957	else if (tsp[1].tv_nsec == UTIME_NOW)
2958		tsp[1] = tsnow;
2959	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
2960		return (EINVAL);
2961
2962	return (0);
2963}
2964
2965/*
2966 * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
2967 * and utimensat().
2968 */
2969static int
2970setutimes(td, vp, ts, numtimes, nullflag)
2971	struct thread *td;
2972	struct vnode *vp;
2973	const struct timespec *ts;
2974	int numtimes;
2975	int nullflag;
2976{
2977	struct mount *mp;
2978	struct vattr vattr;
2979	int error, setbirthtime;
2980
2981	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2982		return (error);
2983	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2984	setbirthtime = 0;
2985	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
2986	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
2987		setbirthtime = 1;
2988	VATTR_NULL(&vattr);
2989	vattr.va_atime = ts[0];
2990	vattr.va_mtime = ts[1];
2991	if (setbirthtime)
2992		vattr.va_birthtime = ts[1];
2993	if (numtimes > 2)
2994		vattr.va_birthtime = ts[2];
2995	if (nullflag)
2996		vattr.va_vaflags |= VA_UTIMES_NULL;
2997#ifdef MAC
2998	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
2999	    vattr.va_mtime);
3000#endif
3001	if (error == 0)
3002		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
3003	VOP_UNLOCK(vp, 0);
3004	vn_finished_write(mp);
3005	return (error);
3006}
3007
3008/*
3009 * Set the access and modification times of a file.
3010 */
3011#ifndef _SYS_SYSPROTO_H_
3012struct utimes_args {
3013	char	*path;
3014	struct	timeval *tptr;
3015};
3016#endif
3017int
3018sys_utimes(td, uap)
3019	struct thread *td;
3020	register struct utimes_args /* {
3021		char *path;
3022		struct timeval *tptr;
3023	} */ *uap;
3024{
3025
3026	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3027	    uap->tptr, UIO_USERSPACE));
3028}
3029
3030#ifndef _SYS_SYSPROTO_H_
3031struct futimesat_args {
3032	int fd;
3033	const char * path;
3034	const struct timeval * times;
3035};
3036#endif
3037int
3038sys_futimesat(struct thread *td, struct futimesat_args *uap)
3039{
3040
3041	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
3042	    uap->times, UIO_USERSPACE));
3043}
3044
3045int
3046kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3047    struct timeval *tptr, enum uio_seg tptrseg)
3048{
3049	struct nameidata nd;
3050	struct timespec ts[2];
3051	cap_rights_t rights;
3052	int error;
3053
3054	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3055		return (error);
3056	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
3057	    cap_rights_init(&rights, CAP_FUTIMES), td);
3058
3059	if ((error = namei(&nd)) != 0)
3060		return (error);
3061	NDFREE(&nd, NDF_ONLY_PNBUF);
3062	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3063	vrele(nd.ni_vp);
3064	return (error);
3065}
3066
3067/*
3068 * Set the access and modification times of a file.
3069 */
3070#ifndef _SYS_SYSPROTO_H_
3071struct lutimes_args {
3072	char	*path;
3073	struct	timeval *tptr;
3074};
3075#endif
3076int
3077sys_lutimes(td, uap)
3078	struct thread *td;
3079	register struct lutimes_args /* {
3080		char *path;
3081		struct timeval *tptr;
3082	} */ *uap;
3083{
3084
3085	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
3086	    UIO_USERSPACE));
3087}
3088
3089int
3090kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
3091    struct timeval *tptr, enum uio_seg tptrseg)
3092{
3093	struct timespec ts[2];
3094	struct nameidata nd;
3095	int error;
3096
3097	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
3098		return (error);
3099	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
3100	if ((error = namei(&nd)) != 0)
3101		return (error);
3102	NDFREE(&nd, NDF_ONLY_PNBUF);
3103	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
3104	vrele(nd.ni_vp);
3105	return (error);
3106}
3107
3108/*
3109 * Set the access and modification times of a file.
3110 */
3111#ifndef _SYS_SYSPROTO_H_
3112struct futimes_args {
3113	int	fd;
3114	struct	timeval *tptr;
3115};
3116#endif
3117int
3118sys_futimes(td, uap)
3119	struct thread *td;
3120	register struct futimes_args /* {
3121		int  fd;
3122		struct timeval *tptr;
3123	} */ *uap;
3124{
3125
3126	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
3127}
3128
3129int
3130kern_futimes(struct thread *td, int fd, struct timeval *tptr,
3131    enum uio_seg tptrseg)
3132{
3133	struct timespec ts[2];
3134	struct file *fp;
3135	cap_rights_t rights;
3136	int error;
3137
3138	AUDIT_ARG_FD(fd);
3139	error = getutimes(tptr, tptrseg, ts);
3140	if (error != 0)
3141		return (error);
3142	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp);
3143	if (error != 0)
3144		return (error);
3145#ifdef AUDIT
3146	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3147	AUDIT_ARG_VNODE1(fp->f_vnode);
3148	VOP_UNLOCK(fp->f_vnode, 0);
3149#endif
3150	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
3151	fdrop(fp, td);
3152	return (error);
3153}
3154
3155int
3156sys_futimens(struct thread *td, struct futimens_args *uap)
3157{
3158
3159	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
3160}
3161
3162int
3163kern_futimens(struct thread *td, int fd, struct timespec *tptr,
3164    enum uio_seg tptrseg)
3165{
3166	struct timespec ts[2];
3167	struct file *fp;
3168	cap_rights_t rights;
3169	int error, flags;
3170
3171	AUDIT_ARG_FD(fd);
3172	error = getutimens(tptr, tptrseg, ts, &flags);
3173	if (error != 0)
3174		return (error);
3175	if (flags & UTIMENS_EXIT)
3176		return (0);
3177	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp);
3178	if (error != 0)
3179		return (error);
3180#ifdef AUDIT
3181	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
3182	AUDIT_ARG_VNODE1(fp->f_vnode);
3183	VOP_UNLOCK(fp->f_vnode, 0);
3184#endif
3185	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
3186	fdrop(fp, td);
3187	return (error);
3188}
3189
3190int
3191sys_utimensat(struct thread *td, struct utimensat_args *uap)
3192{
3193
3194	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
3195	    uap->times, UIO_USERSPACE, uap->flag));
3196}
3197
3198int
3199kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
3200    struct timespec *tptr, enum uio_seg tptrseg, int flag)
3201{
3202	struct nameidata nd;
3203	struct timespec ts[2];
3204	cap_rights_t rights;
3205	int error, flags;
3206
3207	if (flag & ~AT_SYMLINK_NOFOLLOW)
3208		return (EINVAL);
3209
3210	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
3211		return (error);
3212	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
3213	    FOLLOW) | AUDITVNODE1, pathseg, path, fd,
3214	    cap_rights_init(&rights, CAP_FUTIMES), td);
3215	if ((error = namei(&nd)) != 0)
3216		return (error);
3217	/*
3218	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
3219	 * POSIX states:
3220	 * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
3221	 * "Search permission is denied by a component of the path prefix."
3222	 */
3223	NDFREE(&nd, NDF_ONLY_PNBUF);
3224	if ((flags & UTIMENS_EXIT) == 0)
3225		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
3226	vrele(nd.ni_vp);
3227	return (error);
3228}
3229
3230/*
3231 * Truncate a file given its path name.
3232 */
3233#ifndef _SYS_SYSPROTO_H_
3234struct truncate_args {
3235	char	*path;
3236	int	pad;
3237	off_t	length;
3238};
3239#endif
3240int
3241sys_truncate(td, uap)
3242	struct thread *td;
3243	register struct truncate_args /* {
3244		char *path;
3245		int pad;
3246		off_t length;
3247	} */ *uap;
3248{
3249
3250	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3251}
3252
/*
 * Set the size of the file named by path to length.
 *
 * Returns EINVAL for a negative length, EISDIR when the path resolves to
 * a directory, and otherwise the first error reported by namei(),
 * vn_start_write(), the MAC write check (when MAC is compiled in),
 * vn_writechk(), VOP_ACCESS() or VOP_SETATTR().
 */
int
kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
{
	struct mount *mp;
	struct vnode *vp;
	void *rl_cookie;
	struct vattr vattr;
	struct nameidata nd;
	int error;

	if (length < 0)
		return(EINVAL);
	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	/* Write range lock over [0, OFF_MAX), taken before the write starts. */
	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
		/* Undo the range lock and the lookup reference, then bail. */
		vn_rangelock_unlock(vp, rl_cookie);
		vrele(vp);
		return (error);
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/*
	 * One if/else-if chain: the first failing check sets error and the
	 * size change is only attempted when every check passed.
	 */
	if (vp->v_type == VDIR)
		error = EISDIR;
#ifdef MAC
	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
		/* MAC refused the write; error is already set. */
	}
#endif
	else if ((error = vn_writechk(vp)) == 0 &&
	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
	}
	/* Release in the reverse order of acquisition. */
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	vn_rangelock_unlock(vp, rl_cookie);
	vrele(vp);
	return (error);
}
3295
#if defined(COMPAT_43)
/*
 * COMPAT_43 truncate entry point: same as truncate(), but the length
 * argument is a long.  Forwards to kern_truncate().
 */
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
	char	*path;
	long	length;
};
#endif
int
otruncate(struct thread *td, struct otruncate_args *uap)
{
	int error;

	error = kern_truncate(td, uap->path, UIO_USERSPACE, uap->length);
	return (error);
}
#endif /* COMPAT_43 */
3313
#if defined(COMPAT_FREEBSD6)
/*
 * COMPAT_FREEBSD6 entry points, i.e. the variants whose argument
 * structures carry the pad member.  Both simply forward to the current
 * kern_*() implementation.
 */
int
freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
{
	int error;

	error = kern_truncate(td, uap->path, UIO_USERSPACE, uap->length);
	return (error);
}

int
freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
{
	int error;

	error = kern_ftruncate(td, uap->fd, uap->length);
	return (error);
}
#endif
3330
3331int
3332kern_fsync(struct thread *td, int fd, bool fullsync)
3333{
3334	struct vnode *vp;
3335	struct mount *mp;
3336	struct file *fp;
3337	cap_rights_t rights;
3338	int error, lock_flags;
3339
3340	AUDIT_ARG_FD(fd);
3341	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
3342	if (error != 0)
3343		return (error);
3344	vp = fp->f_vnode;
3345#if 0
3346	if (!fullsync)
3347		/* XXXKIB: compete outstanding aio writes */;
3348#endif
3349	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3350	if (error != 0)
3351		goto drop;
3352	if (MNT_SHARED_WRITES(mp) ||
3353	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3354		lock_flags = LK_SHARED;
3355	} else {
3356		lock_flags = LK_EXCLUSIVE;
3357	}
3358	vn_lock(vp, lock_flags | LK_RETRY);
3359	AUDIT_ARG_VNODE1(vp);
3360	if (vp->v_object != NULL) {
3361		VM_OBJECT_WLOCK(vp->v_object);
3362		vm_object_page_clean(vp->v_object, 0, 0, 0);
3363		VM_OBJECT_WUNLOCK(vp->v_object);
3364	}
3365	error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
3366	VOP_UNLOCK(vp, 0);
3367	vn_finished_write(mp);
3368drop:
3369	fdrop(fp, td);
3370	return (error);
3371}
3372
3373/*
3374 * Sync an open file.
3375 */
3376#ifndef _SYS_SYSPROTO_H_
3377struct fsync_args {
3378	int	fd;
3379};
3380#endif
3381int
3382sys_fsync(struct thread *td, struct fsync_args *uap)
3383{
3384
3385	return (kern_fsync(td, uap->fd, true));
3386}
3387
3388int
3389sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
3390{
3391
3392	return (kern_fsync(td, uap->fd, false));
3393}
3394
3395/*
3396 * Rename files.  Source and destination must either both be directories, or
3397 * both not be directories.  If target is a directory, it must be empty.
3398 */
3399#ifndef _SYS_SYSPROTO_H_
3400struct rename_args {
3401	char	*from;
3402	char	*to;
3403};
3404#endif
3405int
3406sys_rename(td, uap)
3407	struct thread *td;
3408	register struct rename_args /* {
3409		char *from;
3410		char *to;
3411	} */ *uap;
3412{
3413
3414	return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
3415	    uap->to, UIO_USERSPACE));
3416}
3417
3418#ifndef _SYS_SYSPROTO_H_
3419struct renameat_args {
3420	int	oldfd;
3421	char	*old;
3422	int	newfd;
3423	char	*new;
3424};
3425#endif
3426int
3427sys_renameat(struct thread *td, struct renameat_args *uap)
3428{
3429
3430	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3431	    UIO_USERSPACE));
3432}
3433
/*
 * Common implementation of rename() and renameat().
 *
 * Looks up the source (old, oldfd) and the target (new, newfd), runs the
 * type, capability and MAC checks below and, if all of them pass, hands
 * both lookups to VOP_RENAME().  If vn_start_write() cannot proceed
 * without sleeping, everything acquired so far is released, the function
 * waits for the suspension to end and starts over from "again".
 *
 * Internally error == -1 means "source and target are the same vnode,
 * nothing to do"; it is translated to 0 before returning.
 */
int
kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
    enum uio_seg pathseg)
{
	struct mount *mp = NULL;
	struct vnode *tvp, *fvp, *tdvp;
	struct nameidata fromnd, tond;
	cap_rights_t rights;
	int error;

again:
	bwillwrite();
	/*
	 * Source lookup.  With MAC the parent and leaf are returned locked
	 * so the MAC check can run; they are unlocked again right after it.
	 */
#ifdef MAC
	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
	    AUDITVNODE1, pathseg, old, oldfd,
	    cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td);
#else
	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
	    pathseg, old, oldfd,
	    cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td);
#endif

	if ((error = namei(&fromnd)) != 0)
		return (error);
#ifdef MAC
	/*
	 * NOTE(review): the result of this check is overwritten by the
	 * target namei() below before it is examined - confirm intent.
	 */
	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
	    fromnd.ni_vp, &fromnd.ni_cnd);
	VOP_UNLOCK(fromnd.ni_dvp, 0);
	if (fromnd.ni_dvp != fromnd.ni_vp)
		VOP_UNLOCK(fromnd.ni_vp, 0);
#endif
	fvp = fromnd.ni_vp;
	/* Target lookup; WILLBEDIR is added when the source is a directory. */
	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
	    cap_rights_init(&rights, CAP_RENAMEAT_TARGET), td);
	if (fromnd.ni_vp->v_type == VDIR)
		tond.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&tond)) != 0) {
		/* Translate error code for rename("dir1", "dir2/."). */
		if (error == EISDIR && fvp->v_type == VDIR)
			error = EINVAL;
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		goto out1;
	}
	tdvp = tond.ni_dvp;
	tvp = tond.ni_vp;
	error = vn_start_write(fvp, &mp, V_NOWAIT);
	if (error != 0) {
		/*
		 * Could not start the write without waiting: release both
		 * lookups, wait for the suspension to end, then retry.
		 */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
		if (tvp != NULL)
			vput(tvp);
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
		vrele(tond.ni_startdir);
		if (fromnd.ni_startdir != NULL)
			vrele(fromnd.ni_startdir);
		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
		if (error != 0)
			return (error);
		goto again;
	}
	if (tvp != NULL) {
		/* An existing target must agree with the source on VDIR. */
		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
			error = ENOTDIR;
			goto out;
		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
			error = EISDIR;
			goto out;
		}
#ifdef CAPABILITIES
		if (newfd != AT_FDCWD) {
			/*
			 * If the target already exists we require CAP_UNLINKAT
			 * from 'newfd'.
			 */
			error = cap_check(&tond.ni_filecaps.fc_rights,
			    cap_rights_init(&rights, CAP_UNLINKAT));
			if (error != 0)
				goto out;
		}
#endif
	}
	/* The source may not be the directory that will hold the target. */
	if (fvp == tdvp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * If the source is the same as the destination (that is, if they
	 * are links to the same vnode), then there is nothing to do.
	 */
	if (fvp == tvp)
		error = -1;
#ifdef MAC
	else
		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
	if (error == 0) {
		/*
		 * No vnode releases are done here on this path, only the
		 * path name buffers are freed after VOP_RENAME().
		 */
		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
	} else {
		/* VOP_RENAME() was not called: release everything here. */
		NDFREE(&fromnd, NDF_ONLY_PNBUF);
		NDFREE(&tond, NDF_ONLY_PNBUF);
		if (tvp != NULL)
			vput(tvp);
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		vrele(fromnd.ni_dvp);
		vrele(fvp);
	}
	vrele(tond.ni_startdir);
	vn_finished_write(mp);
out1:
	if (fromnd.ni_startdir)
		vrele(fromnd.ni_startdir);
	/* -1 is the internal "nothing to do" marker, reported as success. */
	if (error == -1)
		return (0);
	return (error);
}
3565
3566/*
3567 * Make a directory file.
3568 */
3569#ifndef _SYS_SYSPROTO_H_
3570struct mkdir_args {
3571	char	*path;
3572	int	mode;
3573};
3574#endif
3575int
3576sys_mkdir(td, uap)
3577	struct thread *td;
3578	register struct mkdir_args /* {
3579		char *path;
3580		int mode;
3581	} */ *uap;
3582{
3583
3584	return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3585	    uap->mode));
3586}
3587
3588#ifndef _SYS_SYSPROTO_H_
3589struct mkdirat_args {
3590	int	fd;
3591	char	*path;
3592	mode_t	mode;
3593};
3594#endif
3595int
3596sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3597{
3598
3599	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3600}
3601
/*
 * Common implementation of mkdir() and mkdirat().
 *
 * Looks up path for CREATE (fd handed to the lookup, CAP_MKDIRAT as the
 * required capability right) and creates a directory there with
 * VOP_MKDIR().  The new directory's mode is (mode & ACCESSPERMS) with the
 * bits of the process's fd_cmask cleared.
 *
 * Returns EEXIST if the name already exists, otherwise the first error
 * from namei(), vn_start_write(), the MAC create check (when MAC is
 * compiled in) or VOP_MKDIR().
 */
int
kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
    int mode)
{
	struct mount *mp;
	struct vnode *vp;
	struct vattr vattr;
	struct nameidata nd;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_MODE(mode);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
	    NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
	    td);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp != NULL) {
		/* The name already exists: release everything and fail. */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		/*
		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
		 * the strange behaviour of leaving the vnode unlocked
		 * if the target is the same vnode as the parent.
		 */
		if (vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		return (EEXIST);
	}
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * Could not start the write without waiting: release the
		 * lookup, wait for the suspension to end, then redo the
		 * lookup from scratch.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VDIR;
	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error != 0)
		goto out;
#endif
	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	/* nd.ni_vp is only released when VOP_MKDIR() succeeded. */
	if (error == 0)
		vput(nd.ni_vp);
	vn_finished_write(mp);
	return (error);
}
3664
3665/*
3666 * Remove a directory file.
3667 */
3668#ifndef _SYS_SYSPROTO_H_
3669struct rmdir_args {
3670	char	*path;
3671};
3672#endif
3673int
3674sys_rmdir(td, uap)
3675	struct thread *td;
3676	struct rmdir_args /* {
3677		char *path;
3678	} */ *uap;
3679{
3680
3681	return (kern_rmdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE));
3682}
3683
/*
 * Common implementation of rmdir(); also usable with a directory
 * descriptor.
 *
 * Looks up path for DELETE with parent and leaf locked (fd handed to the
 * lookup, CAP_UNLINKAT as the required capability right) and removes the
 * directory with VOP_RMDIR().
 *
 * Returns ENOTDIR if the leaf is not a directory, EINVAL if the leaf is
 * its own parent ("."), EBUSY if the leaf has VV_ROOT set, otherwise the
 * first error from namei(), the MAC unlink check (when MAC is compiled
 * in), vn_start_write() or VOP_RMDIR().
 */
int
kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
{
	struct mount *mp;
	struct vnode *vp;
	struct nameidata nd;
	cap_rights_t rights;
	int error;

restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}
	/*
	 * No rmdir "." please.
	 */
	if (nd.ni_dvp == vp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * The root of a mounted filesystem cannot be deleted.
	 */
	if (vp->v_vflag & VV_ROOT) {
		error = EBUSY;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
	    &nd.ni_cnd);
	if (error != 0)
		goto out;
#endif
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		/*
		 * Could not start the write without waiting: release the
		 * lookup, wait for the suspension to end, then redo the
		 * lookup from scratch.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(vp);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
	vn_finished_write(mp);
out:
	/* Common exit: release the path buffer, the leaf and the parent. */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(vp);
	if (nd.ni_dvp == vp)
		vrele(nd.ni_dvp);
	else
		vput(nd.ni_dvp);
	return (error);
}
3747
3748#ifdef COMPAT_43
3749/*
3750 * Read a block of directory entries in a filesystem independent format.
3751 */
3752#ifndef _SYS_SYSPROTO_H_
3753struct ogetdirentries_args {
3754	int	fd;
3755	char	*buf;
3756	u_int	count;
3757	long	*basep;
3758};
3759#endif
3760int
3761ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3762{
3763	long loff;
3764	int error;
3765
3766	error = kern_ogetdirentries(td, uap, &loff);
3767	if (error == 0)
3768		error = copyout(&loff, uap->basep, sizeof(long));
3769	return (error);
3770}
3771
/*
 * Implementation of the COMPAT_43 getdirentries call.
 *
 * Reads up to uap->count bytes of directory entries from the directory
 * open on uap->fd into the user buffer uap->buf, rewriting the d_type and
 * d_namlen members of each entry as described in the loop below.  On
 * success the number of bytes transferred is stored in td->td_retval[0]
 * and the file offset the read started from is stored in *ploff.
 *
 * Returns EINVAL if count exceeds 64KB or the vnode is not a directory,
 * EBADF if the file is not open for reading, EIO if an entry with a zero
 * d_reclen is found, otherwise the first error from getvnode(), the MAC
 * readdir check (when MAC is compiled in), VOP_READDIR() or uiomove().
 */
int
kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
    long *ploff)
{
	struct vnode *vp;
	struct file *fp;
	struct uio auio, kuio;
	struct iovec aiov, kiov;
	struct dirent *dp, *edp;
	cap_rights_t rights;
	caddr_t dirbuf;
	int error, eofflag, readcnt;
	long loff;
	off_t foffset;

	/* XXX arbitrary sanity limit on `count'. */
	if (uap->count > 64 * 1024)
		return (EINVAL);
	error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_READ), &fp);
	if (error != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	vp = fp->f_vnode;
	foffset = foffset_lock(fp, 0);
unionread:
	if (vp->v_type != VDIR) {
		foffset_unlock(fp, foffset, 0);
		fdrop(fp, td);
		return (EINVAL);
	}
	/* auio describes the caller's buffer in user space. */
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_resid = uap->count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	/* Remember the starting offset; it is what *ploff reports. */
	loff = auio.uio_offset = foffset;
#ifdef MAC
	error = mac_vnode_check_readdir(td->td_ucred, vp);
	if (error != 0) {
		VOP_UNLOCK(vp, 0);
		foffset_unlock(fp, foffset, FOF_NOUPDATE);
		fdrop(fp, td);
		return (error);
	}
#endif
	/*
	 * On non-little-endian machines a mount with mnt_maxsymlinklen <= 0
	 * is read straight into the user buffer without any rewriting.
	 */
#	if (BYTE_ORDER != LITTLE_ENDIAN)
		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
			    NULL, NULL);
			foffset = auio.uio_offset;
		} else
#	endif
	{
		/*
		 * Read into a temporary kernel buffer, fix up every entry
		 * there, then move the result to the user buffer.
		 */
		kuio = auio;
		kuio.uio_iov = &kiov;
		kuio.uio_segflg = UIO_SYSSPACE;
		kiov.iov_len = uap->count;
		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
		kiov.iov_base = dirbuf;
		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
			    NULL, NULL);
		foffset = kuio.uio_offset;
		if (error == 0) {
			/* readcnt is the number of bytes VOP_READDIR() produced. */
			readcnt = uap->count - kuio.uio_resid;
			edp = (struct dirent *)&dirbuf[readcnt];
			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
#				if (BYTE_ORDER == LITTLE_ENDIAN)
					/*
					 * The expected low byte of
					 * dp->d_namlen is our dp->d_type.
					 * The high MBZ byte of dp->d_namlen
					 * is our dp->d_namlen.
					 */
					dp->d_type = dp->d_namlen;
					dp->d_namlen = 0;
#				else
					/*
					 * The dp->d_type is the high byte
					 * of the expected dp->d_namlen,
					 * so must be zero'ed.
					 */
					dp->d_type = 0;
#				endif
				/* A zero record length would loop forever. */
				if (dp->d_reclen > 0) {
					dp = (struct dirent *)
					    ((char *)dp + dp->d_reclen);
				} else {
					error = EIO;
					break;
				}
			}
			/* Only copy out when the whole buffer was walked. */
			if (dp >= edp)
				error = uiomove(dirbuf, readcnt, &auio);
		}
		free(dirbuf, M_TEMP);
	}
	if (error != 0) {
		VOP_UNLOCK(vp, 0);
		foffset_unlock(fp, foffset, 0);
		fdrop(fp, td);
		return (error);
	}
	/*
	 * Nothing was transferred and vp is the root of a MNT_UNION mount:
	 * switch the file over to the covered vnode and read again from
	 * offset 0.
	 */
	if (uap->count == auio.uio_resid &&
	    (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		VREF(vp);
		fp->f_vnode = vp;
		fp->f_data = vp;
		foffset = 0;
		vput(tvp);
		goto unionread;
	}
	VOP_UNLOCK(vp, 0);
	foffset_unlock(fp, foffset, 0);
	fdrop(fp, td);
	td->td_retval[0] = uap->count - auio.uio_resid;
	/* error is always 0 here; the non-zero case returned above. */
	if (error == 0)
		*ploff = loff;
	return (error);
}
3901#endif /* COMPAT_43 */
3902
3903/*
3904 * Read a block of directory entries in a filesystem independent format.
3905 */
3906#ifndef _SYS_SYSPROTO_H_
3907struct getdirentries_args {
3908	int	fd;
3909	char	*buf;
3910	u_int	count;
3911	long	*basep;
3912};
3913#endif
3914int
3915sys_getdirentries(td, uap)
3916	struct thread *td;
3917	register struct getdirentries_args /* {
3918		int fd;
3919		char *buf;
3920		u_int count;
3921		long *basep;
3922	} */ *uap;
3923{
3924	long base;
3925	int error;
3926
3927	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
3928	    NULL, UIO_USERSPACE);
3929	if (error != 0)
3930		return (error);
3931	if (uap->basep != NULL)
3932		error = copyout(&base, uap->basep, sizeof(long));
3933	return (error);
3934}
3935
3936int
3937kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
3938    long *basep, ssize_t *residp, enum uio_seg bufseg)
3939{
3940	struct vnode *vp;
3941	struct file *fp;
3942	struct uio auio;
3943	struct iovec aiov;
3944	cap_rights_t rights;
3945	long loff;
3946	int error, eofflag;
3947	off_t foffset;
3948
3949	AUDIT_ARG_FD(fd);
3950	if (count > IOSIZE_MAX)
3951		return (EINVAL);
3952	auio.uio_resid = count;
3953	error = getvnode(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
3954	if (error != 0)
3955		return (error);
3956	if ((fp->f_flag & FREAD) == 0) {
3957		fdrop(fp, td);
3958		return (EBADF);
3959	}
3960	vp = fp->f_vnode;
3961	foffset = foffset_lock(fp, 0);
3962unionread:
3963	if (vp->v_type != VDIR) {
3964		error = EINVAL;
3965		goto fail;
3966	}
3967	aiov.iov_base = buf;
3968	aiov.iov_len = count;
3969	auio.uio_iov = &aiov;
3970	auio.uio_iovcnt = 1;
3971	auio.uio_rw = UIO_READ;
3972	auio.uio_segflg = bufseg;
3973	auio.uio_td = td;
3974	vn_lock(vp, LK_SHARED | LK_RETRY);
3975	AUDIT_ARG_VNODE1(vp);
3976	loff = auio.uio_offset = foffset;
3977#ifdef MAC
3978	error = mac_vnode_check_readdir(td->td_ucred, vp);
3979	if (error == 0)
3980#endif
3981		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
3982		    NULL);
3983	foffset = auio.uio_offset;
3984	if (error != 0) {
3985		VOP_UNLOCK(vp, 0);
3986		goto fail;
3987	}
3988	if (count == auio.uio_resid &&
3989	    (vp->v_vflag & VV_ROOT) &&
3990	    (vp->v_mount->mnt_flag & MNT_UNION)) {
3991		struct vnode *tvp = vp;
3992
3993		vp = vp->v_mount->mnt_vnodecovered;
3994		VREF(vp);
3995		fp->f_vnode = vp;
3996		fp->f_data = vp;
3997		foffset = 0;
3998		vput(tvp);
3999		goto unionread;
4000	}
4001	VOP_UNLOCK(vp, 0);
4002	*basep = loff;
4003	if (residp != NULL)
4004		*residp = auio.uio_resid;
4005	td->td_retval[0] = count - auio.uio_resid;
4006fail:
4007	foffset_unlock(fp, foffset, 0);
4008	fdrop(fp, td);
4009	return (error);
4010}
4011
4012#ifndef _SYS_SYSPROTO_H_
4013struct getdents_args {
4014	int fd;
4015	char *buf;
4016	size_t count;
4017};
4018#endif
4019int
4020sys_getdents(td, uap)
4021	struct thread *td;
4022	register struct getdents_args /* {
4023		int fd;
4024		char *buf;
4025		u_int count;
4026	} */ *uap;
4027{
4028	struct getdirentries_args ap;
4029
4030	ap.fd = uap->fd;
4031	ap.buf = uap->buf;
4032	ap.count = uap->count;
4033	ap.basep = NULL;
4034	return (sys_getdirentries(td, &ap));
4035}
4036
4037/*
4038 * Set the mode mask for creation of filesystem nodes.
4039 */
4040#ifndef _SYS_SYSPROTO_H_
4041struct umask_args {
4042	int	newmask;
4043};
4044#endif
4045int
4046sys_umask(td, uap)
4047	struct thread *td;
4048	struct umask_args /* {
4049		int newmask;
4050	} */ *uap;
4051{
4052	struct filedesc *fdp;
4053
4054	fdp = td->td_proc->p_fd;
4055	FILEDESC_XLOCK(fdp);
4056	td->td_retval[0] = fdp->fd_cmask;
4057	fdp->fd_cmask = uap->newmask & ALLPERMS;
4058	FILEDESC_XUNLOCK(fdp);
4059	return (0);
4060}
4061
4062/*
4063 * Void all references to file by ripping underlying filesystem away from
4064 * vnode.
4065 */
4066#ifndef _SYS_SYSPROTO_H_
4067struct revoke_args {
4068	char	*path;
4069};
4070#endif
4071int
4072sys_revoke(td, uap)
4073	struct thread *td;
4074	register struct revoke_args /* {
4075		char *path;
4076	} */ *uap;
4077{
4078	struct vnode *vp;
4079	struct vattr vattr;
4080	struct nameidata nd;
4081	int error;
4082
4083	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4084	    uap->path, td);
4085	if ((error = namei(&nd)) != 0)
4086		return (error);
4087	vp = nd.ni_vp;
4088	NDFREE(&nd, NDF_ONLY_PNBUF);
4089	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
4090		error = EINVAL;
4091		goto out;
4092	}
4093#ifdef MAC
4094	error = mac_vnode_check_revoke(td->td_ucred, vp);
4095	if (error != 0)
4096		goto out;
4097#endif
4098	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
4099	if (error != 0)
4100		goto out;
4101	if (td->td_ucred->cr_uid != vattr.va_uid) {
4102		error = priv_check(td, PRIV_VFS_ADMIN);
4103		if (error != 0)
4104			goto out;
4105	}
4106	if (vcount(vp) > 1)
4107		VOP_REVOKE(vp, REVOKEALL);
4108out:
4109	vput(vp);
4110	return (error);
4111}
4112
4113/*
4114 * Convert a user file descriptor to a kernel file entry and check that, if it
4115 * is a capability, the correct rights are present. A reference on the file
4116 * entry is held upon returning.
4117 */
4118int
4119getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
4120{
4121	struct file *fp;
4122	int error;
4123
4124	error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL);
4125	if (error != 0)
4126		return (error);
4127
4128	/*
4129	 * The file could be not of the vnode type, or it may be not
4130	 * yet fully initialized, in which case the f_vnode pointer
4131	 * may be set, but f_ops is still badfileops.  E.g.,
4132	 * devfs_open() transiently create such situation to
4133	 * facilitate csw d_fdopen().
4134	 *
4135	 * Dupfdopen() handling in kern_openat() installs the
4136	 * half-baked file into the process descriptor table, allowing
4137	 * other thread to dereference it. Guard against the race by
4138	 * checking f_ops.
4139	 */
4140	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
4141		fdrop(fp, td);
4142		return (EINVAL);
4143	}
4144	*fpp = fp;
4145	return (0);
4146}
4147
4148
4149/*
4150 * Get an (NFS) file handle.
4151 */
4152#ifndef _SYS_SYSPROTO_H_
4153struct lgetfh_args {
4154	char	*fname;
4155	fhandle_t *fhp;
4156};
4157#endif
4158int
4159sys_lgetfh(td, uap)
4160	struct thread *td;
4161	register struct lgetfh_args *uap;
4162{
4163	struct nameidata nd;
4164	fhandle_t fh;
4165	register struct vnode *vp;
4166	int error;
4167
4168	error = priv_check(td, PRIV_VFS_GETFH);
4169	if (error != 0)
4170		return (error);
4171	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4172	    uap->fname, td);
4173	error = namei(&nd);
4174	if (error != 0)
4175		return (error);
4176	NDFREE(&nd, NDF_ONLY_PNBUF);
4177	vp = nd.ni_vp;
4178	bzero(&fh, sizeof(fh));
4179	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4180	error = VOP_VPTOFH(vp, &fh.fh_fid);
4181	vput(vp);
4182	if (error == 0)
4183		error = copyout(&fh, uap->fhp, sizeof (fh));
4184	return (error);
4185}
4186
4187#ifndef _SYS_SYSPROTO_H_
4188struct getfh_args {
4189	char	*fname;
4190	fhandle_t *fhp;
4191};
4192#endif
4193int
4194sys_getfh(td, uap)
4195	struct thread *td;
4196	register struct getfh_args *uap;
4197{
4198	struct nameidata nd;
4199	fhandle_t fh;
4200	register struct vnode *vp;
4201	int error;
4202
4203	error = priv_check(td, PRIV_VFS_GETFH);
4204	if (error != 0)
4205		return (error);
4206	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
4207	    uap->fname, td);
4208	error = namei(&nd);
4209	if (error != 0)
4210		return (error);
4211	NDFREE(&nd, NDF_ONLY_PNBUF);
4212	vp = nd.ni_vp;
4213	bzero(&fh, sizeof(fh));
4214	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
4215	error = VOP_VPTOFH(vp, &fh.fh_fid);
4216	vput(vp);
4217	if (error == 0)
4218		error = copyout(&fh, uap->fhp, sizeof (fh));
4219	return (error);
4220}
4221
4222/*
4223 * syscall for the rpc.lockd to use to translate a NFS file handle into an
4224 * open descriptor.
4225 *
4226 * warning: do not remove the priv_check() call or this becomes one giant
4227 * security hole.
4228 */
4229#ifndef _SYS_SYSPROTO_H_
4230struct fhopen_args {
4231	const struct fhandle *u_fhp;
4232	int flags;
4233};
4234#endif
4235int
4236sys_fhopen(td, uap)
4237	struct thread *td;
4238	struct fhopen_args /* {
4239		const struct fhandle *u_fhp;
4240		int flags;
4241	} */ *uap;
4242{
4243	struct mount *mp;
4244	struct vnode *vp;
4245	struct fhandle fhp;
4246	struct file *fp;
4247	int fmode, error;
4248	int indx;
4249
4250	error = priv_check(td, PRIV_VFS_FHOPEN);
4251	if (error != 0)
4252		return (error);
4253	indx = -1;
4254	fmode = FFLAGS(uap->flags);
4255	/* why not allow a non-read/write open for our lockd? */
4256	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
4257		return (EINVAL);
4258	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
4259	if (error != 0)
4260		return(error);
4261	/* find the mount point */
4262	mp = vfs_busyfs(&fhp.fh_fsid);
4263	if (mp == NULL)
4264		return (ESTALE);
4265	/* now give me my vnode, it gets returned to me locked */
4266	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
4267	vfs_unbusy(mp);
4268	if (error != 0)
4269		return (error);
4270
4271	error = falloc_noinstall(td, &fp);
4272	if (error != 0) {
4273		vput(vp);
4274		return (error);
4275	}
4276	/*
4277	 * An extra reference on `fp' has been held for us by
4278	 * falloc_noinstall().
4279	 */
4280
4281#ifdef INVARIANTS
4282	td->td_dupfd = -1;
4283#endif
4284	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4285	if (error != 0) {
4286		KASSERT(fp->f_ops == &badfileops,
4287		    ("VOP_OPEN in fhopen() set f_ops"));
4288		KASSERT(td->td_dupfd < 0,
4289		    ("fhopen() encountered fdopen()"));
4290
4291		vput(vp);
4292		goto bad;
4293	}
4294#ifdef INVARIANTS
4295	td->td_dupfd = 0;
4296#endif
4297	fp->f_vnode = vp;
4298	fp->f_seqcount = 1;
4299	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4300	    &vnops);
4301	VOP_UNLOCK(vp, 0);
4302	if ((fmode & O_TRUNC) != 0) {
4303		error = fo_truncate(fp, 0, td->td_ucred, td);
4304		if (error != 0)
4305			goto bad;
4306	}
4307
4308	error = finstall(td, fp, &indx, fmode, NULL);
4309bad:
4310	fdrop(fp, td);
4311	td->td_retval[0] = indx;
4312	return (error);
4313}
4314
4315/*
4316 * Stat an (NFS) file handle.
4317 */
4318#ifndef _SYS_SYSPROTO_H_
4319struct fhstat_args {
4320	struct fhandle *u_fhp;
4321	struct stat *sb;
4322};
4323#endif
4324int
4325sys_fhstat(td, uap)
4326	struct thread *td;
4327	register struct fhstat_args /* {
4328		struct fhandle *u_fhp;
4329		struct stat *sb;
4330	} */ *uap;
4331{
4332	struct stat sb;
4333	struct fhandle fh;
4334	int error;
4335
4336	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4337	if (error != 0)
4338		return (error);
4339	error = kern_fhstat(td, fh, &sb);
4340	if (error == 0)
4341		error = copyout(&sb, uap->sb, sizeof(sb));
4342	return (error);
4343}
4344
4345int
4346kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4347{
4348	struct mount *mp;
4349	struct vnode *vp;
4350	int error;
4351
4352	error = priv_check(td, PRIV_VFS_FHSTAT);
4353	if (error != 0)
4354		return (error);
4355	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4356		return (ESTALE);
4357	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4358	vfs_unbusy(mp);
4359	if (error != 0)
4360		return (error);
4361	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4362	vput(vp);
4363	return (error);
4364}
4365
4366/*
4367 * Implement fstatfs() for (NFS) file handles.
4368 */
4369#ifndef _SYS_SYSPROTO_H_
4370struct fhstatfs_args {
4371	struct fhandle *u_fhp;
4372	struct statfs *buf;
4373};
4374#endif
4375int
4376sys_fhstatfs(td, uap)
4377	struct thread *td;
4378	struct fhstatfs_args /* {
4379		struct fhandle *u_fhp;
4380		struct statfs *buf;
4381	} */ *uap;
4382{
4383	struct statfs *sfp;
4384	fhandle_t fh;
4385	int error;
4386
4387	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4388	if (error != 0)
4389		return (error);
4390	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
4391	error = kern_fhstatfs(td, fh, sfp);
4392	if (error == 0)
4393		error = copyout(sfp, uap->buf, sizeof(*sfp));
4394	free(sfp, M_STATFS);
4395	return (error);
4396}
4397
4398int
4399kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4400{
4401	struct statfs *sp;
4402	struct mount *mp;
4403	struct vnode *vp;
4404	int error;
4405
4406	error = priv_check(td, PRIV_VFS_FHSTATFS);
4407	if (error != 0)
4408		return (error);
4409	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4410		return (ESTALE);
4411	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4412	if (error != 0) {
4413		vfs_unbusy(mp);
4414		return (error);
4415	}
4416	vput(vp);
4417	error = prison_canseemount(td->td_ucred, mp);
4418	if (error != 0)
4419		goto out;
4420#ifdef MAC
4421	error = mac_mount_check_stat(td->td_ucred, mp);
4422	if (error != 0)
4423		goto out;
4424#endif
4425	/*
4426	 * Set these in case the underlying filesystem fails to do so.
4427	 */
4428	sp = &mp->mnt_stat;
4429	sp->f_version = STATFS_VERSION;
4430	sp->f_namemax = NAME_MAX;
4431	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4432	error = VFS_STATFS(mp, sp);
4433	if (error == 0)
4434		*buf = *sp;
4435out:
4436	vfs_unbusy(mp);
4437	return (error);
4438}
4439
4440int
4441kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4442{
4443	struct file *fp;
4444	struct mount *mp;
4445	struct vnode *vp;
4446	cap_rights_t rights;
4447	off_t olen, ooffset;
4448	int error;
4449
4450	if (offset < 0 || len <= 0)
4451		return (EINVAL);
4452	/* Check for wrap. */
4453	if (offset > OFF_MAX - len)
4454		return (EFBIG);
4455	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4456	if (error != 0)
4457		return (error);
4458	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4459		error = ESPIPE;
4460		goto out;
4461	}
4462	if ((fp->f_flag & FWRITE) == 0) {
4463		error = EBADF;
4464		goto out;
4465	}
4466	if (fp->f_type != DTYPE_VNODE) {
4467		error = ENODEV;
4468		goto out;
4469	}
4470	vp = fp->f_vnode;
4471	if (vp->v_type != VREG) {
4472		error = ENODEV;
4473		goto out;
4474	}
4475
4476	/* Allocating blocks may take a long time, so iterate. */
4477	for (;;) {
4478		olen = len;
4479		ooffset = offset;
4480
4481		bwillwrite();
4482		mp = NULL;
4483		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4484		if (error != 0)
4485			break;
4486		error = vn_lock(vp, LK_EXCLUSIVE);
4487		if (error != 0) {
4488			vn_finished_write(mp);
4489			break;
4490		}
4491#ifdef MAC
4492		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4493		if (error == 0)
4494#endif
4495			error = VOP_ALLOCATE(vp, &offset, &len);
4496		VOP_UNLOCK(vp, 0);
4497		vn_finished_write(mp);
4498
4499		if (olen + ooffset != offset + len) {
4500			panic("offset + len changed from %jx/%jx to %jx/%jx",
4501			    ooffset, olen, offset, len);
4502		}
4503		if (error != 0 || len == 0)
4504			break;
4505		KASSERT(olen > len, ("Iteration did not make progress?"));
4506		maybe_yield();
4507	}
4508 out:
4509	fdrop(fp, td);
4510	return (error);
4511}
4512
4513int
4514sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4515{
4516	int error;
4517
4518	error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len);
4519	return (kern_posix_error(td, error));
4520}
4521
4522/*
4523 * Unlike madvise(2), we do not make a best effort to remember every
4524 * possible caching hint.  Instead, we remember the last setting with
4525 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4526 * region of any current setting.
4527 */
4528int
4529kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4530    int advice)
4531{
4532	struct fadvise_info *fa, *new;
4533	struct file *fp;
4534	struct vnode *vp;
4535	cap_rights_t rights;
4536	off_t end;
4537	int error;
4538
4539	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4540		return (EINVAL);
4541	switch (advice) {
4542	case POSIX_FADV_SEQUENTIAL:
4543	case POSIX_FADV_RANDOM:
4544	case POSIX_FADV_NOREUSE:
4545		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4546		break;
4547	case POSIX_FADV_NORMAL:
4548	case POSIX_FADV_WILLNEED:
4549	case POSIX_FADV_DONTNEED:
4550		new = NULL;
4551		break;
4552	default:
4553		return (EINVAL);
4554	}
4555	/* XXX: CAP_POSIX_FADVISE? */
4556	error = fget(td, fd, cap_rights_init(&rights), &fp);
4557	if (error != 0)
4558		goto out;
4559	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4560		error = ESPIPE;
4561		goto out;
4562	}
4563	if (fp->f_type != DTYPE_VNODE) {
4564		error = ENODEV;
4565		goto out;
4566	}
4567	vp = fp->f_vnode;
4568	if (vp->v_type != VREG) {
4569		error = ENODEV;
4570		goto out;
4571	}
4572	if (len == 0)
4573		end = OFF_MAX;
4574	else
4575		end = offset + len - 1;
4576	switch (advice) {
4577	case POSIX_FADV_SEQUENTIAL:
4578	case POSIX_FADV_RANDOM:
4579	case POSIX_FADV_NOREUSE:
4580		/*
4581		 * Try to merge any existing non-standard region with
4582		 * this new region if possible, otherwise create a new
4583		 * non-standard region for this request.
4584		 */
4585		mtx_pool_lock(mtxpool_sleep, fp);
4586		fa = fp->f_advice;
4587		if (fa != NULL && fa->fa_advice == advice &&
4588		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4589		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4590		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4591			if (offset < fa->fa_start)
4592				fa->fa_start = offset;
4593			if (end > fa->fa_end)
4594				fa->fa_end = end;
4595		} else {
4596			new->fa_advice = advice;
4597			new->fa_start = offset;
4598			new->fa_end = end;
4599			fp->f_advice = new;
4600			new = fa;
4601		}
4602		mtx_pool_unlock(mtxpool_sleep, fp);
4603		break;
4604	case POSIX_FADV_NORMAL:
4605		/*
4606		 * If a the "normal" region overlaps with an existing
4607		 * non-standard region, trim or remove the
4608		 * non-standard region.
4609		 */
4610		mtx_pool_lock(mtxpool_sleep, fp);
4611		fa = fp->f_advice;
4612		if (fa != NULL) {
4613			if (offset <= fa->fa_start && end >= fa->fa_end) {
4614				new = fa;
4615				fp->f_advice = NULL;
4616			} else if (offset <= fa->fa_start &&
4617			    end >= fa->fa_start)
4618				fa->fa_start = end + 1;
4619			else if (offset <= fa->fa_end && end >= fa->fa_end)
4620				fa->fa_end = offset - 1;
4621			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4622				/*
4623				 * If the "normal" region is a middle
4624				 * portion of the existing
4625				 * non-standard region, just remove
4626				 * the whole thing rather than picking
4627				 * one side or the other to
4628				 * preserve.
4629				 */
4630				new = fa;
4631				fp->f_advice = NULL;
4632			}
4633		}
4634		mtx_pool_unlock(mtxpool_sleep, fp);
4635		break;
4636	case POSIX_FADV_WILLNEED:
4637	case POSIX_FADV_DONTNEED:
4638		error = VOP_ADVISE(vp, offset, end, advice);
4639		break;
4640	}
4641out:
4642	if (fp != NULL)
4643		fdrop(fp, td);
4644	free(new, M_FADVISE);
4645	return (error);
4646}
4647
4648int
4649sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4650{
4651	int error;
4652
4653	error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
4654	    uap->advice);
4655	return (kern_posix_error(td, error));
4656}
4657