vfs_syscalls.c revision 331643
1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
37 */
38
39#include <sys/cdefs.h>
40__FBSDID("$FreeBSD: stable/11/sys/kern/vfs_syscalls.c 331643 2018-03-27 18:52:27Z dim $");
41
42#include "opt_capsicum.h"
43#include "opt_compat.h"
44#include "opt_ktrace.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48#include <sys/bio.h>
49#include <sys/buf.h>
50#include <sys/capsicum.h>
51#include <sys/disk.h>
52#include <sys/sysent.h>
53#include <sys/malloc.h>
54#include <sys/mount.h>
55#include <sys/mutex.h>
56#include <sys/sysproto.h>
57#include <sys/namei.h>
58#include <sys/filedesc.h>
59#include <sys/kernel.h>
60#include <sys/fcntl.h>
61#include <sys/file.h>
62#include <sys/filio.h>
63#include <sys/limits.h>
64#include <sys/linker.h>
65#include <sys/rwlock.h>
66#include <sys/sdt.h>
67#include <sys/stat.h>
68#include <sys/sx.h>
69#include <sys/unistd.h>
70#include <sys/vnode.h>
71#include <sys/priv.h>
72#include <sys/proc.h>
73#include <sys/dirent.h>
74#include <sys/jail.h>
75#include <sys/syscallsubr.h>
76#include <sys/sysctl.h>
77#ifdef KTRACE
78#include <sys/ktrace.h>
79#endif
80
81#include <machine/stdarg.h>
82
83#include <security/audit/audit.h>
84#include <security/mac/mac_framework.h>
85
86#include <vm/vm.h>
87#include <vm/vm_object.h>
88#include <vm/vm_page.h>
89#include <vm/uma.h>
90
91#include <ufs/ufs/quota.h>
92
93MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
94
95SDT_PROVIDER_DEFINE(vfs);
96SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
97SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
98
99static int kern_chflagsat(struct thread *td, int fd, const char *path,
100    enum uio_seg pathseg, u_long flags, int atflag);
101static int setfflags(struct thread *td, struct vnode *, u_long);
102static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
103static int getutimens(const struct timespec *, enum uio_seg,
104    struct timespec *, int *);
105static int setutimes(struct thread *td, struct vnode *,
106    const struct timespec *, int, int);
107static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
108    struct thread *td);
109
110/*
111 * Sync each mounted filesystem.
112 */
113#ifndef _SYS_SYSPROTO_H_
114struct sync_args {
115	int     dummy;
116};
117#endif
118/* ARGSUSED */
119int
120sys_sync(struct thread *td, struct sync_args *uap)
121{
122	struct mount *mp, *nmp;
123	int save;
124
125	mtx_lock(&mountlist_mtx);
126	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
127		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
128			nmp = TAILQ_NEXT(mp, mnt_list);
129			continue;
130		}
131		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
132		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
133			save = curthread_pflags_set(TDP_SYNCIO);
134			vfs_msync(mp, MNT_NOWAIT);
135			VFS_SYNC(mp, MNT_NOWAIT);
136			curthread_pflags_restore(save);
137			vn_finished_write(mp);
138		}
139		mtx_lock(&mountlist_mtx);
140		nmp = TAILQ_NEXT(mp, mnt_list);
141		vfs_unbusy(mp);
142	}
143	mtx_unlock(&mountlist_mtx);
144	return (0);
145}
146
147/*
148 * Change filesystem quotas.
149 */
150#ifndef _SYS_SYSPROTO_H_
151struct quotactl_args {
152	char *path;
153	int cmd;
154	int uid;
155	caddr_t arg;
156};
157#endif
158int
159sys_quotactl(struct thread *td, struct quotactl_args *uap)
160{
161	struct mount *mp;
162	struct nameidata nd;
163	int error;
164
165	AUDIT_ARG_CMD(uap->cmd);
166	AUDIT_ARG_UID(uap->uid);
167	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
168		return (EPERM);
169	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
170	    uap->path, td);
171	if ((error = namei(&nd)) != 0)
172		return (error);
173	NDFREE(&nd, NDF_ONLY_PNBUF);
174	mp = nd.ni_vp->v_mount;
175	vfs_ref(mp);
176	vput(nd.ni_vp);
177	error = vfs_busy(mp, 0);
178	vfs_rel(mp);
179	if (error != 0)
180		return (error);
181	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
182
183	/*
184	 * Since quota on operation typically needs to open quota
185	 * file, the Q_QUOTAON handler needs to unbusy the mount point
186	 * before calling into namei.  Otherwise, unmount might be
187	 * started between two vfs_busy() invocations (first is our,
188	 * second is from mount point cross-walk code in lookup()),
189	 * causing deadlock.
190	 *
191	 * Require that Q_QUOTAON handles the vfs_busy() reference on
192	 * its own, always returning with ubusied mount point.
193	 */
194	if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
195		vfs_unbusy(mp);
196	return (error);
197}
198
199/*
200 * Used by statfs conversion routines to scale the block size up if
201 * necessary so that all of the block counts are <= 'max_size'.  Note
202 * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
203 * value of 'n'.
204 */
205void
206statfs_scale_blocks(struct statfs *sf, long max_size)
207{
208	uint64_t count;
209	int shift;
210
211	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
212
213	/*
214	 * Attempt to scale the block counts to give a more accurate
215	 * overview to userland of the ratio of free space to used
216	 * space.  To do this, find the largest block count and compute
217	 * a divisor that lets it fit into a signed integer <= max_size.
218	 */
219	if (sf->f_bavail < 0)
220		count = -sf->f_bavail;
221	else
222		count = sf->f_bavail;
223	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
224	if (count <= max_size)
225		return;
226
227	count >>= flsl(max_size);
228	shift = 0;
229	while (count > 0) {
230		shift++;
231		count >>=1;
232	}
233
234	sf->f_bsize <<= shift;
235	sf->f_blocks >>= shift;
236	sf->f_bfree >>= shift;
237	sf->f_bavail >>= shift;
238}
239
240static int
241kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
242{
243	struct statfs *sp;
244	int error;
245
246	if (mp == NULL)
247		return (EBADF);
248	error = vfs_busy(mp, 0);
249	vfs_rel(mp);
250	if (error != 0)
251		return (error);
252#ifdef MAC
253	error = mac_mount_check_stat(td->td_ucred, mp);
254	if (error != 0)
255		goto out;
256#endif
257	/*
258	 * Set these in case the underlying filesystem fails to do so.
259	 */
260	sp = &mp->mnt_stat;
261	sp->f_version = STATFS_VERSION;
262	sp->f_namemax = NAME_MAX;
263	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
264	error = VFS_STATFS(mp, sp);
265	if (error != 0)
266		goto out;
267	*buf = *sp;
268	if (priv_check(td, PRIV_VFS_GENERATION)) {
269		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
270		prison_enforce_statfs(td->td_ucred, mp, buf);
271	}
272out:
273	vfs_unbusy(mp);
274	return (error);
275}
276
277/*
278 * Get filesystem statistics.
279 */
280#ifndef _SYS_SYSPROTO_H_
281struct statfs_args {
282	char *path;
283	struct statfs *buf;
284};
285#endif
286int
287sys_statfs(struct thread *td, struct statfs_args *uap)
288{
289	struct statfs *sfp;
290	int error;
291
292	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
293	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
294	if (error == 0)
295		error = copyout(sfp, uap->buf, sizeof(struct statfs));
296	free(sfp, M_STATFS);
297	return (error);
298}
299
300int
301kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
302    struct statfs *buf)
303{
304	struct mount *mp;
305	struct nameidata nd;
306	int error;
307
308	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
309	    pathseg, path, td);
310	error = namei(&nd);
311	if (error != 0)
312		return (error);
313	mp = nd.ni_vp->v_mount;
314	vfs_ref(mp);
315	NDFREE(&nd, NDF_ONLY_PNBUF);
316	vput(nd.ni_vp);
317	return (kern_do_statfs(td, mp, buf));
318}
319
320/*
321 * Get filesystem statistics.
322 */
323#ifndef _SYS_SYSPROTO_H_
324struct fstatfs_args {
325	int fd;
326	struct statfs *buf;
327};
328#endif
329int
330sys_fstatfs(struct thread *td, struct fstatfs_args *uap)
331{
332	struct statfs *sfp;
333	int error;
334
335	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
336	error = kern_fstatfs(td, uap->fd, sfp);
337	if (error == 0)
338		error = copyout(sfp, uap->buf, sizeof(struct statfs));
339	free(sfp, M_STATFS);
340	return (error);
341}
342
343int
344kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
345{
346	struct file *fp;
347	struct mount *mp;
348	struct vnode *vp;
349	cap_rights_t rights;
350	int error;
351
352	AUDIT_ARG_FD(fd);
353	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSTATFS), &fp);
354	if (error != 0)
355		return (error);
356	vp = fp->f_vnode;
357	vn_lock(vp, LK_SHARED | LK_RETRY);
358#ifdef AUDIT
359	AUDIT_ARG_VNODE1(vp);
360#endif
361	mp = vp->v_mount;
362	if (mp != NULL)
363		vfs_ref(mp);
364	VOP_UNLOCK(vp, 0);
365	fdrop(fp, td);
366	return (kern_do_statfs(td, mp, buf));
367}
368
369/*
370 * Get statistics on all filesystems.
371 */
372#ifndef _SYS_SYSPROTO_H_
373struct getfsstat_args {
374	struct statfs *buf;
375	long bufsize;
376	int mode;
377};
378#endif
379int
380sys_getfsstat(struct thread *td, struct getfsstat_args *uap)
381{
382	size_t count;
383	int error;
384
385	if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
386		return (EINVAL);
387	error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
388	    UIO_USERSPACE, uap->mode);
389	if (error == 0)
390		td->td_retval[0] = count;
391	return (error);
392}
393
394/*
395 * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
396 *	The caller is responsible for freeing memory which will be allocated
397 *	in '*buf'.
398 */
399int
400kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
401    size_t *countp, enum uio_seg bufseg, int mode)
402{
403	struct mount *mp, *nmp;
404	struct statfs *sfsp, *sp, *sptmp, *tofree;
405	size_t count, maxcount;
406	int error;
407
408	switch (mode) {
409	case MNT_WAIT:
410	case MNT_NOWAIT:
411		break;
412	default:
413		return (EINVAL);
414	}
415restart:
416	maxcount = bufsize / sizeof(struct statfs);
417	if (bufsize == 0) {
418		sfsp = NULL;
419		tofree = NULL;
420	} else if (bufseg == UIO_USERSPACE) {
421		sfsp = *buf;
422		tofree = NULL;
423	} else /* if (bufseg == UIO_SYSSPACE) */ {
424		count = 0;
425		mtx_lock(&mountlist_mtx);
426		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
427			count++;
428		}
429		mtx_unlock(&mountlist_mtx);
430		if (maxcount > count)
431			maxcount = count;
432		tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
433		    M_STATFS, M_WAITOK);
434	}
435	count = 0;
436	mtx_lock(&mountlist_mtx);
437	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
438		if (prison_canseemount(td->td_ucred, mp) != 0) {
439			nmp = TAILQ_NEXT(mp, mnt_list);
440			continue;
441		}
442#ifdef MAC
443		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
444			nmp = TAILQ_NEXT(mp, mnt_list);
445			continue;
446		}
447#endif
448		if (mode == MNT_WAIT) {
449			if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
450				/*
451				 * If vfs_busy() failed, and MBF_NOWAIT
452				 * wasn't passed, then the mp is gone.
453				 * Furthermore, because of MBF_MNTLSTLOCK,
454				 * the mountlist_mtx was dropped.  We have
455				 * no other choice than to start over.
456				 */
457				mtx_unlock(&mountlist_mtx);
458				free(tofree, M_STATFS);
459				goto restart;
460			}
461		} else {
462			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
463				nmp = TAILQ_NEXT(mp, mnt_list);
464				continue;
465			}
466		}
467		if (sfsp != NULL && count < maxcount) {
468			sp = &mp->mnt_stat;
469			/*
470			 * Set these in case the underlying filesystem
471			 * fails to do so.
472			 */
473			sp->f_version = STATFS_VERSION;
474			sp->f_namemax = NAME_MAX;
475			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
476			/*
477			 * If MNT_NOWAIT is specified, do not refresh
478			 * the fsstat cache.
479			 */
480			if (mode != MNT_NOWAIT) {
481				error = VFS_STATFS(mp, sp);
482				if (error != 0) {
483					mtx_lock(&mountlist_mtx);
484					nmp = TAILQ_NEXT(mp, mnt_list);
485					vfs_unbusy(mp);
486					continue;
487				}
488			}
489			if (priv_check(td, PRIV_VFS_GENERATION)) {
490				sptmp = malloc(sizeof(struct statfs), M_STATFS,
491				    M_WAITOK);
492				*sptmp = *sp;
493				sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0;
494				prison_enforce_statfs(td->td_ucred, mp, sptmp);
495				sp = sptmp;
496			} else
497				sptmp = NULL;
498			if (bufseg == UIO_SYSSPACE) {
499				bcopy(sp, sfsp, sizeof(*sp));
500				free(sptmp, M_STATFS);
501			} else /* if (bufseg == UIO_USERSPACE) */ {
502				error = copyout(sp, sfsp, sizeof(*sp));
503				free(sptmp, M_STATFS);
504				if (error != 0) {
505					vfs_unbusy(mp);
506					return (error);
507				}
508			}
509			sfsp++;
510		}
511		count++;
512		mtx_lock(&mountlist_mtx);
513		nmp = TAILQ_NEXT(mp, mnt_list);
514		vfs_unbusy(mp);
515	}
516	mtx_unlock(&mountlist_mtx);
517	if (sfsp != NULL && count > maxcount)
518		*countp = maxcount;
519	else
520		*countp = count;
521	return (0);
522}
523
524#ifdef COMPAT_FREEBSD4
525/*
526 * Get old format filesystem statistics.
527 */
528static void cvtstatfs(struct statfs *, struct ostatfs *);
529
530#ifndef _SYS_SYSPROTO_H_
531struct freebsd4_statfs_args {
532	char *path;
533	struct ostatfs *buf;
534};
535#endif
536int
537freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap)
538{
539	struct ostatfs osb;
540	struct statfs *sfp;
541	int error;
542
543	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
544	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
545	if (error == 0) {
546		cvtstatfs(sfp, &osb);
547		error = copyout(&osb, uap->buf, sizeof(osb));
548	}
549	free(sfp, M_STATFS);
550	return (error);
551}
552
553/*
554 * Get filesystem statistics.
555 */
556#ifndef _SYS_SYSPROTO_H_
557struct freebsd4_fstatfs_args {
558	int fd;
559	struct ostatfs *buf;
560};
561#endif
562int
563freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap)
564{
565	struct ostatfs osb;
566	struct statfs *sfp;
567	int error;
568
569	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
570	error = kern_fstatfs(td, uap->fd, sfp);
571	if (error == 0) {
572		cvtstatfs(sfp, &osb);
573		error = copyout(&osb, uap->buf, sizeof(osb));
574	}
575	free(sfp, M_STATFS);
576	return (error);
577}
578
579/*
580 * Get statistics on all filesystems.
581 */
582#ifndef _SYS_SYSPROTO_H_
583struct freebsd4_getfsstat_args {
584	struct ostatfs *buf;
585	long bufsize;
586	int mode;
587};
588#endif
589int
590freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap)
591{
592	struct statfs *buf, *sp;
593	struct ostatfs osb;
594	size_t count, size;
595	int error;
596
597	if (uap->bufsize < 0)
598		return (EINVAL);
599	count = uap->bufsize / sizeof(struct ostatfs);
600	if (count > SIZE_MAX / sizeof(struct statfs))
601		return (EINVAL);
602	size = count * sizeof(struct statfs);
603	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
604	    uap->mode);
605	td->td_retval[0] = count;
606	if (size != 0) {
607		sp = buf;
608		while (count != 0 && error == 0) {
609			cvtstatfs(sp, &osb);
610			error = copyout(&osb, uap->buf, sizeof(osb));
611			sp++;
612			uap->buf++;
613			count--;
614		}
615		free(buf, M_STATFS);
616	}
617	return (error);
618}
619
620/*
621 * Implement fstatfs() for (NFS) file handles.
622 */
623#ifndef _SYS_SYSPROTO_H_
624struct freebsd4_fhstatfs_args {
625	struct fhandle *u_fhp;
626	struct ostatfs *buf;
627};
628#endif
629int
630freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap)
631{
632	struct ostatfs osb;
633	struct statfs *sfp;
634	fhandle_t fh;
635	int error;
636
637	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
638	if (error != 0)
639		return (error);
640	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
641	error = kern_fhstatfs(td, fh, sfp);
642	if (error == 0) {
643		cvtstatfs(sfp, &osb);
644		error = copyout(&osb, uap->buf, sizeof(osb));
645	}
646	free(sfp, M_STATFS);
647	return (error);
648}
649
650/*
651 * Convert a new format statfs structure to an old format statfs structure.
652 */
653static void
654cvtstatfs(struct statfs *nsp, struct ostatfs *osp)
655{
656
657	statfs_scale_blocks(nsp, LONG_MAX);
658	bzero(osp, sizeof(*osp));
659	osp->f_bsize = nsp->f_bsize;
660	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
661	osp->f_blocks = nsp->f_blocks;
662	osp->f_bfree = nsp->f_bfree;
663	osp->f_bavail = nsp->f_bavail;
664	osp->f_files = MIN(nsp->f_files, LONG_MAX);
665	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
666	osp->f_owner = nsp->f_owner;
667	osp->f_type = nsp->f_type;
668	osp->f_flags = nsp->f_flags;
669	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
670	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
671	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
672	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
673	strlcpy(osp->f_fstypename, nsp->f_fstypename,
674	    MIN(MFSNAMELEN, OMFSNAMELEN));
675	strlcpy(osp->f_mntonname, nsp->f_mntonname,
676	    MIN(MNAMELEN, OMNAMELEN));
677	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
678	    MIN(MNAMELEN, OMNAMELEN));
679	osp->f_fsid = nsp->f_fsid;
680}
681#endif /* COMPAT_FREEBSD4 */
682
683/*
684 * Change current working directory to a given file descriptor.
685 */
686#ifndef _SYS_SYSPROTO_H_
687struct fchdir_args {
688	int	fd;
689};
690#endif
691int
692sys_fchdir(struct thread *td, struct fchdir_args *uap)
693{
694	struct vnode *vp, *tdp;
695	struct mount *mp;
696	struct file *fp;
697	cap_rights_t rights;
698	int error;
699
700	AUDIT_ARG_FD(uap->fd);
701	error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
702	    &fp);
703	if (error != 0)
704		return (error);
705	vp = fp->f_vnode;
706	vrefact(vp);
707	fdrop(fp, td);
708	vn_lock(vp, LK_SHARED | LK_RETRY);
709	AUDIT_ARG_VNODE1(vp);
710	error = change_dir(vp, td);
711	while (!error && (mp = vp->v_mountedhere) != NULL) {
712		if (vfs_busy(mp, 0))
713			continue;
714		error = VFS_ROOT(mp, LK_SHARED, &tdp);
715		vfs_unbusy(mp);
716		if (error != 0)
717			break;
718		vput(vp);
719		vp = tdp;
720	}
721	if (error != 0) {
722		vput(vp);
723		return (error);
724	}
725	VOP_UNLOCK(vp, 0);
726	pwd_chdir(td, vp);
727	return (0);
728}
729
730/*
731 * Change current working directory (``.'').
732 */
733#ifndef _SYS_SYSPROTO_H_
734struct chdir_args {
735	char	*path;
736};
737#endif
738int
739sys_chdir(struct thread *td, struct chdir_args *uap)
740{
741
742	return (kern_chdir(td, uap->path, UIO_USERSPACE));
743}
744
745int
746kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
747{
748	struct nameidata nd;
749	int error;
750
751	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
752	    pathseg, path, td);
753	if ((error = namei(&nd)) != 0)
754		return (error);
755	if ((error = change_dir(nd.ni_vp, td)) != 0) {
756		vput(nd.ni_vp);
757		NDFREE(&nd, NDF_ONLY_PNBUF);
758		return (error);
759	}
760	VOP_UNLOCK(nd.ni_vp, 0);
761	NDFREE(&nd, NDF_ONLY_PNBUF);
762	pwd_chdir(td, nd.ni_vp);
763	return (0);
764}
765
766/*
767 * Change notion of root (``/'') directory.
768 */
769#ifndef _SYS_SYSPROTO_H_
770struct chroot_args {
771	char	*path;
772};
773#endif
774int
775sys_chroot(struct thread *td, struct chroot_args *uap)
776{
777	struct nameidata nd;
778	int error;
779
780	error = priv_check(td, PRIV_VFS_CHROOT);
781	if (error != 0)
782		return (error);
783	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
784	    UIO_USERSPACE, uap->path, td);
785	error = namei(&nd);
786	if (error != 0)
787		goto error;
788	error = change_dir(nd.ni_vp, td);
789	if (error != 0)
790		goto e_vunlock;
791#ifdef MAC
792	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
793	if (error != 0)
794		goto e_vunlock;
795#endif
796	VOP_UNLOCK(nd.ni_vp, 0);
797	error = pwd_chroot(td, nd.ni_vp);
798	vrele(nd.ni_vp);
799	NDFREE(&nd, NDF_ONLY_PNBUF);
800	return (error);
801e_vunlock:
802	vput(nd.ni_vp);
803error:
804	NDFREE(&nd, NDF_ONLY_PNBUF);
805	return (error);
806}
807
808/*
809 * Common routine for chroot and chdir.  Callers must provide a locked vnode
810 * instance.
811 */
812int
813change_dir(struct vnode *vp, struct thread *td)
814{
815#ifdef MAC
816	int error;
817#endif
818
819	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
820	if (vp->v_type != VDIR)
821		return (ENOTDIR);
822#ifdef MAC
823	error = mac_vnode_check_chdir(td->td_ucred, vp);
824	if (error != 0)
825		return (error);
826#endif
827	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
828}
829
830static __inline void
831flags_to_rights(int flags, cap_rights_t *rightsp)
832{
833
834	if (flags & O_EXEC) {
835		cap_rights_set(rightsp, CAP_FEXECVE);
836	} else {
837		switch ((flags & O_ACCMODE)) {
838		case O_RDONLY:
839			cap_rights_set(rightsp, CAP_READ);
840			break;
841		case O_RDWR:
842			cap_rights_set(rightsp, CAP_READ);
843			/* FALLTHROUGH */
844		case O_WRONLY:
845			cap_rights_set(rightsp, CAP_WRITE);
846			if (!(flags & (O_APPEND | O_TRUNC)))
847				cap_rights_set(rightsp, CAP_SEEK);
848			break;
849		}
850	}
851
852	if (flags & O_CREAT)
853		cap_rights_set(rightsp, CAP_CREATE);
854
855	if (flags & O_TRUNC)
856		cap_rights_set(rightsp, CAP_FTRUNCATE);
857
858	if (flags & (O_SYNC | O_FSYNC))
859		cap_rights_set(rightsp, CAP_FSYNC);
860
861	if (flags & (O_EXLOCK | O_SHLOCK))
862		cap_rights_set(rightsp, CAP_FLOCK);
863}
864
865/*
866 * Check permissions, allocate an open file structure, and call the device
867 * open routine if any.
868 */
869#ifndef _SYS_SYSPROTO_H_
870struct open_args {
871	char	*path;
872	int	flags;
873	int	mode;
874};
875#endif
876int
877sys_open(struct thread *td, struct open_args *uap)
878{
879
880	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
881	    uap->flags, uap->mode));
882}
883
884#ifndef _SYS_SYSPROTO_H_
885struct openat_args {
886	int	fd;
887	char	*path;
888	int	flag;
889	int	mode;
890};
891#endif
892int
893sys_openat(struct thread *td, struct openat_args *uap)
894{
895
896	AUDIT_ARG_FD(uap->fd);
897	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
898	    uap->mode));
899}
900
901int
902kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
903    int flags, int mode)
904{
905	struct proc *p = td->td_proc;
906	struct filedesc *fdp = p->p_fd;
907	struct file *fp;
908	struct vnode *vp;
909	struct nameidata nd;
910	cap_rights_t rights;
911	int cmode, error, indx;
912
913	indx = -1;
914
915	AUDIT_ARG_FFLAGS(flags);
916	AUDIT_ARG_MODE(mode);
917	cap_rights_init(&rights, CAP_LOOKUP);
918	flags_to_rights(flags, &rights);
919	/*
920	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
921	 * may be specified.
922	 */
923	if (flags & O_EXEC) {
924		if (flags & O_ACCMODE)
925			return (EINVAL);
926	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
927		return (EINVAL);
928	} else {
929		flags = FFLAGS(flags);
930	}
931
932	/*
933	 * Allocate a file structure. The descriptor to reference it
934	 * is allocated and set by finstall() below.
935	 */
936	error = falloc_noinstall(td, &fp);
937	if (error != 0)
938		return (error);
939	/*
940	 * An extra reference on `fp' has been held for us by
941	 * falloc_noinstall().
942	 */
943	/* Set the flags early so the finit in devfs can pick them up. */
944	fp->f_flag = flags & FMASK;
945	cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
946	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
947	    &rights, td);
948	td->td_dupfd = -1;		/* XXX check for fdopen */
949	error = vn_open(&nd, &flags, cmode, fp);
950	if (error != 0) {
951		/*
952		 * If the vn_open replaced the method vector, something
953		 * wonderous happened deep below and we just pass it up
954		 * pretending we know what we do.
955		 */
956		if (error == ENXIO && fp->f_ops != &badfileops)
957			goto success;
958
959		/*
960		 * Handle special fdopen() case. bleh.
961		 *
962		 * Don't do this for relative (capability) lookups; we don't
963		 * understand exactly what would happen, and we don't think
964		 * that it ever should.
965		 */
966		if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
967		    (error == ENODEV || error == ENXIO) &&
968		    td->td_dupfd >= 0) {
969			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
970			    &indx);
971			if (error == 0)
972				goto success;
973		}
974
975		goto bad;
976	}
977	td->td_dupfd = 0;
978	NDFREE(&nd, NDF_ONLY_PNBUF);
979	vp = nd.ni_vp;
980
981	/*
982	 * Store the vnode, for any f_type. Typically, the vnode use
983	 * count is decremented by direct call to vn_closefile() for
984	 * files that switched type in the cdevsw fdopen() method.
985	 */
986	fp->f_vnode = vp;
987	/*
988	 * If the file wasn't claimed by devfs bind it to the normal
989	 * vnode operations here.
990	 */
991	if (fp->f_ops == &badfileops) {
992		KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
993		fp->f_seqcount = 1;
994		finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
995		    DTYPE_VNODE, vp, &vnops);
996	}
997
998	VOP_UNLOCK(vp, 0);
999	if (flags & O_TRUNC) {
1000		error = fo_truncate(fp, 0, td->td_ucred, td);
1001		if (error != 0)
1002			goto bad;
1003	}
1004success:
1005	/*
1006	 * If we haven't already installed the FD (for dupfdopen), do so now.
1007	 */
1008	if (indx == -1) {
1009		struct filecaps *fcaps;
1010
1011#ifdef CAPABILITIES
1012		if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) != 0)
1013			fcaps = &nd.ni_filecaps;
1014		else
1015#endif
1016			fcaps = NULL;
1017		error = finstall(td, fp, &indx, flags, fcaps);
1018		/* On success finstall() consumes fcaps. */
1019		if (error != 0) {
1020			filecaps_free(&nd.ni_filecaps);
1021			goto bad;
1022		}
1023	} else {
1024		filecaps_free(&nd.ni_filecaps);
1025	}
1026
1027	/*
1028	 * Release our private reference, leaving the one associated with
1029	 * the descriptor table intact.
1030	 */
1031	fdrop(fp, td);
1032	td->td_retval[0] = indx;
1033	return (0);
1034bad:
1035	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
1036	fdrop(fp, td);
1037	return (error);
1038}
1039
#ifdef COMPAT_43
/*
 * Old creat(2): create (or truncate) a file and open it write-only.
 */
#ifndef _SYS_SYSPROTO_H_
struct ocreat_args {
	char	*path;
	int	mode;
};
#endif
int
ocreat(struct thread *td, struct ocreat_args *uap)
{
	int error;

	error = kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode);
	return (error);
}
#endif /* COMPAT_43 */
1058
1059/*
1060 * Create a special file.
1061 */
1062#ifndef _SYS_SYSPROTO_H_
1063struct mknod_args {
1064	char	*path;
1065	int	mode;
1066	int	dev;
1067};
1068#endif
1069int
1070sys_mknod(struct thread *td, struct mknod_args *uap)
1071{
1072
1073	return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1074	    uap->mode, uap->dev));
1075}
1076
1077#ifndef _SYS_SYSPROTO_H_
1078struct mknodat_args {
1079	int	fd;
1080	char	*path;
1081	mode_t	mode;
1082	dev_t	dev;
1083};
1084#endif
1085int
1086sys_mknodat(struct thread *td, struct mknodat_args *uap)
1087{
1088
1089	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
1090	    uap->dev));
1091}
1092
1093int
1094kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1095    int mode, int dev)
1096{
1097	struct vnode *vp;
1098	struct mount *mp;
1099	struct vattr vattr;
1100	struct nameidata nd;
1101	cap_rights_t rights;
1102	int error, whiteout = 0;
1103
1104	AUDIT_ARG_MODE(mode);
1105	AUDIT_ARG_DEV(dev);
1106	switch (mode & S_IFMT) {
1107	case S_IFCHR:
1108	case S_IFBLK:
1109		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
1110		if (error == 0 && dev == VNOVAL)
1111			error = EINVAL;
1112		break;
1113	case S_IFWHT:
1114		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
1115		break;
1116	case S_IFIFO:
1117		if (dev == 0)
1118			return (kern_mkfifoat(td, fd, path, pathseg, mode));
1119		/* FALLTHROUGH */
1120	default:
1121		error = EINVAL;
1122		break;
1123	}
1124	if (error != 0)
1125		return (error);
1126restart:
1127	bwillwrite();
1128	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1129	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
1130	    td);
1131	if ((error = namei(&nd)) != 0)
1132		return (error);
1133	vp = nd.ni_vp;
1134	if (vp != NULL) {
1135		NDFREE(&nd, NDF_ONLY_PNBUF);
1136		if (vp == nd.ni_dvp)
1137			vrele(nd.ni_dvp);
1138		else
1139			vput(nd.ni_dvp);
1140		vrele(vp);
1141		return (EEXIST);
1142	} else {
1143		VATTR_NULL(&vattr);
1144		vattr.va_mode = (mode & ALLPERMS) &
1145		    ~td->td_proc->p_fd->fd_cmask;
1146		vattr.va_rdev = dev;
1147		whiteout = 0;
1148
1149		switch (mode & S_IFMT) {
1150		case S_IFCHR:
1151			vattr.va_type = VCHR;
1152			break;
1153		case S_IFBLK:
1154			vattr.va_type = VBLK;
1155			break;
1156		case S_IFWHT:
1157			whiteout = 1;
1158			break;
1159		default:
1160			panic("kern_mknod: invalid mode");
1161		}
1162	}
1163	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1164		NDFREE(&nd, NDF_ONLY_PNBUF);
1165		vput(nd.ni_dvp);
1166		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1167			return (error);
1168		goto restart;
1169	}
1170#ifdef MAC
1171	if (error == 0 && !whiteout)
1172		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
1173		    &nd.ni_cnd, &vattr);
1174#endif
1175	if (error == 0) {
1176		if (whiteout)
1177			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
1178		else {
1179			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
1180						&nd.ni_cnd, &vattr);
1181			if (error == 0)
1182				vput(nd.ni_vp);
1183		}
1184	}
1185	NDFREE(&nd, NDF_ONLY_PNBUF);
1186	vput(nd.ni_dvp);
1187	vn_finished_write(mp);
1188	return (error);
1189}
1190
1191/*
1192 * Create a named pipe.
1193 */
1194#ifndef _SYS_SYSPROTO_H_
1195struct mkfifo_args {
1196	char	*path;
1197	int	mode;
1198};
1199#endif
1200int
1201sys_mkfifo(struct thread *td, struct mkfifo_args *uap)
1202{
1203
1204	return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1205	    uap->mode));
1206}
1207
1208#ifndef _SYS_SYSPROTO_H_
1209struct mkfifoat_args {
1210	int	fd;
1211	char	*path;
1212	mode_t	mode;
1213};
1214#endif
1215int
1216sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
1217{
1218
1219	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
1220	    uap->mode));
1221}
1222
1223int
1224kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1225    int mode)
1226{
1227	struct mount *mp;
1228	struct vattr vattr;
1229	struct nameidata nd;
1230	cap_rights_t rights;
1231	int error;
1232
1233	AUDIT_ARG_MODE(mode);
1234restart:
1235	bwillwrite();
1236	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1237	    NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
1238	    td);
1239	if ((error = namei(&nd)) != 0)
1240		return (error);
1241	if (nd.ni_vp != NULL) {
1242		NDFREE(&nd, NDF_ONLY_PNBUF);
1243		if (nd.ni_vp == nd.ni_dvp)
1244			vrele(nd.ni_dvp);
1245		else
1246			vput(nd.ni_dvp);
1247		vrele(nd.ni_vp);
1248		return (EEXIST);
1249	}
1250	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1251		NDFREE(&nd, NDF_ONLY_PNBUF);
1252		vput(nd.ni_dvp);
1253		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1254			return (error);
1255		goto restart;
1256	}
1257	VATTR_NULL(&vattr);
1258	vattr.va_type = VFIFO;
1259	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
1260#ifdef MAC
1261	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1262	    &vattr);
1263	if (error != 0)
1264		goto out;
1265#endif
1266	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
1267	if (error == 0)
1268		vput(nd.ni_vp);
1269#ifdef MAC
1270out:
1271#endif
1272	vput(nd.ni_dvp);
1273	vn_finished_write(mp);
1274	NDFREE(&nd, NDF_ONLY_PNBUF);
1275	return (error);
1276}
1277
1278/*
1279 * Make a hard file link.
1280 */
1281#ifndef _SYS_SYSPROTO_H_
1282struct link_args {
1283	char	*path;
1284	char	*link;
1285};
1286#endif
1287int
1288sys_link(struct thread *td, struct link_args *uap)
1289{
1290
1291	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
1292	    UIO_USERSPACE, FOLLOW));
1293}
1294
1295#ifndef _SYS_SYSPROTO_H_
1296struct linkat_args {
1297	int	fd1;
1298	char	*path1;
1299	int	fd2;
1300	char	*path2;
1301	int	flag;
1302};
1303#endif
1304int
1305sys_linkat(struct thread *td, struct linkat_args *uap)
1306{
1307	int flag;
1308
1309	flag = uap->flag;
1310	if (flag & ~AT_SYMLINK_FOLLOW)
1311		return (EINVAL);
1312
1313	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
1314	    UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
1315}
1316
1317int hardlink_check_uid = 0;
1318SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
1319    &hardlink_check_uid, 0,
1320    "Unprivileged processes cannot create hard links to files owned by other "
1321    "users");
1322static int hardlink_check_gid = 0;
1323SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
1324    &hardlink_check_gid, 0,
1325    "Unprivileged processes cannot create hard links to files owned by other "
1326    "groups");
1327
1328static int
1329can_hardlink(struct vnode *vp, struct ucred *cred)
1330{
1331	struct vattr va;
1332	int error;
1333
1334	if (!hardlink_check_uid && !hardlink_check_gid)
1335		return (0);
1336
1337	error = VOP_GETATTR(vp, &va, cred);
1338	if (error != 0)
1339		return (error);
1340
1341	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
1342		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1343		if (error != 0)
1344			return (error);
1345	}
1346
1347	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
1348		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
1349		if (error != 0)
1350			return (error);
1351	}
1352
1353	return (0);
1354}
1355
1356int
1357kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
1358    enum uio_seg segflg, int follow)
1359{
1360	struct vnode *vp;
1361	struct mount *mp;
1362	struct nameidata nd;
1363	cap_rights_t rights;
1364	int error;
1365
1366again:
1367	bwillwrite();
1368	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1,
1369	    cap_rights_init(&rights, CAP_LINKAT_SOURCE), td);
1370
1371	if ((error = namei(&nd)) != 0)
1372		return (error);
1373	NDFREE(&nd, NDF_ONLY_PNBUF);
1374	vp = nd.ni_vp;
1375	if (vp->v_type == VDIR) {
1376		vrele(vp);
1377		return (EPERM);		/* POSIX */
1378	}
1379	NDINIT_ATRIGHTS(&nd, CREATE,
1380	    LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflg, path2, fd2,
1381	    cap_rights_init(&rights, CAP_LINKAT_TARGET), td);
1382	if ((error = namei(&nd)) == 0) {
1383		if (nd.ni_vp != NULL) {
1384			NDFREE(&nd, NDF_ONLY_PNBUF);
1385			if (nd.ni_dvp == nd.ni_vp)
1386				vrele(nd.ni_dvp);
1387			else
1388				vput(nd.ni_dvp);
1389			vrele(nd.ni_vp);
1390			vrele(vp);
1391			return (EEXIST);
1392		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
1393			/*
1394			 * Cross-device link.  No need to recheck
1395			 * vp->v_type, since it cannot change, except
1396			 * to VBAD.
1397			 */
1398			NDFREE(&nd, NDF_ONLY_PNBUF);
1399			vput(nd.ni_dvp);
1400			vrele(vp);
1401			return (EXDEV);
1402		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
1403			error = can_hardlink(vp, td->td_ucred);
1404#ifdef MAC
1405			if (error == 0)
1406				error = mac_vnode_check_link(td->td_ucred,
1407				    nd.ni_dvp, vp, &nd.ni_cnd);
1408#endif
1409			if (error != 0) {
1410				vput(vp);
1411				vput(nd.ni_dvp);
1412				NDFREE(&nd, NDF_ONLY_PNBUF);
1413				return (error);
1414			}
1415			error = vn_start_write(vp, &mp, V_NOWAIT);
1416			if (error != 0) {
1417				vput(vp);
1418				vput(nd.ni_dvp);
1419				NDFREE(&nd, NDF_ONLY_PNBUF);
1420				error = vn_start_write(NULL, &mp,
1421				    V_XSLEEP | PCATCH);
1422				if (error != 0)
1423					return (error);
1424				goto again;
1425			}
1426			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
1427			VOP_UNLOCK(vp, 0);
1428			vput(nd.ni_dvp);
1429			vn_finished_write(mp);
1430			NDFREE(&nd, NDF_ONLY_PNBUF);
1431		} else {
1432			vput(nd.ni_dvp);
1433			NDFREE(&nd, NDF_ONLY_PNBUF);
1434			vrele(vp);
1435			goto again;
1436		}
1437	}
1438	vrele(vp);
1439	return (error);
1440}
1441
1442/*
1443 * Make a symbolic link.
1444 */
1445#ifndef _SYS_SYSPROTO_H_
1446struct symlink_args {
1447	char	*path;
1448	char	*link;
1449};
1450#endif
1451int
1452sys_symlink(struct thread *td, struct symlink_args *uap)
1453{
1454
1455	return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
1456	    UIO_USERSPACE));
1457}
1458
1459#ifndef _SYS_SYSPROTO_H_
1460struct symlinkat_args {
1461	char	*path;
1462	int	fd;
1463	char	*path2;
1464};
1465#endif
1466int
1467sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
1468{
1469
1470	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
1471	    UIO_USERSPACE));
1472}
1473
1474int
1475kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
1476    enum uio_seg segflg)
1477{
1478	struct mount *mp;
1479	struct vattr vattr;
1480	char *syspath;
1481	struct nameidata nd;
1482	int error;
1483	cap_rights_t rights;
1484
1485	if (segflg == UIO_SYSSPACE) {
1486		syspath = path1;
1487	} else {
1488		syspath = uma_zalloc(namei_zone, M_WAITOK);
1489		if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
1490			goto out;
1491	}
1492	AUDIT_ARG_TEXT(syspath);
1493restart:
1494	bwillwrite();
1495	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
1496	    NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
1497	    td);
1498	if ((error = namei(&nd)) != 0)
1499		goto out;
1500	if (nd.ni_vp) {
1501		NDFREE(&nd, NDF_ONLY_PNBUF);
1502		if (nd.ni_vp == nd.ni_dvp)
1503			vrele(nd.ni_dvp);
1504		else
1505			vput(nd.ni_dvp);
1506		vrele(nd.ni_vp);
1507		error = EEXIST;
1508		goto out;
1509	}
1510	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1511		NDFREE(&nd, NDF_ONLY_PNBUF);
1512		vput(nd.ni_dvp);
1513		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1514			goto out;
1515		goto restart;
1516	}
1517	VATTR_NULL(&vattr);
1518	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
1519#ifdef MAC
1520	vattr.va_type = VLNK;
1521	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
1522	    &vattr);
1523	if (error != 0)
1524		goto out2;
1525#endif
1526	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
1527	if (error == 0)
1528		vput(nd.ni_vp);
1529#ifdef MAC
1530out2:
1531#endif
1532	NDFREE(&nd, NDF_ONLY_PNBUF);
1533	vput(nd.ni_dvp);
1534	vn_finished_write(mp);
1535out:
1536	if (segflg != UIO_SYSSPACE)
1537		uma_zfree(namei_zone, syspath);
1538	return (error);
1539}
1540
1541/*
1542 * Delete a whiteout from the filesystem.
1543 */
1544#ifndef _SYS_SYSPROTO_H_
1545struct undelete_args {
1546	char *path;
1547};
1548#endif
1549int
1550sys_undelete(struct thread *td, struct undelete_args *uap)
1551{
1552	struct mount *mp;
1553	struct nameidata nd;
1554	int error;
1555
1556restart:
1557	bwillwrite();
1558	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
1559	    UIO_USERSPACE, uap->path, td);
1560	error = namei(&nd);
1561	if (error != 0)
1562		return (error);
1563
1564	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
1565		NDFREE(&nd, NDF_ONLY_PNBUF);
1566		if (nd.ni_vp == nd.ni_dvp)
1567			vrele(nd.ni_dvp);
1568		else
1569			vput(nd.ni_dvp);
1570		if (nd.ni_vp)
1571			vrele(nd.ni_vp);
1572		return (EEXIST);
1573	}
1574	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1575		NDFREE(&nd, NDF_ONLY_PNBUF);
1576		vput(nd.ni_dvp);
1577		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
1578			return (error);
1579		goto restart;
1580	}
1581	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
1582	NDFREE(&nd, NDF_ONLY_PNBUF);
1583	vput(nd.ni_dvp);
1584	vn_finished_write(mp);
1585	return (error);
1586}
1587
1588/*
1589 * Delete a name from the filesystem.
1590 */
1591#ifndef _SYS_SYSPROTO_H_
1592struct unlink_args {
1593	char	*path;
1594};
1595#endif
1596int
1597sys_unlink(struct thread *td, struct unlink_args *uap)
1598{
1599
1600	return (kern_unlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE, 0));
1601}
1602
1603#ifndef _SYS_SYSPROTO_H_
1604struct unlinkat_args {
1605	int	fd;
1606	char	*path;
1607	int	flag;
1608};
1609#endif
1610int
1611sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
1612{
1613	int flag = uap->flag;
1614	int fd = uap->fd;
1615	char *path = uap->path;
1616
1617	if (flag & ~AT_REMOVEDIR)
1618		return (EINVAL);
1619
1620	if (flag & AT_REMOVEDIR)
1621		return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
1622	else
1623		return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
1624}
1625
1626int
1627kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1628    ino_t oldinum)
1629{
1630	struct mount *mp;
1631	struct vnode *vp;
1632	struct nameidata nd;
1633	struct stat sb;
1634	cap_rights_t rights;
1635	int error;
1636
1637restart:
1638	bwillwrite();
1639	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
1640	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
1641	if ((error = namei(&nd)) != 0)
1642		return (error == EINVAL ? EPERM : error);
1643	vp = nd.ni_vp;
1644	if (vp->v_type == VDIR && oldinum == 0) {
1645		error = EPERM;		/* POSIX */
1646	} else if (oldinum != 0 &&
1647		  ((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
1648		  sb.st_ino != oldinum) {
1649			error = EIDRM;	/* Identifier removed */
1650	} else {
1651		/*
1652		 * The root of a mounted filesystem cannot be deleted.
1653		 *
1654		 * XXX: can this only be a VDIR case?
1655		 */
1656		if (vp->v_vflag & VV_ROOT)
1657			error = EBUSY;
1658	}
1659	if (error == 0) {
1660		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
1661			NDFREE(&nd, NDF_ONLY_PNBUF);
1662			vput(nd.ni_dvp);
1663			if (vp == nd.ni_dvp)
1664				vrele(vp);
1665			else
1666				vput(vp);
1667			if ((error = vn_start_write(NULL, &mp,
1668			    V_XSLEEP | PCATCH)) != 0)
1669				return (error);
1670			goto restart;
1671		}
1672#ifdef MAC
1673		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
1674		    &nd.ni_cnd);
1675		if (error != 0)
1676			goto out;
1677#endif
1678		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
1679		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
1680#ifdef MAC
1681out:
1682#endif
1683		vn_finished_write(mp);
1684	}
1685	NDFREE(&nd, NDF_ONLY_PNBUF);
1686	vput(nd.ni_dvp);
1687	if (vp == nd.ni_dvp)
1688		vrele(vp);
1689	else
1690		vput(vp);
1691	return (error);
1692}
1693
1694/*
1695 * Reposition read/write file offset.
1696 */
1697#ifndef _SYS_SYSPROTO_H_
1698struct lseek_args {
1699	int	fd;
1700	int	pad;
1701	off_t	offset;
1702	int	whence;
1703};
1704#endif
1705int
1706sys_lseek(struct thread *td, struct lseek_args *uap)
1707{
1708
1709	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
1710}
1711
1712int
1713kern_lseek(struct thread *td, int fd, off_t offset, int whence)
1714{
1715	struct file *fp;
1716	cap_rights_t rights;
1717	int error;
1718
1719	AUDIT_ARG_FD(fd);
1720	error = fget(td, fd, cap_rights_init(&rights, CAP_SEEK), &fp);
1721	if (error != 0)
1722		return (error);
1723	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
1724	    fo_seek(fp, offset, whence, td) : ESPIPE;
1725	fdrop(fp, td);
1726	return (error);
1727}
1728
1729#if defined(COMPAT_43)
1730/*
1731 * Reposition read/write file offset.
1732 */
1733#ifndef _SYS_SYSPROTO_H_
1734struct olseek_args {
1735	int	fd;
1736	long	offset;
1737	int	whence;
1738};
1739#endif
1740int
1741olseek(struct thread *td, struct olseek_args *uap)
1742{
1743
1744	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
1745}
1746#endif /* COMPAT_43 */
1747
1748#if defined(COMPAT_FREEBSD6)
1749/* Version with the 'pad' argument */
1750int
1751freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
1752{
1753
1754	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
1755}
1756#endif
1757
1758/*
1759 * Check access permissions using passed credentials.
1760 */
1761static int
1762vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
1763     struct thread *td)
1764{
1765	accmode_t accmode;
1766	int error;
1767
1768	/* Flags == 0 means only check for existence. */
1769	if (user_flags == 0)
1770		return (0);
1771
1772	accmode = 0;
1773	if (user_flags & R_OK)
1774		accmode |= VREAD;
1775	if (user_flags & W_OK)
1776		accmode |= VWRITE;
1777	if (user_flags & X_OK)
1778		accmode |= VEXEC;
1779#ifdef MAC
1780	error = mac_vnode_check_access(cred, vp, accmode);
1781	if (error != 0)
1782		return (error);
1783#endif
1784	if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
1785		error = VOP_ACCESS(vp, accmode, cred, td);
1786	return (error);
1787}
1788
1789/*
1790 * Check access permissions using "real" credentials.
1791 */
1792#ifndef _SYS_SYSPROTO_H_
1793struct access_args {
1794	char	*path;
1795	int	amode;
1796};
1797#endif
1798int
1799sys_access(struct thread *td, struct access_args *uap)
1800{
1801
1802	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1803	    0, uap->amode));
1804}
1805
1806#ifndef _SYS_SYSPROTO_H_
1807struct faccessat_args {
1808	int	dirfd;
1809	char	*path;
1810	int	amode;
1811	int	flag;
1812}
1813#endif
1814int
1815sys_faccessat(struct thread *td, struct faccessat_args *uap)
1816{
1817
1818	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
1819	    uap->amode));
1820}
1821
1822int
1823kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
1824    int flag, int amode)
1825{
1826	struct ucred *cred, *usecred;
1827	struct vnode *vp;
1828	struct nameidata nd;
1829	cap_rights_t rights;
1830	int error;
1831
1832	if (flag & ~AT_EACCESS)
1833		return (EINVAL);
1834	if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
1835		return (EINVAL);
1836
1837	/*
1838	 * Create and modify a temporary credential instead of one that
1839	 * is potentially shared (if we need one).
1840	 */
1841	cred = td->td_ucred;
1842	if ((flag & AT_EACCESS) == 0 &&
1843	    ((cred->cr_uid != cred->cr_ruid ||
1844	    cred->cr_rgid != cred->cr_groups[0]))) {
1845		usecred = crdup(cred);
1846		usecred->cr_uid = cred->cr_ruid;
1847		usecred->cr_groups[0] = cred->cr_rgid;
1848		td->td_ucred = usecred;
1849	} else
1850		usecred = cred;
1851	AUDIT_ARG_VALUE(amode);
1852	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
1853	    AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
1854	    td);
1855	if ((error = namei(&nd)) != 0)
1856		goto out;
1857	vp = nd.ni_vp;
1858
1859	error = vn_access(vp, amode, usecred, td);
1860	NDFREE(&nd, NDF_ONLY_PNBUF);
1861	vput(vp);
1862out:
1863	if (usecred != cred) {
1864		td->td_ucred = cred;
1865		crfree(usecred);
1866	}
1867	return (error);
1868}
1869
1870/*
1871 * Check access permissions using "effective" credentials.
1872 */
1873#ifndef _SYS_SYSPROTO_H_
1874struct eaccess_args {
1875	char	*path;
1876	int	amode;
1877};
1878#endif
1879int
1880sys_eaccess(struct thread *td, struct eaccess_args *uap)
1881{
1882
1883	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
1884	    AT_EACCESS, uap->amode));
1885}
1886
1887#if defined(COMPAT_43)
1888/*
1889 * Get file status; this version follows links.
1890 */
1891#ifndef _SYS_SYSPROTO_H_
1892struct ostat_args {
1893	char	*path;
1894	struct ostat *ub;
1895};
1896#endif
1897int
1898ostat(struct thread *td, struct ostat_args *uap)
1899{
1900	struct stat sb;
1901	struct ostat osb;
1902	int error;
1903
1904	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
1905	    &sb, NULL);
1906	if (error != 0)
1907		return (error);
1908	cvtstat(&sb, &osb);
1909	return (copyout(&osb, uap->ub, sizeof (osb)));
1910}
1911
1912/*
1913 * Get file status; this version does not follow links.
1914 */
1915#ifndef _SYS_SYSPROTO_H_
1916struct olstat_args {
1917	char	*path;
1918	struct ostat *ub;
1919};
1920#endif
1921int
1922olstat(struct thread *td, struct olstat_args *uap)
1923{
1924	struct stat sb;
1925	struct ostat osb;
1926	int error;
1927
1928	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
1929	    UIO_USERSPACE, &sb, NULL);
1930	if (error != 0)
1931		return (error);
1932	cvtstat(&sb, &osb);
1933	return (copyout(&osb, uap->ub, sizeof (osb)));
1934}
1935
/*
 * Convert from a new to an old stat structure.
 */
1939void
1940cvtstat(struct stat *st, struct ostat *ost)
1941{
1942
1943	bzero(ost, sizeof(*ost));
1944	ost->st_dev = st->st_dev;
1945	ost->st_ino = st->st_ino;
1946	ost->st_mode = st->st_mode;
1947	ost->st_nlink = st->st_nlink;
1948	ost->st_uid = st->st_uid;
1949	ost->st_gid = st->st_gid;
1950	ost->st_rdev = st->st_rdev;
1951	if (st->st_size < (quad_t)1 << 32)
1952		ost->st_size = st->st_size;
1953	else
1954		ost->st_size = -2;
1955	ost->st_atim = st->st_atim;
1956	ost->st_mtim = st->st_mtim;
1957	ost->st_ctim = st->st_ctim;
1958	ost->st_blksize = st->st_blksize;
1959	ost->st_blocks = st->st_blocks;
1960	ost->st_flags = st->st_flags;
1961	ost->st_gen = st->st_gen;
1962}
1963#endif /* COMPAT_43 */
1964
1965/*
1966 * Get file status; this version follows links.
1967 */
1968#ifndef _SYS_SYSPROTO_H_
1969struct stat_args {
1970	char	*path;
1971	struct stat *ub;
1972};
1973#endif
1974int
1975sys_stat(struct thread *td, struct stat_args *uap)
1976{
1977	struct stat sb;
1978	int error;
1979
1980	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
1981	    &sb, NULL);
1982	if (error == 0)
1983		error = copyout(&sb, uap->ub, sizeof (sb));
1984	return (error);
1985}
1986
1987#ifndef _SYS_SYSPROTO_H_
1988struct fstatat_args {
1989	int	fd;
1990	char	*path;
1991	struct stat	*buf;
1992	int	flag;
1993}
1994#endif
1995int
1996sys_fstatat(struct thread *td, struct fstatat_args *uap)
1997{
1998	struct stat sb;
1999	int error;
2000
2001	error = kern_statat(td, uap->flag, uap->fd, uap->path,
2002	    UIO_USERSPACE, &sb, NULL);
2003	if (error == 0)
2004		error = copyout(&sb, uap->buf, sizeof (sb));
2005	return (error);
2006}
2007
2008int
2009kern_statat(struct thread *td, int flag, int fd, char *path,
2010    enum uio_seg pathseg, struct stat *sbp,
2011    void (*hook)(struct vnode *vp, struct stat *sbp))
2012{
2013	struct nameidata nd;
2014	struct stat sb;
2015	cap_rights_t rights;
2016	int error;
2017
2018	if (flag & ~AT_SYMLINK_NOFOLLOW)
2019		return (EINVAL);
2020
2021	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2022	    FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
2023	    cap_rights_init(&rights, CAP_FSTAT), td);
2024
2025	if ((error = namei(&nd)) != 0)
2026		return (error);
2027	error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
2028	if (error == 0) {
2029		SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
2030		if (S_ISREG(sb.st_mode))
2031			SDT_PROBE2(vfs, , stat, reg, path, pathseg);
2032		if (__predict_false(hook != NULL))
2033			hook(nd.ni_vp, &sb);
2034	}
2035	NDFREE(&nd, NDF_ONLY_PNBUF);
2036	vput(nd.ni_vp);
2037	if (error != 0)
2038		return (error);
2039	*sbp = sb;
2040#ifdef KTRACE
2041	if (KTRPOINT(td, KTR_STRUCT))
2042		ktrstat(&sb);
2043#endif
2044	return (0);
2045}
2046
2047/*
2048 * Get file status; this version does not follow links.
2049 */
2050#ifndef _SYS_SYSPROTO_H_
2051struct lstat_args {
2052	char	*path;
2053	struct stat *ub;
2054};
2055#endif
2056int
2057sys_lstat(struct thread *td, struct lstat_args *uap)
2058{
2059	struct stat sb;
2060	int error;
2061
2062	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2063	    UIO_USERSPACE, &sb, NULL);
2064	if (error == 0)
2065		error = copyout(&sb, uap->ub, sizeof (sb));
2066	return (error);
2067}
2068
2069/*
2070 * Implementation of the NetBSD [l]stat() functions.
2071 */
2072void
2073cvtnstat( struct stat *sb, struct nstat *nsb)
2074{
2075
2076	bzero(nsb, sizeof *nsb);
2077	nsb->st_dev = sb->st_dev;
2078	nsb->st_ino = sb->st_ino;
2079	nsb->st_mode = sb->st_mode;
2080	nsb->st_nlink = sb->st_nlink;
2081	nsb->st_uid = sb->st_uid;
2082	nsb->st_gid = sb->st_gid;
2083	nsb->st_rdev = sb->st_rdev;
2084	nsb->st_atim = sb->st_atim;
2085	nsb->st_mtim = sb->st_mtim;
2086	nsb->st_ctim = sb->st_ctim;
2087	nsb->st_size = sb->st_size;
2088	nsb->st_blocks = sb->st_blocks;
2089	nsb->st_blksize = sb->st_blksize;
2090	nsb->st_flags = sb->st_flags;
2091	nsb->st_gen = sb->st_gen;
2092	nsb->st_birthtim = sb->st_birthtim;
2093}
2094
2095#ifndef _SYS_SYSPROTO_H_
2096struct nstat_args {
2097	char	*path;
2098	struct nstat *ub;
2099};
2100#endif
2101int
2102sys_nstat(struct thread *td, struct nstat_args *uap)
2103{
2104	struct stat sb;
2105	struct nstat nsb;
2106	int error;
2107
2108	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
2109	    &sb, NULL);
2110	if (error != 0)
2111		return (error);
2112	cvtnstat(&sb, &nsb);
2113	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2114}
2115
2116/*
2117 * NetBSD lstat.  Get file status; this version does not follow links.
2118 */
2119#ifndef _SYS_SYSPROTO_H_
2120struct lstat_args {
2121	char	*path;
2122	struct stat *ub;
2123};
2124#endif
2125int
2126sys_nlstat(struct thread *td, struct nlstat_args *uap)
2127{
2128	struct stat sb;
2129	struct nstat nsb;
2130	int error;
2131
2132	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
2133	    UIO_USERSPACE, &sb, NULL);
2134	if (error != 0)
2135		return (error);
2136	cvtnstat(&sb, &nsb);
2137	return (copyout(&nsb, uap->ub, sizeof (nsb)));
2138}
2139
2140/*
2141 * Get configurable pathname variables.
2142 */
2143#ifndef _SYS_SYSPROTO_H_
2144struct pathconf_args {
2145	char	*path;
2146	int	name;
2147};
2148#endif
2149int
2150sys_pathconf(struct thread *td, struct pathconf_args *uap)
2151{
2152
2153	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
2154}
2155
2156#ifndef _SYS_SYSPROTO_H_
2157struct lpathconf_args {
2158	char	*path;
2159	int	name;
2160};
2161#endif
2162int
2163sys_lpathconf(struct thread *td, struct lpathconf_args *uap)
2164{
2165
2166	return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
2167	    NOFOLLOW));
2168}
2169
2170int
2171kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
2172    u_long flags)
2173{
2174	struct nameidata nd;
2175	int error;
2176
2177	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
2178	    pathseg, path, td);
2179	if ((error = namei(&nd)) != 0)
2180		return (error);
2181	NDFREE(&nd, NDF_ONLY_PNBUF);
2182
2183	error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
2184	vput(nd.ni_vp);
2185	return (error);
2186}
2187
2188/*
2189 * Return target name of a symbolic link.
2190 */
2191#ifndef _SYS_SYSPROTO_H_
2192struct readlink_args {
2193	char	*path;
2194	char	*buf;
2195	size_t	count;
2196};
2197#endif
2198int
2199sys_readlink(struct thread *td, struct readlink_args *uap)
2200{
2201
2202	return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2203	    uap->buf, UIO_USERSPACE, uap->count));
2204}
2205#ifndef _SYS_SYSPROTO_H_
2206struct readlinkat_args {
2207	int	fd;
2208	char	*path;
2209	char	*buf;
2210	size_t	bufsize;
2211};
2212#endif
2213int
2214sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
2215{
2216
2217	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
2218	    uap->buf, UIO_USERSPACE, uap->bufsize));
2219}
2220
2221int
2222kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2223    char *buf, enum uio_seg bufseg, size_t count)
2224{
2225	struct vnode *vp;
2226	struct iovec aiov;
2227	struct uio auio;
2228	struct nameidata nd;
2229	int error;
2230
2231	if (count > IOSIZE_MAX)
2232		return (EINVAL);
2233
2234	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
2235	    pathseg, path, fd, td);
2236
2237	if ((error = namei(&nd)) != 0)
2238		return (error);
2239	NDFREE(&nd, NDF_ONLY_PNBUF);
2240	vp = nd.ni_vp;
2241#ifdef MAC
2242	error = mac_vnode_check_readlink(td->td_ucred, vp);
2243	if (error != 0) {
2244		vput(vp);
2245		return (error);
2246	}
2247#endif
2248	if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0)
2249		error = EINVAL;
2250	else {
2251		aiov.iov_base = buf;
2252		aiov.iov_len = count;
2253		auio.uio_iov = &aiov;
2254		auio.uio_iovcnt = 1;
2255		auio.uio_offset = 0;
2256		auio.uio_rw = UIO_READ;
2257		auio.uio_segflg = bufseg;
2258		auio.uio_td = td;
2259		auio.uio_resid = count;
2260		error = VOP_READLINK(vp, &auio, td->td_ucred);
2261		td->td_retval[0] = count - auio.uio_resid;
2262	}
2263	vput(vp);
2264	return (error);
2265}
2266
2267/*
2268 * Common implementation code for chflags() and fchflags().
2269 */
2270static int
2271setfflags(struct thread *td, struct vnode *vp, u_long flags)
2272{
2273	struct mount *mp;
2274	struct vattr vattr;
2275	int error;
2276
2277	/* We can't support the value matching VNOVAL. */
2278	if (flags == VNOVAL)
2279		return (EOPNOTSUPP);
2280
2281	/*
2282	 * Prevent non-root users from setting flags on devices.  When
2283	 * a device is reused, users can retain ownership of the device
2284	 * if they are allowed to set flags and programs assume that
2285	 * chown can't fail when done as root.
2286	 */
2287	if (vp->v_type == VCHR || vp->v_type == VBLK) {
2288		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
2289		if (error != 0)
2290			return (error);
2291	}
2292
2293	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2294		return (error);
2295	VATTR_NULL(&vattr);
2296	vattr.va_flags = flags;
2297	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2298#ifdef MAC
2299	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
2300	if (error == 0)
2301#endif
2302		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2303	VOP_UNLOCK(vp, 0);
2304	vn_finished_write(mp);
2305	return (error);
2306}
2307
2308/*
2309 * Change flags of a file given a path name.
2310 */
2311#ifndef _SYS_SYSPROTO_H_
2312struct chflags_args {
2313	const char *path;
2314	u_long	flags;
2315};
2316#endif
2317int
2318sys_chflags(struct thread *td, struct chflags_args *uap)
2319{
2320
2321	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2322	    uap->flags, 0));
2323}
2324
2325#ifndef _SYS_SYSPROTO_H_
2326struct chflagsat_args {
2327	int	fd;
2328	const char *path;
2329	u_long	flags;
2330	int	atflag;
2331}
2332#endif
2333int
2334sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
2335{
2336	int fd = uap->fd;
2337	const char *path = uap->path;
2338	u_long flags = uap->flags;
2339	int atflag = uap->atflag;
2340
2341	if (atflag & ~AT_SYMLINK_NOFOLLOW)
2342		return (EINVAL);
2343
2344	return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
2345}
2346
2347/*
2348 * Same as chflags() but doesn't follow symlinks.
2349 */
2350#ifndef _SYS_SYSPROTO_H_
2351struct lchflags_args {
2352	const char *path;
2353	u_long flags;
2354};
2355#endif
2356int
2357sys_lchflags(struct thread *td, struct lchflags_args *uap)
2358{
2359
2360	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2361	    uap->flags, AT_SYMLINK_NOFOLLOW));
2362}
2363
2364static int
2365kern_chflagsat(struct thread *td, int fd, const char *path,
2366    enum uio_seg pathseg, u_long flags, int atflag)
2367{
2368	struct nameidata nd;
2369	cap_rights_t rights;
2370	int error, follow;
2371
2372	AUDIT_ARG_FFLAGS(flags);
2373	follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2374	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2375	    cap_rights_init(&rights, CAP_FCHFLAGS), td);
2376	if ((error = namei(&nd)) != 0)
2377		return (error);
2378	NDFREE(&nd, NDF_ONLY_PNBUF);
2379	error = setfflags(td, nd.ni_vp, flags);
2380	vrele(nd.ni_vp);
2381	return (error);
2382}
2383
2384/*
2385 * Change flags of a file given a file descriptor.
2386 */
2387#ifndef _SYS_SYSPROTO_H_
2388struct fchflags_args {
2389	int	fd;
2390	u_long	flags;
2391};
2392#endif
2393int
2394sys_fchflags(struct thread *td, struct fchflags_args *uap)
2395{
2396	struct file *fp;
2397	cap_rights_t rights;
2398	int error;
2399
2400	AUDIT_ARG_FD(uap->fd);
2401	AUDIT_ARG_FFLAGS(uap->flags);
2402	error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_FCHFLAGS),
2403	    &fp);
2404	if (error != 0)
2405		return (error);
2406#ifdef AUDIT
2407	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2408	AUDIT_ARG_VNODE1(fp->f_vnode);
2409	VOP_UNLOCK(fp->f_vnode, 0);
2410#endif
2411	error = setfflags(td, fp->f_vnode, uap->flags);
2412	fdrop(fp, td);
2413	return (error);
2414}
2415
2416/*
2417 * Common implementation code for chmod(), lchmod() and fchmod().
2418 */
2419int
2420setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode)
2421{
2422	struct mount *mp;
2423	struct vattr vattr;
2424	int error;
2425
2426	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2427		return (error);
2428	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2429	VATTR_NULL(&vattr);
2430	vattr.va_mode = mode & ALLPERMS;
2431#ifdef MAC
2432	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
2433	if (error == 0)
2434#endif
2435		error = VOP_SETATTR(vp, &vattr, cred);
2436	VOP_UNLOCK(vp, 0);
2437	vn_finished_write(mp);
2438	return (error);
2439}
2440
2441/*
2442 * Change mode of a file given path name.
2443 */
2444#ifndef _SYS_SYSPROTO_H_
2445struct chmod_args {
2446	char	*path;
2447	int	mode;
2448};
2449#endif
2450int
2451sys_chmod(struct thread *td, struct chmod_args *uap)
2452{
2453
2454	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2455	    uap->mode, 0));
2456}
2457
2458#ifndef _SYS_SYSPROTO_H_
2459struct fchmodat_args {
2460	int	dirfd;
2461	char	*path;
2462	mode_t	mode;
2463	int	flag;
2464}
2465#endif
2466int
2467sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
2468{
2469	int flag = uap->flag;
2470	int fd = uap->fd;
2471	char *path = uap->path;
2472	mode_t mode = uap->mode;
2473
2474	if (flag & ~AT_SYMLINK_NOFOLLOW)
2475		return (EINVAL);
2476
2477	return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
2478}
2479
2480/*
2481 * Change mode of a file given path name (don't follow links.)
2482 */
2483#ifndef _SYS_SYSPROTO_H_
2484struct lchmod_args {
2485	char	*path;
2486	int	mode;
2487};
2488#endif
2489int
2490sys_lchmod(struct thread *td, struct lchmod_args *uap)
2491{
2492
2493	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2494	    uap->mode, AT_SYMLINK_NOFOLLOW));
2495}
2496
2497int
2498kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2499    mode_t mode, int flag)
2500{
2501	struct nameidata nd;
2502	cap_rights_t rights;
2503	int error, follow;
2504
2505	AUDIT_ARG_MODE(mode);
2506	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2507	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2508	    cap_rights_init(&rights, CAP_FCHMOD), td);
2509	if ((error = namei(&nd)) != 0)
2510		return (error);
2511	NDFREE(&nd, NDF_ONLY_PNBUF);
2512	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
2513	vrele(nd.ni_vp);
2514	return (error);
2515}
2516
2517/*
2518 * Change mode of a file given a file descriptor.
2519 */
2520#ifndef _SYS_SYSPROTO_H_
2521struct fchmod_args {
2522	int	fd;
2523	int	mode;
2524};
2525#endif
2526int
2527sys_fchmod(struct thread *td, struct fchmod_args *uap)
2528{
2529	struct file *fp;
2530	cap_rights_t rights;
2531	int error;
2532
2533	AUDIT_ARG_FD(uap->fd);
2534	AUDIT_ARG_MODE(uap->mode);
2535
2536	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
2537	if (error != 0)
2538		return (error);
2539	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
2540	fdrop(fp, td);
2541	return (error);
2542}
2543
2544/*
2545 * Common implementation for chown(), lchown(), and fchown()
2546 */
2547int
2548setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
2549    gid_t gid)
2550{
2551	struct mount *mp;
2552	struct vattr vattr;
2553	int error;
2554
2555	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2556		return (error);
2557	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2558	VATTR_NULL(&vattr);
2559	vattr.va_uid = uid;
2560	vattr.va_gid = gid;
2561#ifdef MAC
2562	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
2563	    vattr.va_gid);
2564	if (error == 0)
2565#endif
2566		error = VOP_SETATTR(vp, &vattr, cred);
2567	VOP_UNLOCK(vp, 0);
2568	vn_finished_write(mp);
2569	return (error);
2570}
2571
2572/*
2573 * Set ownership given a path name.
2574 */
2575#ifndef _SYS_SYSPROTO_H_
2576struct chown_args {
2577	char	*path;
2578	int	uid;
2579	int	gid;
2580};
2581#endif
2582int
2583sys_chown(struct thread *td, struct chown_args *uap)
2584{
2585
2586	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
2587	    uap->gid, 0));
2588}
2589
2590#ifndef _SYS_SYSPROTO_H_
2591struct fchownat_args {
2592	int fd;
2593	const char * path;
2594	uid_t uid;
2595	gid_t gid;
2596	int flag;
2597};
2598#endif
2599int
2600sys_fchownat(struct thread *td, struct fchownat_args *uap)
2601{
2602	int flag;
2603
2604	flag = uap->flag;
2605	if (flag & ~AT_SYMLINK_NOFOLLOW)
2606		return (EINVAL);
2607
2608	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
2609	    uap->gid, uap->flag));
2610}
2611
2612int
2613kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2614    int uid, int gid, int flag)
2615{
2616	struct nameidata nd;
2617	cap_rights_t rights;
2618	int error, follow;
2619
2620	AUDIT_ARG_OWNER(uid, gid);
2621	follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
2622	NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
2623	    cap_rights_init(&rights, CAP_FCHOWN), td);
2624
2625	if ((error = namei(&nd)) != 0)
2626		return (error);
2627	NDFREE(&nd, NDF_ONLY_PNBUF);
2628	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
2629	vrele(nd.ni_vp);
2630	return (error);
2631}
2632
2633/*
2634 * Set ownership given a path name, do not cross symlinks.
2635 */
2636#ifndef _SYS_SYSPROTO_H_
2637struct lchown_args {
2638	char	*path;
2639	int	uid;
2640	int	gid;
2641};
2642#endif
2643int
2644sys_lchown(struct thread *td, struct lchown_args *uap)
2645{
2646
2647	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2648	    uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
2649}
2650
2651/*
2652 * Set ownership given a file descriptor.
2653 */
2654#ifndef _SYS_SYSPROTO_H_
2655struct fchown_args {
2656	int	fd;
2657	int	uid;
2658	int	gid;
2659};
2660#endif
2661int
2662sys_fchown(struct thread *td, struct fchown_args *uap)
2663{
2664	struct file *fp;
2665	cap_rights_t rights;
2666	int error;
2667
2668	AUDIT_ARG_FD(uap->fd);
2669	AUDIT_ARG_OWNER(uap->uid, uap->gid);
2670	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
2671	if (error != 0)
2672		return (error);
2673	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
2674	fdrop(fp, td);
2675	return (error);
2676}
2677
2678/*
2679 * Common implementation code for utimes(), lutimes(), and futimes().
2680 */
2681static int
2682getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg,
2683    struct timespec *tsp)
2684{
2685	struct timeval tv[2];
2686	const struct timeval *tvp;
2687	int error;
2688
2689	if (usrtvp == NULL) {
2690		vfs_timestamp(&tsp[0]);
2691		tsp[1] = tsp[0];
2692	} else {
2693		if (tvpseg == UIO_SYSSPACE) {
2694			tvp = usrtvp;
2695		} else {
2696			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
2697				return (error);
2698			tvp = tv;
2699		}
2700
2701		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
2702		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
2703			return (EINVAL);
2704		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
2705		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
2706	}
2707	return (0);
2708}
2709
2710/*
2711 * Common implementation code for futimens(), utimensat().
2712 */
2713#define	UTIMENS_NULL	0x1
2714#define	UTIMENS_EXIT	0x2
2715static int
2716getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
2717    struct timespec *tsp, int *retflags)
2718{
2719	struct timespec tsnow;
2720	int error;
2721
2722	vfs_timestamp(&tsnow);
2723	*retflags = 0;
2724	if (usrtsp == NULL) {
2725		tsp[0] = tsnow;
2726		tsp[1] = tsnow;
2727		*retflags |= UTIMENS_NULL;
2728		return (0);
2729	}
2730	if (tspseg == UIO_SYSSPACE) {
2731		tsp[0] = usrtsp[0];
2732		tsp[1] = usrtsp[1];
2733	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
2734		return (error);
2735	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
2736		*retflags |= UTIMENS_EXIT;
2737	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
2738		*retflags |= UTIMENS_NULL;
2739	if (tsp[0].tv_nsec == UTIME_OMIT)
2740		tsp[0].tv_sec = VNOVAL;
2741	else if (tsp[0].tv_nsec == UTIME_NOW)
2742		tsp[0] = tsnow;
2743	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
2744		return (EINVAL);
2745	if (tsp[1].tv_nsec == UTIME_OMIT)
2746		tsp[1].tv_sec = VNOVAL;
2747	else if (tsp[1].tv_nsec == UTIME_NOW)
2748		tsp[1] = tsnow;
2749	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
2750		return (EINVAL);
2751
2752	return (0);
2753}
2754
2755/*
2756 * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
2757 * and utimensat().
2758 */
2759static int
2760setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts,
2761    int numtimes, int nullflag)
2762{
2763	struct mount *mp;
2764	struct vattr vattr;
2765	int error, setbirthtime;
2766
2767	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
2768		return (error);
2769	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2770	setbirthtime = 0;
2771	if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
2772	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
2773		setbirthtime = 1;
2774	VATTR_NULL(&vattr);
2775	vattr.va_atime = ts[0];
2776	vattr.va_mtime = ts[1];
2777	if (setbirthtime)
2778		vattr.va_birthtime = ts[1];
2779	if (numtimes > 2)
2780		vattr.va_birthtime = ts[2];
2781	if (nullflag)
2782		vattr.va_vaflags |= VA_UTIMES_NULL;
2783#ifdef MAC
2784	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
2785	    vattr.va_mtime);
2786#endif
2787	if (error == 0)
2788		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
2789	VOP_UNLOCK(vp, 0);
2790	vn_finished_write(mp);
2791	return (error);
2792}
2793
2794/*
2795 * Set the access and modification times of a file.
2796 */
2797#ifndef _SYS_SYSPROTO_H_
2798struct utimes_args {
2799	char	*path;
2800	struct	timeval *tptr;
2801};
2802#endif
2803int
2804sys_utimes(struct thread *td, struct utimes_args *uap)
2805{
2806
2807	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
2808	    uap->tptr, UIO_USERSPACE));
2809}
2810
2811#ifndef _SYS_SYSPROTO_H_
2812struct futimesat_args {
2813	int fd;
2814	const char * path;
2815	const struct timeval * times;
2816};
2817#endif
2818int
2819sys_futimesat(struct thread *td, struct futimesat_args *uap)
2820{
2821
2822	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
2823	    uap->times, UIO_USERSPACE));
2824}
2825
2826int
2827kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2828    struct timeval *tptr, enum uio_seg tptrseg)
2829{
2830	struct nameidata nd;
2831	struct timespec ts[2];
2832	cap_rights_t rights;
2833	int error;
2834
2835	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
2836		return (error);
2837	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
2838	    cap_rights_init(&rights, CAP_FUTIMES), td);
2839
2840	if ((error = namei(&nd)) != 0)
2841		return (error);
2842	NDFREE(&nd, NDF_ONLY_PNBUF);
2843	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
2844	vrele(nd.ni_vp);
2845	return (error);
2846}
2847
2848/*
2849 * Set the access and modification times of a file.
2850 */
2851#ifndef _SYS_SYSPROTO_H_
2852struct lutimes_args {
2853	char	*path;
2854	struct	timeval *tptr;
2855};
2856#endif
2857int
2858sys_lutimes(struct thread *td, struct lutimes_args *uap)
2859{
2860
2861	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
2862	    UIO_USERSPACE));
2863}
2864
2865int
2866kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
2867    struct timeval *tptr, enum uio_seg tptrseg)
2868{
2869	struct timespec ts[2];
2870	struct nameidata nd;
2871	int error;
2872
2873	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
2874		return (error);
2875	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
2876	if ((error = namei(&nd)) != 0)
2877		return (error);
2878	NDFREE(&nd, NDF_ONLY_PNBUF);
2879	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
2880	vrele(nd.ni_vp);
2881	return (error);
2882}
2883
2884/*
2885 * Set the access and modification times of a file.
2886 */
2887#ifndef _SYS_SYSPROTO_H_
2888struct futimes_args {
2889	int	fd;
2890	struct	timeval *tptr;
2891};
2892#endif
2893int
2894sys_futimes(struct thread *td, struct futimes_args *uap)
2895{
2896
2897	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
2898}
2899
2900int
2901kern_futimes(struct thread *td, int fd, struct timeval *tptr,
2902    enum uio_seg tptrseg)
2903{
2904	struct timespec ts[2];
2905	struct file *fp;
2906	cap_rights_t rights;
2907	int error;
2908
2909	AUDIT_ARG_FD(fd);
2910	error = getutimes(tptr, tptrseg, ts);
2911	if (error != 0)
2912		return (error);
2913	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp);
2914	if (error != 0)
2915		return (error);
2916#ifdef AUDIT
2917	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2918	AUDIT_ARG_VNODE1(fp->f_vnode);
2919	VOP_UNLOCK(fp->f_vnode, 0);
2920#endif
2921	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
2922	fdrop(fp, td);
2923	return (error);
2924}
2925
2926int
2927sys_futimens(struct thread *td, struct futimens_args *uap)
2928{
2929
2930	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
2931}
2932
2933int
2934kern_futimens(struct thread *td, int fd, struct timespec *tptr,
2935    enum uio_seg tptrseg)
2936{
2937	struct timespec ts[2];
2938	struct file *fp;
2939	cap_rights_t rights;
2940	int error, flags;
2941
2942	AUDIT_ARG_FD(fd);
2943	error = getutimens(tptr, tptrseg, ts, &flags);
2944	if (error != 0)
2945		return (error);
2946	if (flags & UTIMENS_EXIT)
2947		return (0);
2948	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FUTIMES), &fp);
2949	if (error != 0)
2950		return (error);
2951#ifdef AUDIT
2952	vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
2953	AUDIT_ARG_VNODE1(fp->f_vnode);
2954	VOP_UNLOCK(fp->f_vnode, 0);
2955#endif
2956	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
2957	fdrop(fp, td);
2958	return (error);
2959}
2960
2961int
2962sys_utimensat(struct thread *td, struct utimensat_args *uap)
2963{
2964
2965	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
2966	    uap->times, UIO_USERSPACE, uap->flag));
2967}
2968
2969int
2970kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
2971    struct timespec *tptr, enum uio_seg tptrseg, int flag)
2972{
2973	struct nameidata nd;
2974	struct timespec ts[2];
2975	cap_rights_t rights;
2976	int error, flags;
2977
2978	if (flag & ~AT_SYMLINK_NOFOLLOW)
2979		return (EINVAL);
2980
2981	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
2982		return (error);
2983	NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
2984	    FOLLOW) | AUDITVNODE1, pathseg, path, fd,
2985	    cap_rights_init(&rights, CAP_FUTIMES), td);
2986	if ((error = namei(&nd)) != 0)
2987		return (error);
2988	/*
2989	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
2990	 * POSIX states:
2991	 * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
2992	 * "Search permission is denied by a component of the path prefix."
2993	 */
2994	NDFREE(&nd, NDF_ONLY_PNBUF);
2995	if ((flags & UTIMENS_EXIT) == 0)
2996		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
2997	vrele(nd.ni_vp);
2998	return (error);
2999}
3000
3001/*
3002 * Truncate a file given its path name.
3003 */
3004#ifndef _SYS_SYSPROTO_H_
3005struct truncate_args {
3006	char	*path;
3007	int	pad;
3008	off_t	length;
3009};
3010#endif
3011int
3012sys_truncate(struct thread *td, struct truncate_args *uap)
3013{
3014
3015	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3016}
3017
/*
 * Set the size of the file named by path to length.
 *
 * Returns EINVAL for a negative length and EISDIR when the path resolves
 * to a directory; otherwise the error from the lookup, vn_start_write(),
 * the MAC write check (when compiled in), vn_writechk(), VOP_ACCESS() or
 * VOP_SETATTR().
 */
int
kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
{
	struct mount *mp;
	struct vnode *vp;
	void *rl_cookie;
	struct vattr vattr;
	struct nameidata nd;
	int error;

	if (length < 0)
		return(EINVAL);
	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	/* Write-lock the whole byte range [0, OFF_MAX) of the file. */
	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
		/* Undo the range lock and the lookup reference. */
		vn_rangelock_unlock(vp, rl_cookie);
		vrele(vp);
		return (error);
	}
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR)
		error = EISDIR;
#ifdef MAC
	/* A MAC refusal skips the checks below; the empty body is intended. */
	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
	}
#endif
	else if ((error = vn_writechk(vp)) == 0 &&
	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
		/* Only the size attribute is set; all others stay VNOVAL. */
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
	}
	/* Release in the reverse order of acquisition. */
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	vn_rangelock_unlock(vp, rl_cookie);
	vrele(vp);
	return (error);
}
3060
3061#if defined(COMPAT_43)
3062/*
3063 * Truncate a file given its path name.
3064 */
3065#ifndef _SYS_SYSPROTO_H_
3066struct otruncate_args {
3067	char	*path;
3068	long	length;
3069};
3070#endif
3071int
3072otruncate(struct thread *td, struct otruncate_args *uap)
3073{
3074
3075	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3076}
3077#endif /* COMPAT_43 */
3078
3079#if defined(COMPAT_FREEBSD6)
3080/* Versions with the pad argument */
3081int
3082freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
3083{
3084
3085	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
3086}
3087
3088int
3089freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
3090{
3091
3092	return (kern_ftruncate(td, uap->fd, uap->length));
3093}
3094#endif
3095
3096int
3097kern_fsync(struct thread *td, int fd, bool fullsync)
3098{
3099	struct vnode *vp;
3100	struct mount *mp;
3101	struct file *fp;
3102	cap_rights_t rights;
3103	int error, lock_flags;
3104
3105	AUDIT_ARG_FD(fd);
3106	error = getvnode(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
3107	if (error != 0)
3108		return (error);
3109	vp = fp->f_vnode;
3110#if 0
3111	if (!fullsync)
3112		/* XXXKIB: compete outstanding aio writes */;
3113#endif
3114	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
3115	if (error != 0)
3116		goto drop;
3117	if (MNT_SHARED_WRITES(mp) ||
3118	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
3119		lock_flags = LK_SHARED;
3120	} else {
3121		lock_flags = LK_EXCLUSIVE;
3122	}
3123	vn_lock(vp, lock_flags | LK_RETRY);
3124	AUDIT_ARG_VNODE1(vp);
3125	if (vp->v_object != NULL) {
3126		VM_OBJECT_WLOCK(vp->v_object);
3127		vm_object_page_clean(vp->v_object, 0, 0, 0);
3128		VM_OBJECT_WUNLOCK(vp->v_object);
3129	}
3130	error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
3131	VOP_UNLOCK(vp, 0);
3132	vn_finished_write(mp);
3133drop:
3134	fdrop(fp, td);
3135	return (error);
3136}
3137
/*
 * fsync(2): sync an open file.  Calls kern_fsync() with fullsync set.
 */
#ifndef _SYS_SYSPROTO_H_
struct fsync_args {
	int	fd;
};
#endif
int
sys_fsync(struct thread *td, struct fsync_args *uap)
{
	int error;

	error = kern_fsync(td, uap->fd, true);
	return (error);
}
3152
3153int
3154sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
3155{
3156
3157	return (kern_fsync(td, uap->fd, false));
3158}
3159
3160/*
3161 * Rename files.  Source and destination must either both be directories, or
3162 * both not be directories.  If target is a directory, it must be empty.
3163 */
3164#ifndef _SYS_SYSPROTO_H_
3165struct rename_args {
3166	char	*from;
3167	char	*to;
3168};
3169#endif
3170int
3171sys_rename(struct thread *td, struct rename_args *uap)
3172{
3173
3174	return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
3175	    uap->to, UIO_USERSPACE));
3176}
3177
3178#ifndef _SYS_SYSPROTO_H_
3179struct renameat_args {
3180	int	oldfd;
3181	char	*old;
3182	int	newfd;
3183	char	*new;
3184};
3185#endif
3186int
3187sys_renameat(struct thread *td, struct renameat_args *uap)
3188{
3189
3190	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
3191	    UIO_USERSPACE));
3192}
3193
3194int
3195kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
3196    enum uio_seg pathseg)
3197{
3198	struct mount *mp = NULL;
3199	struct vnode *tvp, *fvp, *tdvp;
3200	struct nameidata fromnd, tond;
3201	cap_rights_t rights;
3202	int error;
3203
3204again:
3205	bwillwrite();
3206#ifdef MAC
3207	NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
3208	    AUDITVNODE1, pathseg, old, oldfd,
3209	    cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td);
3210#else
3211	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
3212	    pathseg, old, oldfd,
3213	    cap_rights_init(&rights, CAP_RENAMEAT_SOURCE), td);
3214#endif
3215
3216	if ((error = namei(&fromnd)) != 0)
3217		return (error);
3218#ifdef MAC
3219	error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
3220	    fromnd.ni_vp, &fromnd.ni_cnd);
3221	VOP_UNLOCK(fromnd.ni_dvp, 0);
3222	if (fromnd.ni_dvp != fromnd.ni_vp)
3223		VOP_UNLOCK(fromnd.ni_vp, 0);
3224#endif
3225	fvp = fromnd.ni_vp;
3226	NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
3227	    SAVESTART | AUDITVNODE2, pathseg, new, newfd,
3228	    cap_rights_init(&rights, CAP_RENAMEAT_TARGET), td);
3229	if (fromnd.ni_vp->v_type == VDIR)
3230		tond.ni_cnd.cn_flags |= WILLBEDIR;
3231	if ((error = namei(&tond)) != 0) {
3232		/* Translate error code for rename("dir1", "dir2/."). */
3233		if (error == EISDIR && fvp->v_type == VDIR)
3234			error = EINVAL;
3235		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3236		vrele(fromnd.ni_dvp);
3237		vrele(fvp);
3238		goto out1;
3239	}
3240	tdvp = tond.ni_dvp;
3241	tvp = tond.ni_vp;
3242	error = vn_start_write(fvp, &mp, V_NOWAIT);
3243	if (error != 0) {
3244		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3245		NDFREE(&tond, NDF_ONLY_PNBUF);
3246		if (tvp != NULL)
3247			vput(tvp);
3248		if (tdvp == tvp)
3249			vrele(tdvp);
3250		else
3251			vput(tdvp);
3252		vrele(fromnd.ni_dvp);
3253		vrele(fvp);
3254		vrele(tond.ni_startdir);
3255		if (fromnd.ni_startdir != NULL)
3256			vrele(fromnd.ni_startdir);
3257		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
3258		if (error != 0)
3259			return (error);
3260		goto again;
3261	}
3262	if (tvp != NULL) {
3263		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
3264			error = ENOTDIR;
3265			goto out;
3266		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
3267			error = EISDIR;
3268			goto out;
3269		}
3270#ifdef CAPABILITIES
3271		if (newfd != AT_FDCWD) {
3272			/*
3273			 * If the target already exists we require CAP_UNLINKAT
3274			 * from 'newfd'.
3275			 */
3276			error = cap_check(&tond.ni_filecaps.fc_rights,
3277			    cap_rights_init(&rights, CAP_UNLINKAT));
3278			if (error != 0)
3279				goto out;
3280		}
3281#endif
3282	}
3283	if (fvp == tdvp) {
3284		error = EINVAL;
3285		goto out;
3286	}
3287	/*
3288	 * If the source is the same as the destination (that is, if they
3289	 * are links to the same vnode), then there is nothing to do.
3290	 */
3291	if (fvp == tvp)
3292		error = -1;
3293#ifdef MAC
3294	else
3295		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
3296		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
3297#endif
3298out:
3299	if (error == 0) {
3300		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
3301		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
3302		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3303		NDFREE(&tond, NDF_ONLY_PNBUF);
3304	} else {
3305		NDFREE(&fromnd, NDF_ONLY_PNBUF);
3306		NDFREE(&tond, NDF_ONLY_PNBUF);
3307		if (tvp != NULL)
3308			vput(tvp);
3309		if (tdvp == tvp)
3310			vrele(tdvp);
3311		else
3312			vput(tdvp);
3313		vrele(fromnd.ni_dvp);
3314		vrele(fvp);
3315	}
3316	vrele(tond.ni_startdir);
3317	vn_finished_write(mp);
3318out1:
3319	if (fromnd.ni_startdir)
3320		vrele(fromnd.ni_startdir);
3321	if (error == -1)
3322		return (0);
3323	return (error);
3324}
3325
3326/*
3327 * Make a directory file.
3328 */
3329#ifndef _SYS_SYSPROTO_H_
3330struct mkdir_args {
3331	char	*path;
3332	int	mode;
3333};
3334#endif
3335int
3336sys_mkdir(struct thread *td, struct mkdir_args *uap)
3337{
3338
3339	return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
3340	    uap->mode));
3341}
3342
3343#ifndef _SYS_SYSPROTO_H_
3344struct mkdirat_args {
3345	int	fd;
3346	char	*path;
3347	mode_t	mode;
3348};
3349#endif
3350int
3351sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
3352{
3353
3354	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
3355}
3356
/*
 * Create the directory named by path (relative to fd).
 *
 * Returns EEXIST when the lookup finds an existing vnode; otherwise the
 * error from the lookup, the wait for write access, the MAC create check
 * (when compiled in) or VOP_MKDIR().  The mode of the new directory is
 * (mode & ACCESSPERMS) with the bits of the process' fd_cmask cleared.
 */
int
kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
    int mode)
{
	struct mount *mp;
	struct vnode *vp;
	struct vattr vattr;
	struct nameidata nd;
	cap_rights_t rights;
	int error;

	AUDIT_ARG_MODE(mode);
restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
	    NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
	    td);
	nd.ni_cnd.cn_flags |= WILLBEDIR;
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	/* A non-NULL leaf means the name already exists. */
	if (vp != NULL) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		/*
		 * XXX namei called with LOCKPARENT but not LOCKLEAF has
		 * the strange behaviour of leaving the vnode unlocked
		 * if the target is the same vnode as the parent.
		 */
		if (vp == nd.ni_dvp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		vrele(vp);
		return (EEXIST);
	}
	/*
	 * Try to start the write without sleeping.  On failure, release
	 * the parent, wait for write access and redo the lookup.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VDIR;
	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
	    &vattr);
	if (error != 0)
		goto out;
#endif
	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	/* On success VOP_MKDIR() stored the new vnode in nd.ni_vp. */
	if (error == 0)
		vput(nd.ni_vp);
	vn_finished_write(mp);
	return (error);
}
3419
3420/*
3421 * Remove a directory file.
3422 */
3423#ifndef _SYS_SYSPROTO_H_
3424struct rmdir_args {
3425	char	*path;
3426};
3427#endif
3428int
3429sys_rmdir(struct thread *td, struct rmdir_args *uap)
3430{
3431
3432	return (kern_rmdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE));
3433}
3434
/*
 * Remove the directory named by path (relative to fd).
 *
 * Returns ENOTDIR when the path does not name a directory, EINVAL when
 * the leaf and its parent are the same vnode (rmdir "."), EBUSY for the
 * root of a mounted filesystem; otherwise the error from the lookup, the
 * MAC unlink check (when compiled in), the wait for write access or
 * VOP_RMDIR().
 */
int
kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
{
	struct mount *mp;
	struct vnode *vp;
	struct nameidata nd;
	cap_rights_t rights;
	int error;

restart:
	bwillwrite();
	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
	    pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
	if ((error = namei(&nd)) != 0)
		return (error);
	vp = nd.ni_vp;
	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}
	/*
	 * No rmdir "." please.
	 */
	if (nd.ni_dvp == vp) {
		error = EINVAL;
		goto out;
	}
	/*
	 * The root of a mounted filesystem cannot be deleted.
	 */
	if (vp->v_vflag & VV_ROOT) {
		error = EBUSY;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
	    &nd.ni_cnd);
	if (error != 0)
		goto out;
#endif
	/*
	 * Try to start the write without sleeping.  On failure, release
	 * both vnodes, wait for write access and redo the lookup.
	 */
	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(vp);
		/* nd.ni_dvp == vp was already rejected with EINVAL above. */
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
	vn_finished_write(mp);
out:
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(vp);
	/* Parent and leaf are one vnode on the EINVAL path. */
	if (nd.ni_dvp == vp)
		vrele(nd.ni_dvp);
	else
		vput(nd.ni_dvp);
	return (error);
}
3498
3499#ifdef COMPAT_43
3500/*
3501 * Read a block of directory entries in a filesystem independent format.
3502 */
3503#ifndef _SYS_SYSPROTO_H_
3504struct ogetdirentries_args {
3505	int	fd;
3506	char	*buf;
3507	u_int	count;
3508	long	*basep;
3509};
3510#endif
3511int
3512ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
3513{
3514	long loff;
3515	int error;
3516
3517	error = kern_ogetdirentries(td, uap, &loff);
3518	if (error == 0)
3519		error = copyout(&loff, uap->basep, sizeof(long));
3520	return (error);
3521}
3522
/*
 * Back end of the old getdirentries: read up to uap->count bytes of
 * directory entries from the directory open on uap->fd into uap->buf.
 *
 * On success td->td_retval[0] holds the number of bytes transferred and
 * *ploff the file offset the read started from.  Returns EINVAL when
 * count exceeds 64KB or the vnode is not a directory, EBADF when the file
 * is not open for reading, EIO when an entry with a zero record length is
 * met during conversion; otherwise the error from getvnode(), the MAC
 * readdir check (when compiled in), VOP_READDIR() or uiomove().
 */
int
kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
    long *ploff)
{
	struct vnode *vp;
	struct file *fp;
	struct uio auio, kuio;
	struct iovec aiov, kiov;
	struct dirent *dp, *edp;
	cap_rights_t rights;
	caddr_t dirbuf;
	int error, eofflag, readcnt;
	long loff;
	off_t foffset;

	/* XXX arbitrary sanity limit on `count'. */
	if (uap->count > 64 * 1024)
		return (EINVAL);
	error = getvnode(td, uap->fd, cap_rights_init(&rights, CAP_READ), &fp);
	if (error != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	vp = fp->f_vnode;
	foffset = foffset_lock(fp, 0);
unionread:
	if (vp->v_type != VDIR) {
		foffset_unlock(fp, foffset, 0);
		fdrop(fp, td);
		return (EINVAL);
	}
	/* Describe the caller's buffer as a single user-space iovec. */
	aiov.iov_base = uap->buf;
	aiov.iov_len = uap->count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_td = td;
	auio.uio_resid = uap->count;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	loff = auio.uio_offset = foffset;
#ifdef MAC
	error = mac_vnode_check_readdir(td->td_ucred, vp);
	if (error != 0) {
		VOP_UNLOCK(vp, 0);
		/* This is the only path that passes FOF_NOUPDATE. */
		foffset_unlock(fp, foffset, FOF_NOUPDATE);
		fdrop(fp, td);
		return (error);
	}
#endif
	/*
	 * On hosts that are not little-endian, mounts with
	 * mnt_maxsymlinklen <= 0 are read straight into the caller's
	 * buffer; everything else is read into a kernel buffer, converted
	 * and then copied out.
	 */
#	if (BYTE_ORDER != LITTLE_ENDIAN)
		if (vp->v_mount->mnt_maxsymlinklen <= 0) {
			error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
			    NULL, NULL);
			foffset = auio.uio_offset;
		} else
#	endif
	{
		/* Same request as auio, but targeting a kernel buffer. */
		kuio = auio;
		kuio.uio_iov = &kiov;
		kuio.uio_segflg = UIO_SYSSPACE;
		kiov.iov_len = uap->count;
		dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
		kiov.iov_base = dirbuf;
		error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
			    NULL, NULL);
		foffset = kuio.uio_offset;
		if (error == 0) {
			readcnt = uap->count - kuio.uio_resid;
			edp = (struct dirent *)&dirbuf[readcnt];
			/* Rewrite each entry in place, stepping by d_reclen. */
			for (dp = (struct dirent *)dirbuf; dp < edp; ) {
#				if (BYTE_ORDER == LITTLE_ENDIAN)
					/*
					 * The expected low byte of
					 * dp->d_namlen is our dp->d_type.
					 * The high MBZ byte of dp->d_namlen
					 * is our dp->d_namlen.
					 */
					dp->d_type = dp->d_namlen;
					dp->d_namlen = 0;
#				else
					/*
					 * The dp->d_type is the high byte
					 * of the expected dp->d_namlen,
					 * so must be zero'ed.
					 */
					dp->d_type = 0;
#				endif
				if (dp->d_reclen > 0) {
					dp = (struct dirent *)
					    ((char *)dp + dp->d_reclen);
				} else {
					/* A zero record length would loop forever. */
					error = EIO;
					break;
				}
			}
			/* Copy out only when the whole buffer was converted. */
			if (dp >= edp)
				error = uiomove(dirbuf, readcnt, &auio);
		}
		free(dirbuf, M_TEMP);
	}
	if (error != 0) {
		VOP_UNLOCK(vp, 0);
		foffset_unlock(fp, foffset, 0);
		fdrop(fp, td);
		return (error);
	}
	/*
	 * Nothing was transferred from the root of a union mount: switch
	 * the file over to the covered vnode and read that one from offset
	 * 0 instead.
	 */
	if (uap->count == auio.uio_resid &&
	    (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;
		vp = vp->v_mount->mnt_vnodecovered;
		VREF(vp);
		fp->f_vnode = vp;
		fp->f_data = vp;
		foffset = 0;
		vput(tvp);
		goto unionread;
	}
	VOP_UNLOCK(vp, 0);
	foffset_unlock(fp, foffset, 0);
	fdrop(fp, td);
	td->td_retval[0] = uap->count - auio.uio_resid;
	if (error == 0)
		*ploff = loff;
	return (error);
}
3652#endif /* COMPAT_43 */
3653
3654/*
3655 * Read a block of directory entries in a filesystem independent format.
3656 */
3657#ifndef _SYS_SYSPROTO_H_
3658struct getdirentries_args {
3659	int	fd;
3660	char	*buf;
3661	u_int	count;
3662	long	*basep;
3663};
3664#endif
3665int
3666sys_getdirentries(struct thread *td, struct getdirentries_args *uap)
3667{
3668	long base;
3669	int error;
3670
3671	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
3672	    NULL, UIO_USERSPACE);
3673	if (error != 0)
3674		return (error);
3675	if (uap->basep != NULL)
3676		error = copyout(&base, uap->basep, sizeof(long));
3677	return (error);
3678}
3679
/*
 * Read up to count bytes of directory entries from the directory open on
 * fd into buf, which lives in the address space named by bufseg.
 *
 * On success *basep receives the file offset the read started from,
 * *residp (when not NULL) the number of bytes left unused, and
 * td->td_retval[0] the number of bytes transferred.  Returns EINVAL when
 * count exceeds IOSIZE_MAX or the vnode is not a directory, EBADF when
 * the file is not open for reading; otherwise the error from getvnode(),
 * the MAC readdir check (when compiled in) or VOP_READDIR().
 */
int
kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
    long *basep, ssize_t *residp, enum uio_seg bufseg)
{
	struct vnode *vp;
	struct file *fp;
	struct uio auio;
	struct iovec aiov;
	cap_rights_t rights;
	long loff;
	int error, eofflag;
	off_t foffset;

	AUDIT_ARG_FD(fd);
	if (count > IOSIZE_MAX)
		return (EINVAL);
	/*
	 * Set once, outside the retry loop: the union-mount retry below is
	 * taken only when uio_resid still equals count.
	 */
	auio.uio_resid = count;
	error = getvnode(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
	if (error != 0)
		return (error);
	if ((fp->f_flag & FREAD) == 0) {
		fdrop(fp, td);
		return (EBADF);
	}
	vp = fp->f_vnode;
	foffset = foffset_lock(fp, 0);
unionread:
	if (vp->v_type != VDIR) {
		error = EINVAL;
		goto fail;
	}
	/* Describe the caller's buffer as a single iovec. */
	aiov.iov_base = buf;
	aiov.iov_len = count;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = bufseg;
	auio.uio_td = td;
	vn_lock(vp, LK_SHARED | LK_RETRY);
	AUDIT_ARG_VNODE1(vp);
	loff = auio.uio_offset = foffset;
#ifdef MAC
	error = mac_vnode_check_readdir(td->td_ucred, vp);
	if (error == 0)
#endif
		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
		    NULL);
	foffset = auio.uio_offset;
	if (error != 0) {
		VOP_UNLOCK(vp, 0);
		goto fail;
	}
	/*
	 * Nothing was transferred from the root of a union mount: switch
	 * the file over to the covered vnode and read that one from offset
	 * 0 instead.
	 */
	if (count == auio.uio_resid &&
	    (vp->v_vflag & VV_ROOT) &&
	    (vp->v_mount->mnt_flag & MNT_UNION)) {
		struct vnode *tvp = vp;

		vp = vp->v_mount->mnt_vnodecovered;
		VREF(vp);
		fp->f_vnode = vp;
		fp->f_data = vp;
		foffset = 0;
		vput(tvp);
		goto unionread;
	}
	VOP_UNLOCK(vp, 0);
	*basep = loff;
	if (residp != NULL)
		*residp = auio.uio_resid;
	td->td_retval[0] = count - auio.uio_resid;
fail:
	/* Shared exit: release the offset lock and the file reference. */
	foffset_unlock(fp, foffset, 0);
	fdrop(fp, td);
	return (error);
}
3755
3756#ifndef _SYS_SYSPROTO_H_
3757struct getdents_args {
3758	int fd;
3759	char *buf;
3760	size_t count;
3761};
3762#endif
3763int
3764sys_getdents(struct thread *td, struct getdents_args *uap)
3765{
3766	struct getdirentries_args ap;
3767
3768	ap.fd = uap->fd;
3769	ap.buf = uap->buf;
3770	ap.count = uap->count;
3771	ap.basep = NULL;
3772	return (sys_getdirentries(td, &ap));
3773}
3774
3775/*
3776 * Set the mode mask for creation of filesystem nodes.
3777 */
3778#ifndef _SYS_SYSPROTO_H_
3779struct umask_args {
3780	int	newmask;
3781};
3782#endif
3783int
3784sys_umask(struct thread *td, struct umask_args *uap)
3785{
3786	struct filedesc *fdp;
3787
3788	fdp = td->td_proc->p_fd;
3789	FILEDESC_XLOCK(fdp);
3790	td->td_retval[0] = fdp->fd_cmask;
3791	fdp->fd_cmask = uap->newmask & ALLPERMS;
3792	FILEDESC_XUNLOCK(fdp);
3793	return (0);
3794}
3795
3796/*
3797 * Void all references to file by ripping underlying filesystem away from
3798 * vnode.
3799 */
3800#ifndef _SYS_SYSPROTO_H_
3801struct revoke_args {
3802	char	*path;
3803};
3804#endif
3805int
3806sys_revoke(struct thread *td, struct revoke_args *uap)
3807{
3808	struct vnode *vp;
3809	struct vattr vattr;
3810	struct nameidata nd;
3811	int error;
3812
3813	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
3814	    uap->path, td);
3815	if ((error = namei(&nd)) != 0)
3816		return (error);
3817	vp = nd.ni_vp;
3818	NDFREE(&nd, NDF_ONLY_PNBUF);
3819	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
3820		error = EINVAL;
3821		goto out;
3822	}
3823#ifdef MAC
3824	error = mac_vnode_check_revoke(td->td_ucred, vp);
3825	if (error != 0)
3826		goto out;
3827#endif
3828	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
3829	if (error != 0)
3830		goto out;
3831	if (td->td_ucred->cr_uid != vattr.va_uid) {
3832		error = priv_check(td, PRIV_VFS_ADMIN);
3833		if (error != 0)
3834			goto out;
3835	}
3836	if (vcount(vp) > 1)
3837		VOP_REVOKE(vp, REVOKEALL);
3838out:
3839	vput(vp);
3840	return (error);
3841}
3842
3843/*
3844 * Convert a user file descriptor to a kernel file entry and check that, if it
3845 * is a capability, the correct rights are present. A reference on the file
3846 * entry is held upon returning.
3847 */
3848int
3849getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
3850{
3851	struct file *fp;
3852	int error;
3853
3854	error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp, NULL);
3855	if (error != 0)
3856		return (error);
3857
3858	/*
3859	 * The file could be not of the vnode type, or it may be not
3860	 * yet fully initialized, in which case the f_vnode pointer
3861	 * may be set, but f_ops is still badfileops.  E.g.,
3862	 * devfs_open() transiently create such situation to
3863	 * facilitate csw d_fdopen().
3864	 *
3865	 * Dupfdopen() handling in kern_openat() installs the
3866	 * half-baked file into the process descriptor table, allowing
3867	 * other thread to dereference it. Guard against the race by
3868	 * checking f_ops.
3869	 */
3870	if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
3871		fdrop(fp, td);
3872		return (EINVAL);
3873	}
3874	*fpp = fp;
3875	return (0);
3876}
3877
3878
3879/*
3880 * Get an (NFS) file handle.
3881 */
3882#ifndef _SYS_SYSPROTO_H_
3883struct lgetfh_args {
3884	char	*fname;
3885	fhandle_t *fhp;
3886};
3887#endif
3888int
3889sys_lgetfh(struct thread *td, struct lgetfh_args *uap)
3890{
3891	struct nameidata nd;
3892	fhandle_t fh;
3893	struct vnode *vp;
3894	int error;
3895
3896	error = priv_check(td, PRIV_VFS_GETFH);
3897	if (error != 0)
3898		return (error);
3899	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
3900	    uap->fname, td);
3901	error = namei(&nd);
3902	if (error != 0)
3903		return (error);
3904	NDFREE(&nd, NDF_ONLY_PNBUF);
3905	vp = nd.ni_vp;
3906	bzero(&fh, sizeof(fh));
3907	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
3908	error = VOP_VPTOFH(vp, &fh.fh_fid);
3909	vput(vp);
3910	if (error == 0)
3911		error = copyout(&fh, uap->fhp, sizeof (fh));
3912	return (error);
3913}
3914
3915#ifndef _SYS_SYSPROTO_H_
3916struct getfh_args {
3917	char	*fname;
3918	fhandle_t *fhp;
3919};
3920#endif
3921int
3922sys_getfh(struct thread *td, struct getfh_args *uap)
3923{
3924	struct nameidata nd;
3925	fhandle_t fh;
3926	struct vnode *vp;
3927	int error;
3928
3929	error = priv_check(td, PRIV_VFS_GETFH);
3930	if (error != 0)
3931		return (error);
3932	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
3933	    uap->fname, td);
3934	error = namei(&nd);
3935	if (error != 0)
3936		return (error);
3937	NDFREE(&nd, NDF_ONLY_PNBUF);
3938	vp = nd.ni_vp;
3939	bzero(&fh, sizeof(fh));
3940	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
3941	error = VOP_VPTOFH(vp, &fh.fh_fid);
3942	vput(vp);
3943	if (error == 0)
3944		error = copyout(&fh, uap->fhp, sizeof (fh));
3945	return (error);
3946}
3947
3948/*
3949 * syscall for the rpc.lockd to use to translate a NFS file handle into an
3950 * open descriptor.
3951 *
3952 * warning: do not remove the priv_check() call or this becomes one giant
3953 * security hole.
3954 */
3955#ifndef _SYS_SYSPROTO_H_
3956struct fhopen_args {
3957	const struct fhandle *u_fhp;
3958	int flags;
3959};
3960#endif
3961int
3962sys_fhopen(struct thread *td, struct fhopen_args *uap)
3963{
3964	struct mount *mp;
3965	struct vnode *vp;
3966	struct fhandle fhp;
3967	struct file *fp;
3968	int fmode, error;
3969	int indx;
3970
3971	error = priv_check(td, PRIV_VFS_FHOPEN);
3972	if (error != 0)
3973		return (error);
3974	indx = -1;
3975	fmode = FFLAGS(uap->flags);
3976	/* why not allow a non-read/write open for our lockd? */
3977	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
3978		return (EINVAL);
3979	error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
3980	if (error != 0)
3981		return(error);
3982	/* find the mount point */
3983	mp = vfs_busyfs(&fhp.fh_fsid);
3984	if (mp == NULL)
3985		return (ESTALE);
3986	/* now give me my vnode, it gets returned to me locked */
3987	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
3988	vfs_unbusy(mp);
3989	if (error != 0)
3990		return (error);
3991
3992	error = falloc_noinstall(td, &fp);
3993	if (error != 0) {
3994		vput(vp);
3995		return (error);
3996	}
3997	/*
3998	 * An extra reference on `fp' has been held for us by
3999	 * falloc_noinstall().
4000	 */
4001
4002#ifdef INVARIANTS
4003	td->td_dupfd = -1;
4004#endif
4005	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
4006	if (error != 0) {
4007		KASSERT(fp->f_ops == &badfileops,
4008		    ("VOP_OPEN in fhopen() set f_ops"));
4009		KASSERT(td->td_dupfd < 0,
4010		    ("fhopen() encountered fdopen()"));
4011
4012		vput(vp);
4013		goto bad;
4014	}
4015#ifdef INVARIANTS
4016	td->td_dupfd = 0;
4017#endif
4018	fp->f_vnode = vp;
4019	fp->f_seqcount = 1;
4020	finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
4021	    &vnops);
4022	VOP_UNLOCK(vp, 0);
4023	if ((fmode & O_TRUNC) != 0) {
4024		error = fo_truncate(fp, 0, td->td_ucred, td);
4025		if (error != 0)
4026			goto bad;
4027	}
4028
4029	error = finstall(td, fp, &indx, fmode, NULL);
4030bad:
4031	fdrop(fp, td);
4032	td->td_retval[0] = indx;
4033	return (error);
4034}
4035
4036/*
4037 * Stat an (NFS) file handle.
4038 */
4039#ifndef _SYS_SYSPROTO_H_
4040struct fhstat_args {
4041	struct fhandle *u_fhp;
4042	struct stat *sb;
4043};
4044#endif
4045int
4046sys_fhstat(struct thread *td, struct fhstat_args *uap)
4047{
4048	struct stat sb;
4049	struct fhandle fh;
4050	int error;
4051
4052	error = copyin(uap->u_fhp, &fh, sizeof(fh));
4053	if (error != 0)
4054		return (error);
4055	error = kern_fhstat(td, fh, &sb);
4056	if (error == 0)
4057		error = copyout(&sb, uap->sb, sizeof(sb));
4058	return (error);
4059}
4060
4061int
4062kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
4063{
4064	struct mount *mp;
4065	struct vnode *vp;
4066	int error;
4067
4068	error = priv_check(td, PRIV_VFS_FHSTAT);
4069	if (error != 0)
4070		return (error);
4071	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4072		return (ESTALE);
4073	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4074	vfs_unbusy(mp);
4075	if (error != 0)
4076		return (error);
4077	error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
4078	vput(vp);
4079	return (error);
4080}
4081
4082/*
4083 * Implement fstatfs() for (NFS) file handles.
4084 */
4085#ifndef _SYS_SYSPROTO_H_
4086struct fhstatfs_args {
4087	struct fhandle *u_fhp;
4088	struct statfs *buf;
4089};
4090#endif
4091int
4092sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap)
4093{
4094	struct statfs *sfp;
4095	fhandle_t fh;
4096	int error;
4097
4098	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
4099	if (error != 0)
4100		return (error);
4101	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
4102	error = kern_fhstatfs(td, fh, sfp);
4103	if (error == 0)
4104		error = copyout(sfp, uap->buf, sizeof(*sfp));
4105	free(sfp, M_STATFS);
4106	return (error);
4107}
4108
4109int
4110kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
4111{
4112	struct statfs *sp;
4113	struct mount *mp;
4114	struct vnode *vp;
4115	int error;
4116
4117	error = priv_check(td, PRIV_VFS_FHSTATFS);
4118	if (error != 0)
4119		return (error);
4120	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
4121		return (ESTALE);
4122	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
4123	if (error != 0) {
4124		vfs_unbusy(mp);
4125		return (error);
4126	}
4127	vput(vp);
4128	error = prison_canseemount(td->td_ucred, mp);
4129	if (error != 0)
4130		goto out;
4131#ifdef MAC
4132	error = mac_mount_check_stat(td->td_ucred, mp);
4133	if (error != 0)
4134		goto out;
4135#endif
4136	/*
4137	 * Set these in case the underlying filesystem fails to do so.
4138	 */
4139	sp = &mp->mnt_stat;
4140	sp->f_version = STATFS_VERSION;
4141	sp->f_namemax = NAME_MAX;
4142	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
4143	error = VFS_STATFS(mp, sp);
4144	if (error == 0)
4145		*buf = *sp;
4146out:
4147	vfs_unbusy(mp);
4148	return (error);
4149}
4150
4151int
4152kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
4153{
4154	struct file *fp;
4155	struct mount *mp;
4156	struct vnode *vp;
4157	cap_rights_t rights;
4158	off_t olen, ooffset;
4159	int error;
4160
4161	if (offset < 0 || len <= 0)
4162		return (EINVAL);
4163	/* Check for wrap. */
4164	if (offset > OFF_MAX - len)
4165		return (EFBIG);
4166	error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
4167	if (error != 0)
4168		return (error);
4169	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4170		error = ESPIPE;
4171		goto out;
4172	}
4173	if ((fp->f_flag & FWRITE) == 0) {
4174		error = EBADF;
4175		goto out;
4176	}
4177	if (fp->f_type != DTYPE_VNODE) {
4178		error = ENODEV;
4179		goto out;
4180	}
4181	vp = fp->f_vnode;
4182	if (vp->v_type != VREG) {
4183		error = ENODEV;
4184		goto out;
4185	}
4186
4187	/* Allocating blocks may take a long time, so iterate. */
4188	for (;;) {
4189		olen = len;
4190		ooffset = offset;
4191
4192		bwillwrite();
4193		mp = NULL;
4194		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
4195		if (error != 0)
4196			break;
4197		error = vn_lock(vp, LK_EXCLUSIVE);
4198		if (error != 0) {
4199			vn_finished_write(mp);
4200			break;
4201		}
4202#ifdef MAC
4203		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
4204		if (error == 0)
4205#endif
4206			error = VOP_ALLOCATE(vp, &offset, &len);
4207		VOP_UNLOCK(vp, 0);
4208		vn_finished_write(mp);
4209
4210		if (olen + ooffset != offset + len) {
4211			panic("offset + len changed from %jx/%jx to %jx/%jx",
4212			    ooffset, olen, offset, len);
4213		}
4214		if (error != 0 || len == 0)
4215			break;
4216		KASSERT(olen > len, ("Iteration did not make progress?"));
4217		maybe_yield();
4218	}
4219 out:
4220	fdrop(fp, td);
4221	return (error);
4222}
4223
4224int
4225sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
4226{
4227	int error;
4228
4229	error = kern_posix_fallocate(td, uap->fd, uap->offset, uap->len);
4230	return (kern_posix_error(td, error));
4231}
4232
4233/*
4234 * Unlike madvise(2), we do not make a best effort to remember every
4235 * possible caching hint.  Instead, we remember the last setting with
4236 * the exception that we will allow POSIX_FADV_NORMAL to adjust the
4237 * region of any current setting.
4238 */
4239int
4240kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
4241    int advice)
4242{
4243	struct fadvise_info *fa, *new;
4244	struct file *fp;
4245	struct vnode *vp;
4246	cap_rights_t rights;
4247	off_t end;
4248	int error;
4249
4250	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
4251		return (EINVAL);
4252	switch (advice) {
4253	case POSIX_FADV_SEQUENTIAL:
4254	case POSIX_FADV_RANDOM:
4255	case POSIX_FADV_NOREUSE:
4256		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
4257		break;
4258	case POSIX_FADV_NORMAL:
4259	case POSIX_FADV_WILLNEED:
4260	case POSIX_FADV_DONTNEED:
4261		new = NULL;
4262		break;
4263	default:
4264		return (EINVAL);
4265	}
4266	/* XXX: CAP_POSIX_FADVISE? */
4267	error = fget(td, fd, cap_rights_init(&rights), &fp);
4268	if (error != 0)
4269		goto out;
4270	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
4271		error = ESPIPE;
4272		goto out;
4273	}
4274	if (fp->f_type != DTYPE_VNODE) {
4275		error = ENODEV;
4276		goto out;
4277	}
4278	vp = fp->f_vnode;
4279	if (vp->v_type != VREG) {
4280		error = ENODEV;
4281		goto out;
4282	}
4283	if (len == 0)
4284		end = OFF_MAX;
4285	else
4286		end = offset + len - 1;
4287	switch (advice) {
4288	case POSIX_FADV_SEQUENTIAL:
4289	case POSIX_FADV_RANDOM:
4290	case POSIX_FADV_NOREUSE:
4291		/*
4292		 * Try to merge any existing non-standard region with
4293		 * this new region if possible, otherwise create a new
4294		 * non-standard region for this request.
4295		 */
4296		mtx_pool_lock(mtxpool_sleep, fp);
4297		fa = fp->f_advice;
4298		if (fa != NULL && fa->fa_advice == advice &&
4299		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
4300		    (end != OFF_MAX && fa->fa_start == end + 1) ||
4301		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
4302			if (offset < fa->fa_start)
4303				fa->fa_start = offset;
4304			if (end > fa->fa_end)
4305				fa->fa_end = end;
4306		} else {
4307			new->fa_advice = advice;
4308			new->fa_start = offset;
4309			new->fa_end = end;
4310			fp->f_advice = new;
4311			new = fa;
4312		}
4313		mtx_pool_unlock(mtxpool_sleep, fp);
4314		break;
4315	case POSIX_FADV_NORMAL:
4316		/*
4317		 * If a the "normal" region overlaps with an existing
4318		 * non-standard region, trim or remove the
4319		 * non-standard region.
4320		 */
4321		mtx_pool_lock(mtxpool_sleep, fp);
4322		fa = fp->f_advice;
4323		if (fa != NULL) {
4324			if (offset <= fa->fa_start && end >= fa->fa_end) {
4325				new = fa;
4326				fp->f_advice = NULL;
4327			} else if (offset <= fa->fa_start &&
4328			    end >= fa->fa_start)
4329				fa->fa_start = end + 1;
4330			else if (offset <= fa->fa_end && end >= fa->fa_end)
4331				fa->fa_end = offset - 1;
4332			else if (offset >= fa->fa_start && end <= fa->fa_end) {
4333				/*
4334				 * If the "normal" region is a middle
4335				 * portion of the existing
4336				 * non-standard region, just remove
4337				 * the whole thing rather than picking
4338				 * one side or the other to
4339				 * preserve.
4340				 */
4341				new = fa;
4342				fp->f_advice = NULL;
4343			}
4344		}
4345		mtx_pool_unlock(mtxpool_sleep, fp);
4346		break;
4347	case POSIX_FADV_WILLNEED:
4348	case POSIX_FADV_DONTNEED:
4349		error = VOP_ADVISE(vp, offset, end, advice);
4350		break;
4351	}
4352out:
4353	if (fp != NULL)
4354		fdrop(fp, td);
4355	free(new, M_FADVISE);
4356	return (error);
4357}
4358
4359int
4360sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
4361{
4362	int error;
4363
4364	error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
4365	    uap->advice);
4366	return (kern_posix_error(td, error));
4367}
4368