kern_descrip.c revision 284665
1100616Smp/*-
259243Sobrien * Copyright (c) 1982, 1986, 1989, 1991, 1993
359243Sobrien *	The Regents of the University of California.  All rights reserved.
459243Sobrien * (c) UNIX System Laboratories, Inc.
559243Sobrien * All or some portions of this file are derived from material licensed
659243Sobrien * to the University of California by American Telephone and Telegraph
759243Sobrien * Co. or Unix System Laboratories, Inc. and are reproduced herein with
859243Sobrien * the permission of UNIX System Laboratories, Inc.
959243Sobrien *
1059243Sobrien * Redistribution and use in source and binary forms, with or without
1159243Sobrien * modification, are permitted provided that the following conditions
1259243Sobrien * are met:
1359243Sobrien * 1. Redistributions of source code must retain the above copyright
1459243Sobrien *    notice, this list of conditions and the following disclaimer.
1559243Sobrien * 2. Redistributions in binary form must reproduce the above copyright
1659243Sobrien *    notice, this list of conditions and the following disclaimer in the
17100616Smp *    documentation and/or other materials provided with the distribution.
1859243Sobrien * 4. Neither the name of the University nor the names of its contributors
1959243Sobrien *    may be used to endorse or promote products derived from this software
2059243Sobrien *    without specific prior written permission.
2159243Sobrien *
2259243Sobrien * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2359243Sobrien * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2459243Sobrien * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2559243Sobrien * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2659243Sobrien * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2759243Sobrien * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2859243Sobrien * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2959243Sobrien * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
3059243Sobrien * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3159243Sobrien * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3259243Sobrien * SUCH DAMAGE.
3359243Sobrien *
3459243Sobrien *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
35100616Smp */
3659243Sobrien
3759243Sobrien#include <sys/cdefs.h>
3859243Sobrien__FBSDID("$FreeBSD: stable/10/sys/kern/kern_descrip.c 284665 2015-06-21 06:28:26Z trasz $");
3959243Sobrien
4059243Sobrien#include "opt_capsicum.h"
4159243Sobrien#include "opt_compat.h"
4259243Sobrien#include "opt_ddb.h"
4359243Sobrien#include "opt_ktrace.h"
4459243Sobrien#include "opt_procdesc.h"
4559243Sobrien
4659243Sobrien#include <sys/param.h>
4759243Sobrien#include <sys/systm.h>
4859243Sobrien
4959243Sobrien#include <sys/capsicum.h>
5059243Sobrien#include <sys/conf.h>
5159243Sobrien#include <sys/domain.h>
5259243Sobrien#include <sys/fcntl.h>
5359243Sobrien#include <sys/file.h>
5459243Sobrien#include <sys/filedesc.h>
5559243Sobrien#include <sys/filio.h>
5659243Sobrien#include <sys/jail.h>
5759243Sobrien#include <sys/kernel.h>
5859243Sobrien#include <sys/ksem.h>
5959243Sobrien#include <sys/limits.h>
6059243Sobrien#include <sys/lock.h>
6159243Sobrien#include <sys/malloc.h>
6259243Sobrien#include <sys/mman.h>
6359243Sobrien#include <sys/mount.h>
6459243Sobrien#include <sys/mqueue.h>
6559243Sobrien#include <sys/mutex.h>
6659243Sobrien#include <sys/namei.h>
6759243Sobrien#include <sys/selinfo.h>
6859243Sobrien#include <sys/pipe.h>
6959243Sobrien#include <sys/priv.h>
7059243Sobrien#include <sys/proc.h>
7159243Sobrien#include <sys/procdesc.h>
7259243Sobrien#include <sys/protosw.h>
73100616Smp#include <sys/racct.h>
7459243Sobrien#include <sys/resourcevar.h>
7559243Sobrien#include <sys/sbuf.h>
7659243Sobrien#include <sys/signalvar.h>
7759243Sobrien#include <sys/socketvar.h>
7859243Sobrien#include <sys/stat.h>
7959243Sobrien#include <sys/sx.h>
8059243Sobrien#include <sys/syscallsubr.h>
8159243Sobrien#include <sys/sysctl.h>
8259243Sobrien#include <sys/sysproto.h>
8359243Sobrien#include <sys/tty.h>
8459243Sobrien#include <sys/unistd.h>
8559243Sobrien#include <sys/un.h>
8659243Sobrien#include <sys/unpcb.h>
8759243Sobrien#include <sys/user.h>
8859243Sobrien#include <sys/vnode.h>
8959243Sobrien#ifdef KTRACE
9059243Sobrien#include <sys/ktrace.h>
9159243Sobrien#endif
9259243Sobrien
9359243Sobrien#include <net/vnet.h>
9459243Sobrien
9559243Sobrien#include <netinet/in.h>
9659243Sobrien#include <netinet/in_pcb.h>
9759243Sobrien
9859243Sobrien#include <security/audit/audit.h>
9959243Sobrien
10059243Sobrien#include <vm/uma.h>
10159243Sobrien#include <vm/vm.h>
10259243Sobrien
10359243Sobrien#include <ddb/ddb.h>
10459243Sobrien
10559243Sobrienstatic MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
10659243Sobrienstatic MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
10759243Sobrien    "file desc to leader structures");
10859243Sobrienstatic MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
10959243SobrienMALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
11059243Sobrien
11159243SobrienMALLOC_DECLARE(M_FADVISE);
11259243Sobrien
11359243Sobrienstatic uma_zone_t file_zone;
11459243Sobrien
11559243Sobrienvoid	(*ksem_info)(struct ksem *ks, char *path, size_t size, uint32_t *value);
11659243Sobrien
/* Forward declarations of file-local helpers. */

/* Descriptor close path. */
static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
		    struct thread *td, int holdleaders);

/* Descriptor bitmap and table management. */
static int	fd_first_free(struct filedesc *fdp, int low, int size);
static int	fd_last_used(struct filedesc *fdp, int size);
static void	fdgrowtable(struct filedesc *fdp, int nfd);
static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);

/* Per-file-type fillers for struct kinfo_file. */
static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
static int	fill_procdesc_info(struct procdesc *pdp,
		    struct kinfo_file *kif);
static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
static int	fill_sem_info(struct file *fp, struct kinfo_file *kif);
static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);

/* Per-process descriptor limit. */
static int	getmaxfd(struct proc *p);
13459243Sobrien
/*
 * Each process has:
 *
 * - An array of open file descriptors (fd_ofiles)
 * - An array of file flags (fd_ofileflags)
 * - A bitmap recording which descriptors are in use (fd_map)
 *
 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
 * assumption that the majority of processes, especially short-lived
 * processes like shells, will never need more.
 *
 * If this initial allocation is exhausted, a larger descriptor table and
 * map are allocated dynamically, and the pointers in the process's struct
 * filedesc are updated to point to those.  This is repeated every time
 * the process runs out of file descriptors (provided it hasn't hit its
 * resource limit).
 *
 * Since threads may hold references to individual descriptor table
 * entries, the tables are never freed.  Instead, they are placed on a
 * linked list and freed only when the struct filedesc is released.
 */
/* Number of descriptor slots in the initial, embedded table. */
#define	NDFILE		20
/* Size in bytes of one word of the in-use bitmap. */
#define	NDSLOTSIZE	sizeof(NDSLOTTYPE)
/* Number of descriptors tracked by one bitmap word. */
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
/* Index of the bitmap word holding descriptor x. */
#define	NDSLOT(x)	((x) / NDENTRIES)
/* Mask selecting descriptor x within its bitmap word. */
#define	NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
/* Number of bitmap words needed to track x descriptors. */
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
16359243Sobrien
16459243Sobrien/*
16559243Sobrien * SLIST entry used to keep track of ofiles which must be reclaimed when
16659243Sobrien * the process exits.
16759243Sobrien */
16859243Sobrienstruct freetable {
16959243Sobrien	struct filedescent *ft_table;
17059243Sobrien	SLIST_ENTRY(freetable) ft_next;
17159243Sobrien};
17259243Sobrien
17359243Sobrien/*
17459243Sobrien * Initial allocation: a filedesc structure + the head of SLIST used to
17559243Sobrien * keep track of old ofiles + enough space for NDFILE descriptors.
17659243Sobrien */
17759243Sobrienstruct filedesc0 {
17859243Sobrien	struct filedesc fd_fd;
17959243Sobrien	SLIST_HEAD(, freetable) fd_free;
18059243Sobrien	struct	filedescent fd_dfiles[NDFILE];
18159243Sobrien	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
18259243Sobrien};
18359243Sobrien
18459243Sobrien/*
18559243Sobrien * Descriptor management.
18659243Sobrien */
18759243Sobrienvolatile int openfiles;			/* actual number of open files */
18859243Sobrienstruct mtx sigio_lock;		/* mtx to protect pointers to sigio */
18959243Sobrienvoid (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
19059243Sobrien
19159243Sobrien/* A mutex to protect the association between a proc and filedesc. */
19259243Sobrienstatic struct mtx fdesc_mtx;
19359243Sobrien
19459243Sobrien/*
19559243Sobrien * If low >= size, just return low. Otherwise find the first zero bit in the
19659243Sobrien * given bitmap, starting at low and not exceeding size - 1. Return size if
19759243Sobrien * not found.
19859243Sobrien */
19959243Sobrienstatic int
20059243Sobrienfd_first_free(struct filedesc *fdp, int low, int size)
20159243Sobrien{
20259243Sobrien	NDSLOTTYPE *map = fdp->fd_map;
20359243Sobrien	NDSLOTTYPE mask;
20459243Sobrien	int off, maxoff;
20559243Sobrien
20659243Sobrien	if (low >= size)
20759243Sobrien		return (low);
20859243Sobrien
20959243Sobrien	off = NDSLOT(low);
21059243Sobrien	if (low % NDENTRIES) {
21159243Sobrien		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
21259243Sobrien		if ((mask &= ~map[off]) != 0UL)
21359243Sobrien			return (off * NDENTRIES + ffsl(mask) - 1);
21459243Sobrien		++off;
21559243Sobrien	}
21659243Sobrien	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
21759243Sobrien		if (map[off] != ~0UL)
21859243Sobrien			return (off * NDENTRIES + ffsl(~map[off]) - 1);
21959243Sobrien	return (size);
22059243Sobrien}
22159243Sobrien
22259243Sobrien/*
22359243Sobrien * Find the highest non-zero bit in the given bitmap, starting at 0 and
22459243Sobrien * not exceeding size - 1. Return -1 if not found.
22559243Sobrien */
22659243Sobrienstatic int
22759243Sobrienfd_last_used(struct filedesc *fdp, int size)
22859243Sobrien{
22959243Sobrien	NDSLOTTYPE *map = fdp->fd_map;
23059243Sobrien	NDSLOTTYPE mask;
23159243Sobrien	int off, minoff;
23259243Sobrien
23359243Sobrien	off = NDSLOT(size);
23459243Sobrien	if (size % NDENTRIES) {
23559243Sobrien		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
23659243Sobrien		if ((mask &= map[off]) != 0)
23759243Sobrien			return (off * NDENTRIES + flsl(mask) - 1);
23859243Sobrien		--off;
23959243Sobrien	}
24059243Sobrien	for (minoff = NDSLOT(0); off >= minoff; --off)
24159243Sobrien		if (map[off] != 0)
24259243Sobrien			return (off * NDENTRIES + flsl(map[off]) - 1);
24359243Sobrien	return (-1);
24459243Sobrien}
24559243Sobrien
24659243Sobrienstatic int
24759243Sobrienfdisused(struct filedesc *fdp, int fd)
24859243Sobrien{
24959243Sobrien
25059243Sobrien	FILEDESC_LOCK_ASSERT(fdp);
25159243Sobrien
25259243Sobrien	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
25359243Sobrien	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
25459243Sobrien
25559243Sobrien	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
25659243Sobrien}
25759243Sobrien
25859243Sobrien/*
25959243Sobrien * Mark a file descriptor as used.
26059243Sobrien */
26159243Sobrienstatic void
26259243Sobrienfdused(struct filedesc *fdp, int fd)
26359243Sobrien{
26459243Sobrien
26559243Sobrien	FILEDESC_XLOCK_ASSERT(fdp);
26659243Sobrien
26759243Sobrien	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
26859243Sobrien
26959243Sobrien	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
27059243Sobrien	if (fd > fdp->fd_lastfile)
27159243Sobrien		fdp->fd_lastfile = fd;
27259243Sobrien	if (fd == fdp->fd_freefile)
27359243Sobrien		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
27459243Sobrien}
27559243Sobrien
27659243Sobrien/*
27759243Sobrien * Mark a file descriptor as unused.
27859243Sobrien */
27959243Sobrienstatic void
28059243Sobrienfdunused(struct filedesc *fdp, int fd)
28159243Sobrien{
28259243Sobrien
28359243Sobrien	FILEDESC_XLOCK_ASSERT(fdp);
28459243Sobrien
28559243Sobrien	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
28659243Sobrien	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
28759243Sobrien	    ("fd=%d is still in use", fd));
28859243Sobrien
28959243Sobrien	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
29059243Sobrien	if (fd < fdp->fd_freefile)
29159243Sobrien		fdp->fd_freefile = fd;
29259243Sobrien	if (fd == fdp->fd_lastfile)
29359243Sobrien		fdp->fd_lastfile = fd_last_used(fdp, fd);
29459243Sobrien}
29559243Sobrien
29659243Sobrien/*
29759243Sobrien * Free a file descriptor.
29859243Sobrien *
29959243Sobrien * Avoid some work if fdp is about to be destroyed.
30059243Sobrien */
30159243Sobrienstatic inline void
30259243Sobrien_fdfree(struct filedesc *fdp, int fd, int last)
30359243Sobrien{
30459243Sobrien	struct filedescent *fde;
30559243Sobrien
30659243Sobrien	fde = &fdp->fd_ofiles[fd];
30759243Sobrien#ifdef CAPABILITIES
30859243Sobrien	if (!last)
30959243Sobrien		seq_write_begin(&fde->fde_seq);
31059243Sobrien#endif
31159243Sobrien	filecaps_free(&fde->fde_caps);
31259243Sobrien	if (last)
31359243Sobrien		return;
31459243Sobrien	bzero(fde, fde_change_size);
31559243Sobrien	fdunused(fdp, fd);
31659243Sobrien#ifdef CAPABILITIES
31759243Sobrien	seq_write_end(&fde->fde_seq);
31859243Sobrien#endif
31959243Sobrien}
32059243Sobrien
32159243Sobrienstatic inline void
32259243Sobrienfdfree(struct filedesc *fdp, int fd)
32359243Sobrien{
32459243Sobrien
32559243Sobrien	_fdfree(fdp, fd, 0);
32659243Sobrien}
32759243Sobrien
32859243Sobrienstatic inline void
32959243Sobrienfdfree_last(struct filedesc *fdp, int fd)
33059243Sobrien{
33159243Sobrien
33259243Sobrien	_fdfree(fdp, fd, 1);
33359243Sobrien}
33459243Sobrien
33559243Sobrien/*
33659243Sobrien * System calls on descriptors.
33759243Sobrien */
33859243Sobrien#ifndef _SYS_SYSPROTO_H_
33959243Sobrienstruct getdtablesize_args {
34059243Sobrien	int	dummy;
34159243Sobrien};
34259243Sobrien#endif
34359243Sobrien/* ARGSUSED */
34459243Sobrienint
34559243Sobriensys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
34659243Sobrien{
34759243Sobrien	struct proc *p = td->td_proc;
34859243Sobrien	uint64_t lim;
34959243Sobrien
35059243Sobrien	PROC_LOCK(p);
35159243Sobrien	td->td_retval[0] =
35259243Sobrien	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
35359243Sobrien	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
35459243Sobrien	PROC_UNLOCK(p);
35559243Sobrien	if (lim < td->td_retval[0])
35659243Sobrien		td->td_retval[0] = lim;
35759243Sobrien	return (0);
35859243Sobrien}
35959243Sobrien
36059243Sobrien/*
36159243Sobrien * Duplicate a file descriptor to a particular value.
36259243Sobrien *
36359243Sobrien * Note: keep in mind that a potential race condition exists when closing
36459243Sobrien * descriptors from a shared descriptor table (via rfork).
36559243Sobrien */
36659243Sobrien#ifndef _SYS_SYSPROTO_H_
36759243Sobrienstruct dup2_args {
36859243Sobrien	u_int	from;
36959243Sobrien	u_int	to;
37059243Sobrien};
37159243Sobrien#endif
37259243Sobrien/* ARGSUSED */
37359243Sobrienint
37459243Sobriensys_dup2(struct thread *td, struct dup2_args *uap)
37559243Sobrien{
37659243Sobrien
37759243Sobrien	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
37859243Sobrien		    td->td_retval));
37959243Sobrien}
38059243Sobrien
38159243Sobrien/*
38259243Sobrien * Duplicate a file descriptor.
38359243Sobrien */
38459243Sobrien#ifndef _SYS_SYSPROTO_H_
38559243Sobrienstruct dup_args {
38659243Sobrien	u_int	fd;
38759243Sobrien};
38859243Sobrien#endif
38959243Sobrien/* ARGSUSED */
39059243Sobrienint
39159243Sobriensys_dup(struct thread *td, struct dup_args *uap)
39259243Sobrien{
39359243Sobrien
39459243Sobrien	return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
39559243Sobrien}
39659243Sobrien
/*
 * fcntl system call: unpack the arguments and hand them to
 * kern_fcntl_freebsd().
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{
	int error;

	error = kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg);
	return (error);
}
41459243Sobrien
41559243Sobrienint
41659243Sobrienkern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
41759243Sobrien{
41859243Sobrien	struct flock fl;
41959243Sobrien	struct __oflock ofl;
42059243Sobrien	intptr_t arg1;
42159243Sobrien	int error;
42259243Sobrien
42359243Sobrien	error = 0;
42459243Sobrien	switch (cmd) {
42559243Sobrien	case F_OGETLK:
42659243Sobrien	case F_OSETLK:
42759243Sobrien	case F_OSETLKW:
42859243Sobrien		/*
42959243Sobrien		 * Convert old flock structure to new.
43059243Sobrien		 */
43159243Sobrien		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
43259243Sobrien		fl.l_start = ofl.l_start;
43359243Sobrien		fl.l_len = ofl.l_len;
43459243Sobrien		fl.l_pid = ofl.l_pid;
43559243Sobrien		fl.l_type = ofl.l_type;
43659243Sobrien		fl.l_whence = ofl.l_whence;
43759243Sobrien		fl.l_sysid = 0;
43859243Sobrien
43959243Sobrien		switch (cmd) {
44059243Sobrien		case F_OGETLK:
44159243Sobrien		    cmd = F_GETLK;
44259243Sobrien		    break;
44359243Sobrien		case F_OSETLK:
44459243Sobrien		    cmd = F_SETLK;
44559243Sobrien		    break;
44659243Sobrien		case F_OSETLKW:
44759243Sobrien		    cmd = F_SETLKW;
44859243Sobrien		    break;
44959243Sobrien		}
45059243Sobrien		arg1 = (intptr_t)&fl;
45159243Sobrien		break;
45259243Sobrien        case F_GETLK:
45359243Sobrien        case F_SETLK:
45459243Sobrien        case F_SETLKW:
45559243Sobrien	case F_SETLK_REMOTE:
45659243Sobrien                error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
45759243Sobrien                arg1 = (intptr_t)&fl;
45859243Sobrien                break;
45959243Sobrien	default:
46059243Sobrien		arg1 = arg;
46159243Sobrien		break;
46259243Sobrien	}
46359243Sobrien	if (error)
46459243Sobrien		return (error);
46559243Sobrien	error = kern_fcntl(td, fd, cmd, arg1);
46659243Sobrien	if (error)
46759243Sobrien		return (error);
46859243Sobrien	if (cmd == F_OGETLK) {
46959243Sobrien		ofl.l_start = fl.l_start;
47059243Sobrien		ofl.l_len = fl.l_len;
47159243Sobrien		ofl.l_pid = fl.l_pid;
47259243Sobrien		ofl.l_type = fl.l_type;
47359243Sobrien		ofl.l_whence = fl.l_whence;
47459243Sobrien		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
47559243Sobrien	} else if (cmd == F_GETLK) {
47659243Sobrien		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
47759243Sobrien	}
47859243Sobrien	return (error);
47959243Sobrien}
48059243Sobrien
48159243Sobrienint
48259243Sobrienkern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
48359243Sobrien{
48459243Sobrien	struct filedesc *fdp;
48559243Sobrien	struct flock *flp;
48659243Sobrien	struct file *fp, *fp2;
48759243Sobrien	struct filedescent *fde;
48859243Sobrien	struct proc *p;
48959243Sobrien	struct vnode *vp;
49059243Sobrien	cap_rights_t rights;
49159243Sobrien	int error, flg, tmp;
49259243Sobrien	uint64_t bsize;
49359243Sobrien	off_t foffset;
49459243Sobrien
49559243Sobrien	error = 0;
49659243Sobrien	flg = F_POSIX;
49759243Sobrien	p = td->td_proc;
49859243Sobrien	fdp = p->p_fd;
49959243Sobrien
50059243Sobrien	switch (cmd) {
50159243Sobrien	case F_DUPFD:
50259243Sobrien		tmp = arg;
50359243Sobrien		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
50459243Sobrien		break;
50559243Sobrien
50659243Sobrien	case F_DUPFD_CLOEXEC:
50759243Sobrien		tmp = arg;
50859243Sobrien		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
50959243Sobrien		    td->td_retval);
51059243Sobrien		break;
51159243Sobrien
51259243Sobrien	case F_DUP2FD:
51359243Sobrien		tmp = arg;
51459243Sobrien		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
51559243Sobrien		break;
51659243Sobrien
51759243Sobrien	case F_DUP2FD_CLOEXEC:
51859243Sobrien		tmp = arg;
51959243Sobrien		error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp,
52059243Sobrien		    td->td_retval);
52159243Sobrien		break;
52259243Sobrien
52359243Sobrien	case F_GETFD:
52459243Sobrien		FILEDESC_SLOCK(fdp);
52559243Sobrien		if ((fp = fget_locked(fdp, fd)) == NULL) {
52659243Sobrien			FILEDESC_SUNLOCK(fdp);
52759243Sobrien			error = EBADF;
52859243Sobrien			break;
52959243Sobrien		}
53059243Sobrien		fde = &fdp->fd_ofiles[fd];
53159243Sobrien		td->td_retval[0] =
53259243Sobrien		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
53359243Sobrien		FILEDESC_SUNLOCK(fdp);
53459243Sobrien		break;
53559243Sobrien
53659243Sobrien	case F_SETFD:
53759243Sobrien		FILEDESC_XLOCK(fdp);
53859243Sobrien		if ((fp = fget_locked(fdp, fd)) == NULL) {
53959243Sobrien			FILEDESC_XUNLOCK(fdp);
54059243Sobrien			error = EBADF;
54159243Sobrien			break;
54259243Sobrien		}
54359243Sobrien		fde = &fdp->fd_ofiles[fd];
54459243Sobrien		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
54559243Sobrien		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
54659243Sobrien		FILEDESC_XUNLOCK(fdp);
54759243Sobrien		break;
54859243Sobrien
54959243Sobrien	case F_GETFL:
55059243Sobrien		error = fget_unlocked(fdp, fd,
55159243Sobrien		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
55259243Sobrien		if (error != 0)
55359243Sobrien			break;
55459243Sobrien		td->td_retval[0] = OFLAGS(fp->f_flag);
55559243Sobrien		fdrop(fp, td);
55659243Sobrien		break;
55759243Sobrien
55859243Sobrien	case F_SETFL:
55959243Sobrien		error = fget_unlocked(fdp, fd,
56059243Sobrien		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
56159243Sobrien		if (error != 0)
56259243Sobrien			break;
56359243Sobrien		do {
56459243Sobrien			tmp = flg = fp->f_flag;
56559243Sobrien			tmp &= ~FCNTLFLAGS;
56659243Sobrien			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
56759243Sobrien		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
56859243Sobrien		tmp = fp->f_flag & FNONBLOCK;
56959243Sobrien		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
57059243Sobrien		if (error != 0) {
57159243Sobrien			fdrop(fp, td);
57259243Sobrien			break;
57359243Sobrien		}
57459243Sobrien		tmp = fp->f_flag & FASYNC;
57559243Sobrien		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
57659243Sobrien		if (error == 0) {
57759243Sobrien			fdrop(fp, td);
57859243Sobrien			break;
57959243Sobrien		}
58059243Sobrien		atomic_clear_int(&fp->f_flag, FNONBLOCK);
58159243Sobrien		tmp = 0;
58259243Sobrien		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
58359243Sobrien		fdrop(fp, td);
58459243Sobrien		break;
58559243Sobrien
58659243Sobrien	case F_GETOWN:
58759243Sobrien		error = fget_unlocked(fdp, fd,
58859243Sobrien		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
58959243Sobrien		if (error != 0)
59059243Sobrien			break;
59159243Sobrien		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
59259243Sobrien		if (error == 0)
59359243Sobrien			td->td_retval[0] = tmp;
59459243Sobrien		fdrop(fp, td);
59559243Sobrien		break;
59659243Sobrien
59759243Sobrien	case F_SETOWN:
59859243Sobrien		error = fget_unlocked(fdp, fd,
59959243Sobrien		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
60059243Sobrien		if (error != 0)
60159243Sobrien			break;
60259243Sobrien		tmp = arg;
60359243Sobrien		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
60459243Sobrien		fdrop(fp, td);
60559243Sobrien		break;
60659243Sobrien
60759243Sobrien	case F_SETLK_REMOTE:
60859243Sobrien		error = priv_check(td, PRIV_NFS_LOCKD);
60959243Sobrien		if (error)
61059243Sobrien			return (error);
61159243Sobrien		flg = F_REMOTE;
61259243Sobrien		goto do_setlk;
61359243Sobrien
61459243Sobrien	case F_SETLKW:
61559243Sobrien		flg |= F_WAIT;
61659243Sobrien		/* FALLTHROUGH F_SETLK */
61759243Sobrien
61859243Sobrien	case F_SETLK:
61959243Sobrien	do_setlk:
62059243Sobrien		cap_rights_init(&rights, CAP_FLOCK);
62159243Sobrien		error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
62259243Sobrien		if (error != 0)
62359243Sobrien			break;
624		if (fp->f_type != DTYPE_VNODE) {
625			error = EBADF;
626			fdrop(fp, td);
627			break;
628		}
629
630		flp = (struct flock *)arg;
631		if (flp->l_whence == SEEK_CUR) {
632			foffset = foffset_get(fp);
633			if (foffset < 0 ||
634			    (flp->l_start > 0 &&
635			     foffset > OFF_MAX - flp->l_start)) {
636				FILEDESC_SUNLOCK(fdp);
637				error = EOVERFLOW;
638				fdrop(fp, td);
639				break;
640			}
641			flp->l_start += foffset;
642		}
643
644		vp = fp->f_vnode;
645		switch (flp->l_type) {
646		case F_RDLCK:
647			if ((fp->f_flag & FREAD) == 0) {
648				error = EBADF;
649				break;
650			}
651			PROC_LOCK(p->p_leader);
652			p->p_leader->p_flag |= P_ADVLOCK;
653			PROC_UNLOCK(p->p_leader);
654			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
655			    flp, flg);
656			break;
657		case F_WRLCK:
658			if ((fp->f_flag & FWRITE) == 0) {
659				error = EBADF;
660				break;
661			}
662			PROC_LOCK(p->p_leader);
663			p->p_leader->p_flag |= P_ADVLOCK;
664			PROC_UNLOCK(p->p_leader);
665			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
666			    flp, flg);
667			break;
668		case F_UNLCK:
669			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
670			    flp, flg);
671			break;
672		case F_UNLCKSYS:
673			/*
674			 * Temporary api for testing remote lock
675			 * infrastructure.
676			 */
677			if (flg != F_REMOTE) {
678				error = EINVAL;
679				break;
680			}
681			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
682			    F_UNLCKSYS, flp, flg);
683			break;
684		default:
685			error = EINVAL;
686			break;
687		}
688		if (error != 0 || flp->l_type == F_UNLCK ||
689		    flp->l_type == F_UNLCKSYS) {
690			fdrop(fp, td);
691			break;
692		}
693
694		/*
695		 * Check for a race with close.
696		 *
697		 * The vnode is now advisory locked (or unlocked, but this case
698		 * is not really important) as the caller requested.
699		 * We had to drop the filedesc lock, so we need to recheck if
700		 * the descriptor is still valid, because if it was closed
701		 * in the meantime we need to remove advisory lock from the
702		 * vnode - close on any descriptor leading to an advisory
703		 * locked vnode, removes that lock.
704		 * We will return 0 on purpose in that case, as the result of
705		 * successful advisory lock might have been externally visible
706		 * already. This is fine - effectively we pretend to the caller
707		 * that the closing thread was a bit slower and that the
708		 * advisory lock succeeded before the close.
709		 */
710		error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
711		if (error != 0) {
712			fdrop(fp, td);
713			break;
714		}
715		if (fp != fp2) {
716			flp->l_whence = SEEK_SET;
717			flp->l_start = 0;
718			flp->l_len = 0;
719			flp->l_type = F_UNLCK;
720			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
721			    F_UNLCK, flp, F_POSIX);
722		}
723		fdrop(fp, td);
724		fdrop(fp2, td);
725		break;
726
727	case F_GETLK:
728		error = fget_unlocked(fdp, fd,
729		    cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
730		if (error != 0)
731			break;
732		if (fp->f_type != DTYPE_VNODE) {
733			error = EBADF;
734			fdrop(fp, td);
735			break;
736		}
737		flp = (struct flock *)arg;
738		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
739		    flp->l_type != F_UNLCK) {
740			error = EINVAL;
741			fdrop(fp, td);
742			break;
743		}
744		if (flp->l_whence == SEEK_CUR) {
745			foffset = foffset_get(fp);
746			if ((flp->l_start > 0 &&
747			    foffset > OFF_MAX - flp->l_start) ||
748			    (flp->l_start < 0 &&
749			     foffset < OFF_MIN - flp->l_start)) {
750				FILEDESC_SUNLOCK(fdp);
751				error = EOVERFLOW;
752				fdrop(fp, td);
753				break;
754			}
755			flp->l_start += foffset;
756		}
757		vp = fp->f_vnode;
758		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
759		    F_POSIX);
760		fdrop(fp, td);
761		break;
762
763	case F_RDAHEAD:
764		arg = arg ? 128 * 1024: 0;
765		/* FALLTHROUGH */
766	case F_READAHEAD:
767		error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
768		if (error != 0)
769			break;
770		if (fp->f_type != DTYPE_VNODE) {
771			fdrop(fp, td);
772			error = EBADF;
773			break;
774		}
775		vp = fp->f_vnode;
776		/*
777		 * Exclusive lock synchronizes against f_seqcount reads and
778		 * writes in sequential_heuristic().
779		 */
780		error = vn_lock(vp, LK_EXCLUSIVE);
781		if (error != 0) {
782			fdrop(fp, td);
783			break;
784		}
785		if (arg >= 0) {
786			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
787			fp->f_seqcount = (arg + bsize - 1) / bsize;
788			atomic_set_int(&fp->f_flag, FRDAHEAD);
789		} else {
790			atomic_clear_int(&fp->f_flag, FRDAHEAD);
791		}
792		VOP_UNLOCK(vp, 0);
793		fdrop(fp, td);
794		break;
795
796	default:
797		error = EINVAL;
798		break;
799	}
800	return (error);
801}
802
803static int
804getmaxfd(struct proc *p)
805{
806	int maxfd;
807
808	PROC_LOCK(p);
809	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
810	PROC_UNLOCK(p);
811
812	return (maxfd);
813}
814
815/*
816 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
817 */
818int
819do_dup(struct thread *td, int flags, int old, int new,
820    register_t *retval)
821{
822	struct filedesc *fdp;
823	struct filedescent *oldfde, *newfde;
824	struct proc *p;
825	struct file *fp;
826	struct file *delfp;
827	int error, maxfd;
828
829	p = td->td_proc;
830	fdp = p->p_fd;
831
832	/*
833	 * Verify we have a valid descriptor to dup from and possibly to
834	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
835	 * return EINVAL when the new descriptor is out of bounds.
836	 */
837	if (old < 0)
838		return (EBADF);
839	if (new < 0)
840		return (flags & DUP_FCNTL ? EINVAL : EBADF);
841	maxfd = getmaxfd(p);
842	if (new >= maxfd)
843		return (flags & DUP_FCNTL ? EINVAL : EBADF);
844
845	FILEDESC_XLOCK(fdp);
846	if (fget_locked(fdp, old) == NULL) {
847		FILEDESC_XUNLOCK(fdp);
848		return (EBADF);
849	}
850	oldfde = &fdp->fd_ofiles[old];
851	if (flags & DUP_FIXED && old == new) {
852		*retval = new;
853		if (flags & DUP_CLOEXEC)
854			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
855		FILEDESC_XUNLOCK(fdp);
856		return (0);
857	}
858	fp = oldfde->fde_file;
859	fhold(fp);
860
861	/*
862	 * If the caller specified a file descriptor, make sure the file
863	 * table is large enough to hold it, and grab it.  Otherwise, just
864	 * allocate a new descriptor the usual way.
865	 */
866	if (flags & DUP_FIXED) {
867		if (new >= fdp->fd_nfiles) {
868			/*
869			 * The resource limits are here instead of e.g.
870			 * fdalloc(), because the file descriptor table may be
871			 * shared between processes, so we can't really use
872			 * racct_add()/racct_sub().  Instead of counting the
873			 * number of actually allocated descriptors, just put
874			 * the limit on the size of the file descriptor table.
875			 */
876#ifdef RACCT
877			if (racct_enable) {
878				PROC_LOCK(p);
879				error = racct_set(p, RACCT_NOFILE, new + 1);
880				PROC_UNLOCK(p);
881				if (error != 0) {
882					FILEDESC_XUNLOCK(fdp);
883					fdrop(fp, td);
884					return (EMFILE);
885				}
886			}
887#endif
888			fdgrowtable_exp(fdp, new + 1);
889			oldfde = &fdp->fd_ofiles[old];
890		}
891		newfde = &fdp->fd_ofiles[new];
892		if (newfde->fde_file == NULL)
893			fdused(fdp, new);
894	} else {
895		if ((error = fdalloc(td, new, &new)) != 0) {
896			FILEDESC_XUNLOCK(fdp);
897			fdrop(fp, td);
898			return (error);
899		}
900		newfde = &fdp->fd_ofiles[new];
901	}
902
903	KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
904	KASSERT(old != new, ("new fd is same as old"));
905
906	delfp = newfde->fde_file;
907
908	/*
909	 * Duplicate the source descriptor.
910	 */
911#ifdef CAPABILITIES
912	seq_write_begin(&newfde->fde_seq);
913#endif
914	filecaps_free(&newfde->fde_caps);
915	memcpy(newfde, oldfde, fde_change_size);
916	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
917	if ((flags & DUP_CLOEXEC) != 0)
918		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
919	else
920		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
921#ifdef CAPABILITIES
922	seq_write_end(&newfde->fde_seq);
923#endif
924	*retval = new;
925
926	if (delfp != NULL) {
927		(void) closefp(fdp, new, delfp, td, 1);
928		/* closefp() drops the FILEDESC lock for us. */
929	} else {
930		FILEDESC_XUNLOCK(fdp);
931	}
932
933	return (0);
934}
935
936/*
937 * If sigio is on the list associated with a process or process group,
938 * disable signalling from the device, remove sigio from the list and
939 * free sigio.
940 */
941void
942funsetown(struct sigio **sigiop)
943{
944	struct sigio *sigio;
945
946	SIGIO_LOCK();
947	sigio = *sigiop;
948	if (sigio == NULL) {
949		SIGIO_UNLOCK();
950		return;
951	}
952	*(sigio->sio_myref) = NULL;
953	if ((sigio)->sio_pgid < 0) {
954		struct pgrp *pg = (sigio)->sio_pgrp;
955		PGRP_LOCK(pg);
956		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
957			     sigio, sio_pgsigio);
958		PGRP_UNLOCK(pg);
959	} else {
960		struct proc *p = (sigio)->sio_proc;
961		PROC_LOCK(p);
962		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
963			     sigio, sio_pgsigio);
964		PROC_UNLOCK(p);
965	}
966	SIGIO_UNLOCK();
967	crfree(sigio->sio_ucred);
968	free(sigio, M_SIGIO);
969}
970
971/*
972 * Free a list of sigio structures.
973 * We only need to lock the SIGIO_LOCK because we have made ourselves
974 * inaccessible to callers of fsetown and therefore do not need to lock
975 * the proc or pgrp struct for the list manipulation.
976 */
977void
978funsetownlst(struct sigiolst *sigiolst)
979{
980	struct proc *p;
981	struct pgrp *pg;
982	struct sigio *sigio;
983
984	sigio = SLIST_FIRST(sigiolst);
985	if (sigio == NULL)
986		return;
987	p = NULL;
988	pg = NULL;
989
990	/*
991	 * Every entry of the list should belong
992	 * to a single proc or pgrp.
993	 */
994	if (sigio->sio_pgid < 0) {
995		pg = sigio->sio_pgrp;
996		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
997	} else /* if (sigio->sio_pgid > 0) */ {
998		p = sigio->sio_proc;
999		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1000	}
1001
1002	SIGIO_LOCK();
1003	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
1004		*(sigio->sio_myref) = NULL;
1005		if (pg != NULL) {
1006			KASSERT(sigio->sio_pgid < 0,
1007			    ("Proc sigio in pgrp sigio list"));
1008			KASSERT(sigio->sio_pgrp == pg,
1009			    ("Bogus pgrp in sigio list"));
1010			PGRP_LOCK(pg);
1011			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
1012			    sio_pgsigio);
1013			PGRP_UNLOCK(pg);
1014		} else /* if (p != NULL) */ {
1015			KASSERT(sigio->sio_pgid > 0,
1016			    ("Pgrp sigio in proc sigio list"));
1017			KASSERT(sigio->sio_proc == p,
1018			    ("Bogus proc in sigio list"));
1019			PROC_LOCK(p);
1020			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
1021			    sio_pgsigio);
1022			PROC_UNLOCK(p);
1023		}
1024		SIGIO_UNLOCK();
1025		crfree(sigio->sio_ucred);
1026		free(sigio, M_SIGIO);
1027		SIGIO_LOCK();
1028	}
1029	SIGIO_UNLOCK();
1030}
1031
1032/*
1033 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
1034 *
1035 * After permission checking, add a sigio structure to the sigio list for
1036 * the process or process group.
1037 */
1038int
1039fsetown(pid_t pgid, struct sigio **sigiop)
1040{
1041	struct proc *proc;
1042	struct pgrp *pgrp;
1043	struct sigio *sigio;
1044	int ret;
1045
1046	if (pgid == 0) {
1047		funsetown(sigiop);
1048		return (0);
1049	}
1050
1051	ret = 0;
1052
1053	/* Allocate and fill in the new sigio out of locks. */
1054	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
1055	sigio->sio_pgid = pgid;
1056	sigio->sio_ucred = crhold(curthread->td_ucred);
1057	sigio->sio_myref = sigiop;
1058
1059	sx_slock(&proctree_lock);
1060	if (pgid > 0) {
1061		proc = pfind(pgid);
1062		if (proc == NULL) {
1063			ret = ESRCH;
1064			goto fail;
1065		}
1066
1067		/*
1068		 * Policy - Don't allow a process to FSETOWN a process
1069		 * in another session.
1070		 *
1071		 * Remove this test to allow maximum flexibility or
1072		 * restrict FSETOWN to the current process or process
1073		 * group for maximum safety.
1074		 */
1075		PROC_UNLOCK(proc);
1076		if (proc->p_session != curthread->td_proc->p_session) {
1077			ret = EPERM;
1078			goto fail;
1079		}
1080
1081		pgrp = NULL;
1082	} else /* if (pgid < 0) */ {
1083		pgrp = pgfind(-pgid);
1084		if (pgrp == NULL) {
1085			ret = ESRCH;
1086			goto fail;
1087		}
1088		PGRP_UNLOCK(pgrp);
1089
1090		/*
1091		 * Policy - Don't allow a process to FSETOWN a process
1092		 * in another session.
1093		 *
1094		 * Remove this test to allow maximum flexibility or
1095		 * restrict FSETOWN to the current process or process
1096		 * group for maximum safety.
1097		 */
1098		if (pgrp->pg_session != curthread->td_proc->p_session) {
1099			ret = EPERM;
1100			goto fail;
1101		}
1102
1103		proc = NULL;
1104	}
1105	funsetown(sigiop);
1106	if (pgid > 0) {
1107		PROC_LOCK(proc);
1108		/*
1109		 * Since funsetownlst() is called without the proctree
1110		 * locked, we need to check for P_WEXIT.
1111		 * XXX: is ESRCH correct?
1112		 */
1113		if ((proc->p_flag & P_WEXIT) != 0) {
1114			PROC_UNLOCK(proc);
1115			ret = ESRCH;
1116			goto fail;
1117		}
1118		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
1119		sigio->sio_proc = proc;
1120		PROC_UNLOCK(proc);
1121	} else {
1122		PGRP_LOCK(pgrp);
1123		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
1124		sigio->sio_pgrp = pgrp;
1125		PGRP_UNLOCK(pgrp);
1126	}
1127	sx_sunlock(&proctree_lock);
1128	SIGIO_LOCK();
1129	*sigiop = sigio;
1130	SIGIO_UNLOCK();
1131	return (0);
1132
1133fail:
1134	sx_sunlock(&proctree_lock);
1135	crfree(sigio->sio_ucred);
1136	free(sigio, M_SIGIO);
1137	return (ret);
1138}
1139
1140/*
1141 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1142 */
1143pid_t
1144fgetown(sigiop)
1145	struct sigio **sigiop;
1146{
1147	pid_t pgid;
1148
1149	SIGIO_LOCK();
1150	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1151	SIGIO_UNLOCK();
1152	return (pgid);
1153}
1154
1155/*
1156 * Function drops the filedesc lock on return.
1157 */
1158static int
1159closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
1160    int holdleaders)
1161{
1162	int error;
1163
1164	FILEDESC_XLOCK_ASSERT(fdp);
1165
1166	if (holdleaders) {
1167		if (td->td_proc->p_fdtol != NULL) {
1168			/*
1169			 * Ask fdfree() to sleep to ensure that all relevant
1170			 * process leaders can be traversed in closef().
1171			 */
1172			fdp->fd_holdleaderscount++;
1173		} else {
1174			holdleaders = 0;
1175		}
1176	}
1177
1178	/*
1179	 * We now hold the fp reference that used to be owned by the
1180	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
1181	 * knote_fdclose to prevent a race of the fd getting opened, a knote
1182	 * added, and deleteing a knote for the new fd.
1183	 */
1184	knote_fdclose(td, fd);
1185
1186	/*
1187	 * We need to notify mqueue if the object is of type mqueue.
1188	 */
1189	if (fp->f_type == DTYPE_MQUEUE)
1190		mq_fdclose(td, fd, fp);
1191	FILEDESC_XUNLOCK(fdp);
1192
1193	error = closef(fp, td);
1194	if (holdleaders) {
1195		FILEDESC_XLOCK(fdp);
1196		fdp->fd_holdleaderscount--;
1197		if (fdp->fd_holdleaderscount == 0 &&
1198		    fdp->fd_holdleaderswakeup != 0) {
1199			fdp->fd_holdleaderswakeup = 0;
1200			wakeup(&fdp->fd_holdleaderscount);
1201		}
1202		FILEDESC_XUNLOCK(fdp);
1203	}
1204	return (error);
1205}
1206
/*
 * Close a file descriptor.
 *
 * System call entry point: hands uap->fd to kern_close() and returns
 * its result.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int     fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}
1224
1225int
1226kern_close(td, fd)
1227	struct thread *td;
1228	int fd;
1229{
1230	struct filedesc *fdp;
1231	struct file *fp;
1232
1233	fdp = td->td_proc->p_fd;
1234
1235	AUDIT_SYSCLOSE(td, fd);
1236
1237	FILEDESC_XLOCK(fdp);
1238	if ((fp = fget_locked(fdp, fd)) == NULL) {
1239		FILEDESC_XUNLOCK(fdp);
1240		return (EBADF);
1241	}
1242	fdfree(fdp, fd);
1243
1244	/* closefp() drops the FILEDESC lock for us. */
1245	return (closefp(fdp, fd, fp, td, 1));
1246}
1247
1248/*
1249 * Close open file descriptors.
1250 */
1251#ifndef _SYS_SYSPROTO_H_
1252struct closefrom_args {
1253	int	lowfd;
1254};
1255#endif
1256/* ARGSUSED */
1257int
1258sys_closefrom(struct thread *td, struct closefrom_args *uap)
1259{
1260	struct filedesc *fdp;
1261	int fd;
1262
1263	fdp = td->td_proc->p_fd;
1264	AUDIT_ARG_FD(uap->lowfd);
1265
1266	/*
1267	 * Treat negative starting file descriptor values identical to
1268	 * closefrom(0) which closes all files.
1269	 */
1270	if (uap->lowfd < 0)
1271		uap->lowfd = 0;
1272	FILEDESC_SLOCK(fdp);
1273	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
1274		if (fdp->fd_ofiles[fd].fde_file != NULL) {
1275			FILEDESC_SUNLOCK(fdp);
1276			(void)kern_close(td, fd);
1277			FILEDESC_SLOCK(fdp);
1278		}
1279	}
1280	FILEDESC_SUNLOCK(fdp);
1281	return (0);
1282}
1283
#if defined(COMPAT_43)
/*
 * Return status information about a file descriptor.
 *
 * COMPAT_43 variant: the result of kern_fstat() is converted with
 * cvtstat() to a struct ostat before being copied out to uap->sb.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct stat sb;
	struct ostat osb;
	int error;

	error = kern_fstat(td, uap->fd, &sb);
	if (error != 0)
		return (error);
	cvtstat(&sb, &osb);
	return (copyout(&osb, uap->sb, sizeof(osb)));
}
#endif /* COMPAT_43 */
1310
1311/*
1312 * Return status information about a file descriptor.
1313 */
1314#ifndef _SYS_SYSPROTO_H_
1315struct fstat_args {
1316	int	fd;
1317	struct	stat *sb;
1318};
1319#endif
1320/* ARGSUSED */
1321int
1322sys_fstat(struct thread *td, struct fstat_args *uap)
1323{
1324	struct stat ub;
1325	int error;
1326
1327	error = kern_fstat(td, uap->fd, &ub);
1328	if (error == 0)
1329		error = copyout(&ub, uap->sb, sizeof(ub));
1330	return (error);
1331}
1332
1333int
1334kern_fstat(struct thread *td, int fd, struct stat *sbp)
1335{
1336	struct file *fp;
1337	cap_rights_t rights;
1338	int error;
1339
1340	AUDIT_ARG_FD(fd);
1341
1342	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
1343	if (error != 0)
1344		return (error);
1345
1346	AUDIT_ARG_FILE(td->td_proc, fp);
1347
1348	error = fo_stat(fp, sbp, td->td_ucred, td);
1349	fdrop(fp, td);
1350#ifdef KTRACE
1351	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
1352		ktrstat(sbp);
1353#endif
1354	return (error);
1355}
1356
1357/*
1358 * Return status information about a file descriptor.
1359 */
1360#ifndef _SYS_SYSPROTO_H_
1361struct nfstat_args {
1362	int	fd;
1363	struct	nstat *sb;
1364};
1365#endif
1366/* ARGSUSED */
1367int
1368sys_nfstat(struct thread *td, struct nfstat_args *uap)
1369{
1370	struct nstat nub;
1371	struct stat ub;
1372	int error;
1373
1374	error = kern_fstat(td, uap->fd, &ub);
1375	if (error == 0) {
1376		cvtnstat(&ub, &nub);
1377		error = copyout(&nub, uap->sb, sizeof(nub));
1378	}
1379	return (error);
1380}
1381
1382/*
1383 * Return pathconf information about a file descriptor.
1384 */
1385#ifndef _SYS_SYSPROTO_H_
1386struct fpathconf_args {
1387	int	fd;
1388	int	name;
1389};
1390#endif
1391/* ARGSUSED */
1392int
1393sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1394{
1395	struct file *fp;
1396	struct vnode *vp;
1397	cap_rights_t rights;
1398	int error;
1399
1400	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
1401	if (error != 0)
1402		return (error);
1403
1404	/* If asynchronous I/O is available, it works for all descriptors. */
1405	if (uap->name == _PC_ASYNC_IO) {
1406		td->td_retval[0] = async_io_version;
1407		goto out;
1408	}
1409	vp = fp->f_vnode;
1410	if (vp != NULL) {
1411		vn_lock(vp, LK_SHARED | LK_RETRY);
1412		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1413		VOP_UNLOCK(vp, 0);
1414	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1415		if (uap->name != _PC_PIPE_BUF) {
1416			error = EINVAL;
1417		} else {
1418			td->td_retval[0] = PIPE_BUF;
1419			error = 0;
1420		}
1421	} else {
1422		error = EOPNOTSUPP;
1423	}
1424out:
1425	fdrop(fp, td);
1426	return (error);
1427}
1428
1429/*
1430 * Initialize filecaps structure.
1431 */
1432void
1433filecaps_init(struct filecaps *fcaps)
1434{
1435
1436	bzero(fcaps, sizeof(*fcaps));
1437	fcaps->fc_nioctls = -1;
1438}
1439
1440/*
1441 * Copy filecaps structure allocating memory for ioctls array if needed.
1442 */
1443void
1444filecaps_copy(const struct filecaps *src, struct filecaps *dst)
1445{
1446	size_t size;
1447
1448	*dst = *src;
1449	if (src->fc_ioctls != NULL) {
1450		KASSERT(src->fc_nioctls > 0,
1451		    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1452
1453		size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1454		dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1455		bcopy(src->fc_ioctls, dst->fc_ioctls, size);
1456	}
1457}
1458
1459/*
1460 * Move filecaps structure to the new place and clear the old place.
1461 */
1462void
1463filecaps_move(struct filecaps *src, struct filecaps *dst)
1464{
1465
1466	*dst = *src;
1467	bzero(src, sizeof(*src));
1468}
1469
1470/*
1471 * Fill the given filecaps structure with full rights.
1472 */
1473static void
1474filecaps_fill(struct filecaps *fcaps)
1475{
1476
1477	CAP_ALL(&fcaps->fc_rights);
1478	fcaps->fc_ioctls = NULL;
1479	fcaps->fc_nioctls = -1;
1480	fcaps->fc_fcntls = CAP_FCNTL_ALL;
1481}
1482
1483/*
1484 * Free memory allocated within filecaps structure.
1485 */
1486void
1487filecaps_free(struct filecaps *fcaps)
1488{
1489
1490	free(fcaps->fc_ioctls, M_FILECAPS);
1491	bzero(fcaps, sizeof(*fcaps));
1492}
1493
1494/*
1495 * Validate the given filecaps structure.
1496 */
1497static void
1498filecaps_validate(const struct filecaps *fcaps, const char *func)
1499{
1500
1501	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
1502	    ("%s: invalid rights", func));
1503	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
1504	    ("%s: invalid fcntls", func));
1505	KASSERT(fcaps->fc_fcntls == 0 ||
1506	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
1507	    ("%s: fcntls without CAP_FCNTL", func));
1508	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
1509	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
1510	    ("%s: invalid ioctls", func));
1511	KASSERT(fcaps->fc_nioctls == 0 ||
1512	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
1513	    ("%s: ioctls without CAP_IOCTL", func));
1514}
1515
1516static void
1517fdgrowtable_exp(struct filedesc *fdp, int nfd)
1518{
1519	int nfd1;
1520
1521	FILEDESC_XLOCK_ASSERT(fdp);
1522
1523	nfd1 = fdp->fd_nfiles * 2;
1524	if (nfd1 < nfd)
1525		nfd1 = nfd;
1526	fdgrowtable(fdp, nfd1);
1527}
1528
1529/*
1530 * Grow the file table to accomodate (at least) nfd descriptors.
1531 */
1532static void
1533fdgrowtable(struct filedesc *fdp, int nfd)
1534{
1535	struct filedesc0 *fdp0;
1536	struct freetable *ft;
1537	struct filedescent *ntable;
1538	struct filedescent *otable;
1539	int nnfiles, onfiles;
1540	NDSLOTTYPE *nmap, *omap;
1541
1542	FILEDESC_XLOCK_ASSERT(fdp);
1543
1544	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
1545
1546	/* save old values */
1547	onfiles = fdp->fd_nfiles;
1548	otable = fdp->fd_ofiles;
1549	omap = fdp->fd_map;
1550
1551	/* compute the size of the new table */
1552	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
1553	if (nnfiles <= onfiles)
1554		/* the table is already large enough */
1555		return;
1556
1557	/*
1558	 * Allocate a new table.  We need enough space for the
1559	 * file entries themselves and the struct freetable we will use
1560	 * when we decommission the table and place it on the freelist.
1561	 * We place the struct freetable in the middle so we don't have
1562	 * to worry about padding.
1563	 */
1564	ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable),
1565	    M_FILEDESC, M_ZERO | M_WAITOK);
1566	/* copy the old data over and point at the new tables */
1567	memcpy(ntable, otable, onfiles * sizeof(*otable));
1568	fdp->fd_ofiles = ntable;
1569
1570	/*
1571	 * Allocate a new map only if the old is not large enough.  It will
1572	 * grow at a slower rate than the table as it can map more
1573	 * entries than the table can hold.
1574	 */
1575	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
1576		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
1577		    M_ZERO | M_WAITOK);
1578		/* copy over the old data and update the pointer */
1579		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
1580		fdp->fd_map = nmap;
1581	}
1582
1583	/*
1584	 * In order to have a valid pattern for fget_unlocked()
1585	 * fdp->fd_nfiles must be the last member to be updated, otherwise
1586	 * fget_unlocked() consumers may reference a new, higher value for
1587	 * fdp->fd_nfiles before to access the fdp->fd_ofiles array,
1588	 * resulting in OOB accesses.
1589	 */
1590	atomic_store_rel_int(&fdp->fd_nfiles, nnfiles);
1591
1592	/*
1593	 * Do not free the old file table, as some threads may still
1594	 * reference entries within it.  Instead, place it on a freelist
1595	 * which will be processed when the struct filedesc is released.
1596	 *
1597	 * Note that if onfiles == NDFILE, we're dealing with the original
1598	 * static allocation contained within (struct filedesc0 *)fdp,
1599	 * which must not be freed.
1600	 */
1601	if (onfiles > NDFILE) {
1602		ft = (struct freetable *)&otable[onfiles];
1603		fdp0 = (struct filedesc0 *)fdp;
1604		ft->ft_table = otable;
1605		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
1606	}
1607	/*
1608	 * The map does not have the same possibility of threads still
1609	 * holding references to it.  So always free it as long as it
1610	 * does not reference the original static allocation.
1611	 */
1612	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
1613		free(omap, M_FILEDESC);
1614}
1615
1616/*
1617 * Allocate a file descriptor for the process.
1618 */
1619int
1620fdalloc(struct thread *td, int minfd, int *result)
1621{
1622	struct proc *p = td->td_proc;
1623	struct filedesc *fdp = p->p_fd;
1624	int fd = -1, maxfd, allocfd;
1625#ifdef RACCT
1626	int error;
1627#endif
1628
1629	FILEDESC_XLOCK_ASSERT(fdp);
1630
1631	if (fdp->fd_freefile > minfd)
1632		minfd = fdp->fd_freefile;
1633
1634	maxfd = getmaxfd(p);
1635
1636	/*
1637	 * Search the bitmap for a free descriptor starting at minfd.
1638	 * If none is found, grow the file table.
1639	 */
1640	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1641	if (fd >= maxfd)
1642		return (EMFILE);
1643	if (fd >= fdp->fd_nfiles) {
1644		allocfd = min(fd * 2, maxfd);
1645#ifdef RACCT
1646		if (racct_enable) {
1647			PROC_LOCK(p);
1648			error = racct_set(p, RACCT_NOFILE, allocfd);
1649			PROC_UNLOCK(p);
1650			if (error != 0)
1651				return (EMFILE);
1652		}
1653#endif
1654		/*
1655		 * fd is already equal to first free descriptor >= minfd, so
1656		 * we only need to grow the table and we are done.
1657		 */
1658		fdgrowtable_exp(fdp, allocfd);
1659	}
1660
1661	/*
1662	 * Perform some sanity checks, then mark the file descriptor as
1663	 * used and return it to the caller.
1664	 */
1665	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
1666	    ("invalid descriptor %d", fd));
1667	KASSERT(!fdisused(fdp, fd),
1668	    ("fd_first_free() returned non-free descriptor"));
1669	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
1670	    ("file descriptor isn't free"));
1671	KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
1672	fdused(fdp, fd);
1673	*result = fd;
1674	return (0);
1675}
1676
1677/*
1678 * Allocate n file descriptors for the process.
1679 */
1680int
1681fdallocn(struct thread *td, int minfd, int *fds, int n)
1682{
1683	struct proc *p = td->td_proc;
1684	struct filedesc *fdp = p->p_fd;
1685	int i;
1686
1687	FILEDESC_XLOCK_ASSERT(fdp);
1688
1689	if (!fdavail(td, n))
1690		return (EMFILE);
1691
1692	for (i = 0; i < n; i++)
1693		if (fdalloc(td, 0, &fds[i]) != 0)
1694			break;
1695
1696	if (i < n) {
1697		for (i--; i >= 0; i--)
1698			fdunused(fdp, fds[i]);
1699		return (EMFILE);
1700	}
1701
1702	return (0);
1703}
1704
1705/*
1706 * Check to see whether n user file descriptors are available to the process
1707 * p.
1708 */
1709int
1710fdavail(struct thread *td, int n)
1711{
1712	struct proc *p = td->td_proc;
1713	struct filedesc *fdp = td->td_proc->p_fd;
1714	int i, lim, last;
1715
1716	FILEDESC_LOCK_ASSERT(fdp);
1717
1718	/*
1719	 * XXX: This is only called from uipc_usrreq.c:unp_externalize();
1720	 *      call racct_add() from there instead of dealing with containers
1721	 *      here.
1722	 */
1723	lim = getmaxfd(p);
1724	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1725		return (1);
1726	last = min(fdp->fd_nfiles, lim);
1727	for (i = fdp->fd_freefile; i < last; i++) {
1728		if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0)
1729			return (1);
1730	}
1731	return (0);
1732}
1733
1734/*
1735 * Create a new open file structure and allocate a file decriptor for the
1736 * process that refers to it.  We add one reference to the file for the
1737 * descriptor table and one reference for resultfp. This is to prevent us
1738 * being preempted and the entry in the descriptor table closed after we
1739 * release the FILEDESC lock.
1740 */
1741int
1742falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
1743{
1744	struct file *fp;
1745	int error, fd;
1746
1747	error = falloc_noinstall(td, &fp);
1748	if (error)
1749		return (error);		/* no reference held on error */
1750
1751	error = finstall(td, fp, &fd, flags, NULL);
1752	if (error) {
1753		fdrop(fp, td);		/* one reference (fp only) */
1754		return (error);
1755	}
1756
1757	if (resultfp != NULL)
1758		*resultfp = fp;		/* copy out result */
1759	else
1760		fdrop(fp, td);		/* release local reference */
1761
1762	if (resultfd != NULL)
1763		*resultfd = fd;
1764
1765	return (0);
1766}
1767
1768/*
1769 * Create a new open file structure without allocating a file descriptor.
1770 */
1771int
1772falloc_noinstall(struct thread *td, struct file **resultfp)
1773{
1774	struct file *fp;
1775	int maxuserfiles = maxfiles - (maxfiles / 20);
1776	static struct timeval lastfail;
1777	static int curfail;
1778
1779	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
1780
1781	if ((openfiles >= maxuserfiles &&
1782	    priv_check(td, PRIV_MAXFILES) != 0) ||
1783	    openfiles >= maxfiles) {
1784		if (ppsratecheck(&lastfail, &curfail, 1)) {
1785			printf("kern.maxfiles limit exceeded by uid %i, "
1786			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
1787		}
1788		return (ENFILE);
1789	}
1790	atomic_add_int(&openfiles, 1);
1791	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1792	refcount_init(&fp->f_count, 1);
1793	fp->f_cred = crhold(td->td_ucred);
1794	fp->f_ops = &badfileops;
1795	fp->f_data = NULL;
1796	fp->f_vnode = NULL;
1797	*resultfp = fp;
1798	return (0);
1799}
1800
1801/*
1802 * Install a file in a file descriptor table.
1803 */
1804int
1805finstall(struct thread *td, struct file *fp, int *fd, int flags,
1806    struct filecaps *fcaps)
1807{
1808	struct filedesc *fdp = td->td_proc->p_fd;
1809	struct filedescent *fde;
1810	int error;
1811
1812	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
1813	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
1814	if (fcaps != NULL)
1815		filecaps_validate(fcaps, __func__);
1816
1817	FILEDESC_XLOCK(fdp);
1818	if ((error = fdalloc(td, 0, fd))) {
1819		FILEDESC_XUNLOCK(fdp);
1820		return (error);
1821	}
1822	fhold(fp);
1823	fde = &fdp->fd_ofiles[*fd];
1824#ifdef CAPABILITIES
1825	seq_write_begin(&fde->fde_seq);
1826#endif
1827	fde->fde_file = fp;
1828	if ((flags & O_CLOEXEC) != 0)
1829		fde->fde_flags |= UF_EXCLOSE;
1830	if (fcaps != NULL)
1831		filecaps_move(fcaps, &fde->fde_caps);
1832	else
1833		filecaps_fill(&fde->fde_caps);
1834#ifdef CAPABILITIES
1835	seq_write_end(&fde->fde_seq);
1836#endif
1837	FILEDESC_XUNLOCK(fdp);
1838	return (0);
1839}
1840
1841/*
1842 * Build a new filedesc structure from another.
1843 * Copy the current, root, and jail root vnode references.
1844 */
1845struct filedesc *
1846fdinit(struct filedesc *fdp)
1847{
1848	struct filedesc0 *newfdp;
1849
1850	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
1851	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
1852	if (fdp != NULL) {
1853		FILEDESC_SLOCK(fdp);
1854		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1855		if (newfdp->fd_fd.fd_cdir)
1856			VREF(newfdp->fd_fd.fd_cdir);
1857		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1858		if (newfdp->fd_fd.fd_rdir)
1859			VREF(newfdp->fd_fd.fd_rdir);
1860		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1861		if (newfdp->fd_fd.fd_jdir)
1862			VREF(newfdp->fd_fd.fd_jdir);
1863		FILEDESC_SUNLOCK(fdp);
1864	}
1865
1866	/* Create the file descriptor table. */
1867	newfdp->fd_fd.fd_refcnt = 1;
1868	newfdp->fd_fd.fd_holdcnt = 1;
1869	newfdp->fd_fd.fd_cmask = CMASK;
1870	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1871	newfdp->fd_fd.fd_nfiles = NDFILE;
1872	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
1873	newfdp->fd_fd.fd_lastfile = -1;
1874	return (&newfdp->fd_fd);
1875}
1876
1877static struct filedesc *
1878fdhold(struct proc *p)
1879{
1880	struct filedesc *fdp;
1881
1882	mtx_lock(&fdesc_mtx);
1883	fdp = p->p_fd;
1884	if (fdp != NULL)
1885		fdp->fd_holdcnt++;
1886	mtx_unlock(&fdesc_mtx);
1887	return (fdp);
1888}
1889
1890static void
1891fddrop(struct filedesc *fdp)
1892{
1893	struct filedesc0 *fdp0;
1894	struct freetable *ft;
1895	int i;
1896
1897	mtx_lock(&fdesc_mtx);
1898	i = --fdp->fd_holdcnt;
1899	mtx_unlock(&fdesc_mtx);
1900	if (i > 0)
1901		return;
1902
1903	FILEDESC_LOCK_DESTROY(fdp);
1904	fdp0 = (struct filedesc0 *)fdp;
1905	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
1906		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
1907		free(ft->ft_table, M_FILEDESC);
1908	}
1909	free(fdp, M_FILEDESC);
1910}
1911
1912/*
1913 * Share a filedesc structure.
1914 */
1915struct filedesc *
1916fdshare(struct filedesc *fdp)
1917{
1918
1919	FILEDESC_XLOCK(fdp);
1920	fdp->fd_refcnt++;
1921	FILEDESC_XUNLOCK(fdp);
1922	return (fdp);
1923}
1924
1925/*
1926 * Unshare a filedesc structure, if necessary by making a copy
1927 */
1928void
1929fdunshare(struct thread *td)
1930{
1931	struct filedesc *tmp;
1932	struct proc *p = td->td_proc;
1933
1934	if (p->p_fd->fd_refcnt == 1)
1935		return;
1936
1937	tmp = fdcopy(p->p_fd);
1938	fdescfree(td);
1939	p->p_fd = tmp;
1940}
1941
1942/*
1943 * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
1944 * this is to ease callers, not catch errors.
1945 */
1946struct filedesc *
1947fdcopy(struct filedesc *fdp)
1948{
1949	struct filedesc *newfdp;
1950	struct filedescent *nfde, *ofde;
1951	int i;
1952
1953	/* Certain daemons might not have file descriptors. */
1954	if (fdp == NULL)
1955		return (NULL);
1956
1957	newfdp = fdinit(fdp);
1958	FILEDESC_SLOCK(fdp);
1959	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1960		FILEDESC_SUNLOCK(fdp);
1961		FILEDESC_XLOCK(newfdp);
1962		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1963		FILEDESC_XUNLOCK(newfdp);
1964		FILEDESC_SLOCK(fdp);
1965	}
1966	/* copy all passable descriptors (i.e. not kqueue) */
1967	newfdp->fd_freefile = -1;
1968	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1969		ofde = &fdp->fd_ofiles[i];
1970		if (fdisused(fdp, i) &&
1971		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
1972		    ofde->fde_file->f_ops != &badfileops) {
1973			nfde = &newfdp->fd_ofiles[i];
1974			*nfde = *ofde;
1975			filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
1976			fhold(nfde->fde_file);
1977			newfdp->fd_lastfile = i;
1978		} else {
1979			if (newfdp->fd_freefile == -1)
1980				newfdp->fd_freefile = i;
1981		}
1982	}
1983	newfdp->fd_cmask = fdp->fd_cmask;
1984	FILEDESC_SUNLOCK(fdp);
1985	FILEDESC_XLOCK(newfdp);
1986	for (i = 0; i <= newfdp->fd_lastfile; ++i) {
1987		if (newfdp->fd_ofiles[i].fde_file != NULL)
1988			fdused(newfdp, i);
1989	}
1990	if (newfdp->fd_freefile == -1)
1991		newfdp->fd_freefile = i;
1992	FILEDESC_XUNLOCK(newfdp);
1993	return (newfdp);
1994}
1995
1996/*
1997 * Release a filedesc structure.
1998 */
1999void
2000fdescfree(struct thread *td)
2001{
2002	struct filedesc *fdp;
2003	int i;
2004	struct filedesc_to_leader *fdtol;
2005	struct file *fp;
2006	struct vnode *cdir, *jdir, *rdir, *vp;
2007	struct flock lf;
2008
2009	/* Certain daemons might not have file descriptors. */
2010	fdp = td->td_proc->p_fd;
2011	if (fdp == NULL)
2012		return;
2013
2014#ifdef RACCT
2015	if (racct_enable) {
2016		PROC_LOCK(td->td_proc);
2017		racct_set(td->td_proc, RACCT_NOFILE, 0);
2018		PROC_UNLOCK(td->td_proc);
2019	}
2020#endif
2021
2022	/* Check for special need to clear POSIX style locks */
2023	fdtol = td->td_proc->p_fdtol;
2024	if (fdtol != NULL) {
2025		FILEDESC_XLOCK(fdp);
2026		KASSERT(fdtol->fdl_refcount > 0,
2027		    ("filedesc_to_refcount botch: fdl_refcount=%d",
2028		    fdtol->fdl_refcount));
2029		if (fdtol->fdl_refcount == 1 &&
2030		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2031			for (i = 0; i <= fdp->fd_lastfile; i++) {
2032				fp = fdp->fd_ofiles[i].fde_file;
2033				if (fp == NULL || fp->f_type != DTYPE_VNODE)
2034					continue;
2035				fhold(fp);
2036				FILEDESC_XUNLOCK(fdp);
2037				lf.l_whence = SEEK_SET;
2038				lf.l_start = 0;
2039				lf.l_len = 0;
2040				lf.l_type = F_UNLCK;
2041				vp = fp->f_vnode;
2042				(void) VOP_ADVLOCK(vp,
2043				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
2044				    &lf, F_POSIX);
2045				FILEDESC_XLOCK(fdp);
2046				fdrop(fp, td);
2047			}
2048		}
2049	retry:
2050		if (fdtol->fdl_refcount == 1) {
2051			if (fdp->fd_holdleaderscount > 0 &&
2052			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2053				/*
2054				 * close() or do_dup() has cleared a reference
2055				 * in a shared file descriptor table.
2056				 */
2057				fdp->fd_holdleaderswakeup = 1;
2058				sx_sleep(&fdp->fd_holdleaderscount,
2059				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
2060				goto retry;
2061			}
2062			if (fdtol->fdl_holdcount > 0) {
2063				/*
2064				 * Ensure that fdtol->fdl_leader remains
2065				 * valid in closef().
2066				 */
2067				fdtol->fdl_wakeup = 1;
2068				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
2069				    "fdlhold", 0);
2070				goto retry;
2071			}
2072		}
2073		fdtol->fdl_refcount--;
2074		if (fdtol->fdl_refcount == 0 &&
2075		    fdtol->fdl_holdcount == 0) {
2076			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
2077			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
2078		} else
2079			fdtol = NULL;
2080		td->td_proc->p_fdtol = NULL;
2081		FILEDESC_XUNLOCK(fdp);
2082		if (fdtol != NULL)
2083			free(fdtol, M_FILEDESC_TO_LEADER);
2084	}
2085
2086	mtx_lock(&fdesc_mtx);
2087	td->td_proc->p_fd = NULL;
2088	mtx_unlock(&fdesc_mtx);
2089
2090	FILEDESC_XLOCK(fdp);
2091	i = --fdp->fd_refcnt;
2092	if (i > 0) {
2093		FILEDESC_XUNLOCK(fdp);
2094		return;
2095	}
2096
2097	cdir = fdp->fd_cdir;
2098	fdp->fd_cdir = NULL;
2099	rdir = fdp->fd_rdir;
2100	fdp->fd_rdir = NULL;
2101	jdir = fdp->fd_jdir;
2102	fdp->fd_jdir = NULL;
2103	FILEDESC_XUNLOCK(fdp);
2104
2105	for (i = 0; i <= fdp->fd_lastfile; i++) {
2106		fp = fdp->fd_ofiles[i].fde_file;
2107		if (fp != NULL) {
2108			fdfree_last(fdp, i);
2109			(void) closef(fp, td);
2110		}
2111	}
2112
2113	if (fdp->fd_nfiles > NDFILE)
2114		free(fdp->fd_ofiles, M_FILEDESC);
2115	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
2116		free(fdp->fd_map, M_FILEDESC);
2117
2118	if (cdir != NULL)
2119		vrele(cdir);
2120	if (rdir != NULL)
2121		vrele(rdir);
2122	if (jdir != NULL)
2123		vrele(jdir);
2124
2125	fddrop(fdp);
2126}
2127
2128/*
2129 * For setugid programs, we don't want to people to use that setugidness
2130 * to generate error messages which write to a file which otherwise would
2131 * otherwise be off-limits to the process.  We check for filesystems where
2132 * the vnode can change out from under us after execve (like [lin]procfs).
2133 *
2134 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
2135 * sufficient.  We also don't check for setugidness since we know we are.
2136 */
2137static int
2138is_unsafe(struct file *fp)
2139{
2140	if (fp->f_type == DTYPE_VNODE) {
2141		struct vnode *vp = fp->f_vnode;
2142
2143		if ((vp->v_vflag & VV_PROCDEP) != 0)
2144			return (1);
2145	}
2146	return (0);
2147}
2148
2149/*
2150 * Make this setguid thing safe, if at all possible.
2151 */
2152void
2153setugidsafety(struct thread *td)
2154{
2155	struct filedesc *fdp;
2156	struct file *fp;
2157	int i;
2158
2159	fdp = td->td_proc->p_fd;
2160	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2161	FILEDESC_XLOCK(fdp);
2162	for (i = 0; i <= fdp->fd_lastfile; i++) {
2163		if (i > 2)
2164			break;
2165		fp = fdp->fd_ofiles[i].fde_file;
2166		if (fp != NULL && is_unsafe(fp)) {
2167			knote_fdclose(td, i);
2168			/*
2169			 * NULL-out descriptor prior to close to avoid
2170			 * a race while close blocks.
2171			 */
2172			fdfree(fdp, i);
2173			FILEDESC_XUNLOCK(fdp);
2174			(void) closef(fp, td);
2175			FILEDESC_XLOCK(fdp);
2176		}
2177	}
2178	FILEDESC_XUNLOCK(fdp);
2179}
2180
2181/*
2182 * If a specific file object occupies a specific file descriptor, close the
2183 * file descriptor entry and drop a reference on the file object.  This is a
2184 * convenience function to handle a subsequent error in a function that calls
2185 * falloc() that handles the race that another thread might have closed the
2186 * file descriptor out from under the thread creating the file object.
2187 */
2188void
2189fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
2190{
2191
2192	FILEDESC_XLOCK(fdp);
2193	if (fdp->fd_ofiles[idx].fde_file == fp) {
2194		fdfree(fdp, idx);
2195		FILEDESC_XUNLOCK(fdp);
2196		fdrop(fp, td);
2197	} else
2198		FILEDESC_XUNLOCK(fdp);
2199}
2200
2201/*
2202 * Close any files on exec?
2203 */
2204void
2205fdcloseexec(struct thread *td)
2206{
2207	struct filedesc *fdp;
2208	struct filedescent *fde;
2209	struct file *fp;
2210	int i;
2211
2212	fdp = td->td_proc->p_fd;
2213	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2214	FILEDESC_XLOCK(fdp);
2215	for (i = 0; i <= fdp->fd_lastfile; i++) {
2216		fde = &fdp->fd_ofiles[i];
2217		fp = fde->fde_file;
2218		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
2219		    (fde->fde_flags & UF_EXCLOSE))) {
2220			fdfree(fdp, i);
2221			(void) closefp(fdp, i, fp, td, 0);
2222			/* closefp() drops the FILEDESC lock. */
2223			FILEDESC_XLOCK(fdp);
2224		}
2225	}
2226	FILEDESC_XUNLOCK(fdp);
2227}
2228
2229/*
2230 * It is unsafe for set[ug]id processes to be started with file
2231 * descriptors 0..2 closed, as these descriptors are given implicit
2232 * significance in the Standard C library.  fdcheckstd() will create a
2233 * descriptor referencing /dev/null for each of stdin, stdout, and
2234 * stderr that is not already open.
2235 */
2236int
2237fdcheckstd(struct thread *td)
2238{
2239	struct filedesc *fdp;
2240	register_t retval, save;
2241	int i, error, devnull;
2242
2243	fdp = td->td_proc->p_fd;
2244	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2245	devnull = -1;
2246	error = 0;
2247	for (i = 0; i < 3; i++) {
2248		if (fdp->fd_ofiles[i].fde_file != NULL)
2249			continue;
2250		if (devnull < 0) {
2251			save = td->td_retval[0];
2252			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
2253			    O_RDWR, 0);
2254			devnull = td->td_retval[0];
2255			td->td_retval[0] = save;
2256			if (error)
2257				break;
2258			KASSERT(devnull == i, ("oof, we didn't get our fd"));
2259		} else {
2260			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
2261			if (error != 0)
2262				break;
2263		}
2264	}
2265	return (error);
2266}
2267
2268/*
2269 * Internal form of close.  Decrement reference count on file structure.
2270 * Note: td may be NULL when closing a file that was being passed in a
2271 * message.
2272 *
2273 * XXXRW: Giant is not required for the caller, but often will be held; this
2274 * makes it moderately likely the Giant will be recursed in the VFS case.
2275 */
2276int
2277closef(struct file *fp, struct thread *td)
2278{
2279	struct vnode *vp;
2280	struct flock lf;
2281	struct filedesc_to_leader *fdtol;
2282	struct filedesc *fdp;
2283
2284	/*
2285	 * POSIX record locking dictates that any close releases ALL
2286	 * locks owned by this process.  This is handled by setting
2287	 * a flag in the unlock to free ONLY locks obeying POSIX
2288	 * semantics, and not to free BSD-style file locks.
2289	 * If the descriptor was in a message, POSIX-style locks
2290	 * aren't passed with the descriptor, and the thread pointer
2291	 * will be NULL.  Callers should be careful only to pass a
2292	 * NULL thread pointer when there really is no owning
2293	 * context that might have locks, or the locks will be
2294	 * leaked.
2295	 */
2296	if (fp->f_type == DTYPE_VNODE && td != NULL) {
2297		vp = fp->f_vnode;
2298		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2299			lf.l_whence = SEEK_SET;
2300			lf.l_start = 0;
2301			lf.l_len = 0;
2302			lf.l_type = F_UNLCK;
2303			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
2304			    F_UNLCK, &lf, F_POSIX);
2305		}
2306		fdtol = td->td_proc->p_fdtol;
2307		if (fdtol != NULL) {
2308			/*
2309			 * Handle special case where file descriptor table is
2310			 * shared between multiple process leaders.
2311			 */
2312			fdp = td->td_proc->p_fd;
2313			FILEDESC_XLOCK(fdp);
2314			for (fdtol = fdtol->fdl_next;
2315			     fdtol != td->td_proc->p_fdtol;
2316			     fdtol = fdtol->fdl_next) {
2317				if ((fdtol->fdl_leader->p_flag &
2318				     P_ADVLOCK) == 0)
2319					continue;
2320				fdtol->fdl_holdcount++;
2321				FILEDESC_XUNLOCK(fdp);
2322				lf.l_whence = SEEK_SET;
2323				lf.l_start = 0;
2324				lf.l_len = 0;
2325				lf.l_type = F_UNLCK;
2326				vp = fp->f_vnode;
2327				(void) VOP_ADVLOCK(vp,
2328				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
2329				    F_POSIX);
2330				FILEDESC_XLOCK(fdp);
2331				fdtol->fdl_holdcount--;
2332				if (fdtol->fdl_holdcount == 0 &&
2333				    fdtol->fdl_wakeup != 0) {
2334					fdtol->fdl_wakeup = 0;
2335					wakeup(fdtol);
2336				}
2337			}
2338			FILEDESC_XUNLOCK(fdp);
2339		}
2340	}
2341	return (fdrop(fp, td));
2342}
2343
2344/*
2345 * Initialize the file pointer with the specified properties.
2346 *
2347 * The ops are set with release semantics to be certain that the flags, type,
2348 * and data are visible when ops is.  This is to prevent ops methods from being
2349 * called with bad data.
2350 */
2351void
2352finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2353{
2354	fp->f_data = data;
2355	fp->f_flag = flag;
2356	fp->f_type = type;
2357	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2358}
2359
/*
 * Look up descriptor 'fd' in table 'fdp' without taking the filedesc lock
 * and hand back the file, with one extra reference, through *fpp.
 *
 * Returns EBADF when fd is out of range or the slot is empty.  When
 * CAPABILITIES is compiled in and needrightsp is non-NULL, the error from
 * cap_check() is returned, and, if CAP_FCNTL is among the needed rights,
 * the error from cap_fcntl_check_fde() for 'needfcntl'.  On success 0 is
 * returned; if haverightsp is non-NULL it receives the rights read from
 * the descriptor entry (or all rights when CAPABILITIES is not compiled
 * in).  *fpp is written only on success.
 */
2360int
2361fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
2362    int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
2363{
2364#ifdef CAPABILITIES
2365	struct filedescent fde;
2366#endif
2367	struct file *fp;
2368	u_int count;
2369#ifdef CAPABILITIES
2370	seq_t seq;
2371	cap_rights_t haverights;
2372	int error;
2373#endif
2374
2375	/*
2376	 * Avoid reads reordering and then a first access to the
2377	 * fdp->fd_ofiles table which could result in OOB operation.
2378	 */
2379	if (fd < 0 || fd >= atomic_load_acq_int(&fdp->fd_nfiles))
2380		return (EBADF);
2381	/*
2382	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
2383	 * never raising a refcount above 0.  To accomplish this we have
2384	 * to use a cmpset loop rather than an atomic_add.  The descriptor
2385	 * must be re-verified once we acquire a reference to be certain
2386	 * that the identity is still correct and we did not lose a race
2387	 * due to preemption.
2388	 */
2389	for (;;) {
2390#ifdef CAPABILITIES
		/*
		 * Take a private copy of the whole entry and retry until
		 * the sequence counter says the copy is consistent.
		 */
2391		seq = seq_read(fd_seq(fdp, fd));
2392		fde = fdp->fd_ofiles[fd];
2393		if (!seq_consistent(fd_seq(fdp, fd), seq)) {
2394			cpu_spinwait();
2395			continue;
2396		}
2397		fp = fde.fde_file;
2398#else
2399		fp = fdp->fd_ofiles[fd].fde_file;
2400#endif
2401		if (fp == NULL)
2402			return (EBADF);
2403#ifdef CAPABILITIES
		/* Rights are checked against the private copy of the entry. */
2404		haverights = *cap_rights_fde(&fde);
2405		if (needrightsp != NULL) {
2406			error = cap_check(&haverights, needrightsp);
2407			if (error != 0)
2408				return (error);
2409			if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
2410				error = cap_fcntl_check_fde(&fde, needfcntl);
2411				if (error != 0)
2412					return (error);
2413			}
2414		}
2415#endif
		/* Never bump a count of zero (see above); look the slot up again. */
2416		count = fp->f_count;
2417		if (count == 0)
2418			continue;
2419		/*
2420		 * Use an acquire barrier to prevent caching of fd_ofiles
2421		 * so it is refreshed for verification.
2422		 */
2423		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
2424			continue;
		/* Reference taken: confirm the slot still matches what was read. */
2425#ifdef	CAPABILITIES
2426		if (seq_consistent_nomb(fd_seq(fdp, fd), seq))
2427#else
2428		if (fp == fdp->fd_ofiles[fd].fde_file)
2429#endif
2430			break;
		/* The slot changed underneath us; undo the reference and retry. */
2431		fdrop(fp, curthread);
2432	}
2433	*fpp = fp;
2434	if (haverightsp != NULL) {
2435#ifdef CAPABILITIES
2436		*haverightsp = haverights;
2437#else
2438		CAP_ALL(haverightsp);
2439#endif
2440	}
2441	return (0);
2442}
2443
2444/*
2445 * Extract the file pointer associated with the specified descriptor for the
2446 * current user process.
2447 *
2448 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2449 * returned.
2450 *
2451 * File's rights will be checked against the capability rights mask.
2452 *
2453 * If an error occured the non-zero error is returned and *fpp is set to
2454 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2455 * responsible for fdrop().
2456 */
2457static __inline int
2458_fget(struct thread *td, int fd, struct file **fpp, int flags,
2459    cap_rights_t *needrightsp, u_char *maxprotp)
2460{
2461	struct filedesc *fdp;
2462	struct file *fp;
2463	cap_rights_t haverights, needrights;
2464	int error;
2465
2466	*fpp = NULL;
2467	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2468		return (EBADF);
2469	if (needrightsp != NULL)
2470		needrights = *needrightsp;
2471	else
2472		cap_rights_init(&needrights);
2473	if (maxprotp != NULL)
2474		cap_rights_set(&needrights, CAP_MMAP);
2475	error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
2476	if (error != 0)
2477		return (error);
2478	if (fp->f_ops == &badfileops) {
2479		fdrop(fp, td);
2480		return (EBADF);
2481	}
2482
2483#ifdef CAPABILITIES
2484	/*
2485	 * If requested, convert capability rights to access flags.
2486	 */
2487	if (maxprotp != NULL)
2488		*maxprotp = cap_rights_to_vmprot(&haverights);
2489#else /* !CAPABILITIES */
2490	if (maxprotp != NULL)
2491		*maxprotp = VM_PROT_ALL;
2492#endif /* CAPABILITIES */
2493
2494	/*
2495	 * FREAD and FWRITE failure return EBADF as per POSIX.
2496	 */
2497	error = 0;
2498	switch (flags) {
2499	case FREAD:
2500	case FWRITE:
2501		if ((fp->f_flag & flags) == 0)
2502			error = EBADF;
2503		break;
2504	case FEXEC:
2505	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
2506		    ((fp->f_flag & FWRITE) != 0))
2507			error = EBADF;
2508		break;
2509	case 0:
2510		break;
2511	default:
2512		KASSERT(0, ("wrong flags"));
2513	}
2514
2515	if (error != 0) {
2516		fdrop(fp, td);
2517		return (error);
2518	}
2519
2520	*fpp = fp;
2521	return (0);
2522}
2523
/*
 * Look up 'fd' for thread 'td' through _fget() with no access-mode
 * requirement (flags 0) and no maxprot output.  The result of _fget() is
 * returned unchanged.
 */
2524int
2525fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2526{
2527
2528	return(_fget(td, fd, fpp, 0, rightsp, NULL));
2529}
2530
/*
 * Like fget(), but also passes 'maxprotp' to _fget(), which uses it to
 * report the maximum VM protection for the descriptor.
 */
2531int
2532fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
2533    struct file **fpp)
2534{
2535
2536	return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
2537}
2538
/*
 * Like fget(), but asks _fget() to require FREAD on the file.
 */
2539int
2540fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2541{
2542
2543	return(_fget(td, fd, fpp, FREAD, rightsp, NULL));
2544}
2545
/*
 * Like fget(), but asks _fget() to require FWRITE on the file.
 */
2546int
2547fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2548{
2549
2550	return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
2551}
2552
2553/*
2554 * Like fget() but loads the underlying vnode, or returns an error if the
2555 * descriptor does not represent a vnode.  Note that pipes use vnodes but
2556 * never have VM objects.  The returned vnode will be vref()'d.
2557 *
2558 * XXX: what about the unused flags ?
2559 */
2560static __inline int
2561_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
2562    struct vnode **vpp)
2563{
2564	struct file *fp;
2565	int error;
2566
2567	*vpp = NULL;
2568	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
2569	if (error != 0)
2570		return (error);
2571	if (fp->f_vnode == NULL) {
2572		error = EINVAL;
2573	} else {
2574		*vpp = fp->f_vnode;
2575		vref(*vpp);
2576	}
2577	fdrop(fp, td);
2578
2579	return (error);
2580}
2581
/*
 * Fetch the vnode behind 'fd' through _fgetvp() with no access-mode
 * requirement (flags 0).
 */
2582int
2583fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2584{
2585
2586	return (_fgetvp(td, fd, 0, rightsp, vpp));
2587}
2588
2589int
2590fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
2591    struct filecaps *havecaps, struct vnode **vpp)
2592{
2593	struct filedesc *fdp;
2594	struct file *fp;
2595#ifdef CAPABILITIES
2596	int error;
2597#endif
2598
2599	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2600		return (EBADF);
2601
2602	fp = fget_locked(fdp, fd);
2603	if (fp == NULL || fp->f_ops == &badfileops)
2604		return (EBADF);
2605
2606#ifdef CAPABILITIES
2607	if (needrightsp != NULL) {
2608		error = cap_check(cap_rights(fdp, fd), needrightsp);
2609		if (error != 0)
2610			return (error);
2611	}
2612#endif
2613
2614	if (fp->f_vnode == NULL)
2615		return (EINVAL);
2616
2617	*vpp = fp->f_vnode;
2618	vref(*vpp);
2619	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
2620
2621	return (0);
2622}
2623
/*
 * Like fgetvp(), but asks _fgetvp() to require FREAD on the file.
 */
2624int
2625fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2626{
2627
2628	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
2629}
2630
/*
 * Like fgetvp(), but asks _fgetvp() to apply the FEXEC access check.
 */
2631int
2632fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2633{
2634
2635	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
2636}
2637
/*
 * Like fgetvp(), but asks _fgetvp() to require FWRITE on the file.  Only
 * compiled when 'notyet' is defined.
 */
2638#ifdef notyet
2639int
2640fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
2641    struct vnode **vpp)
2642{
2643
2644	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
2645}
2646#endif
2647
2648/*
2649 * Like fget() but loads the underlying socket, or returns an error if the
2650 * descriptor does not represent a socket.
2651 *
2652 * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
2653 * in the future.
2654 *
2655 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
2656 * on their file descriptor reference to prevent the socket from being free'd
2657 * during use.
2658 */
2659int
2660fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
2661    u_int *fflagp)
2662{
2663	struct file *fp;
2664	int error;
2665
2666	*spp = NULL;
2667	if (fflagp != NULL)
2668		*fflagp = 0;
2669	if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
2670		return (error);
2671	if (fp->f_type != DTYPE_SOCKET) {
2672		error = ENOTSOCK;
2673	} else {
2674		*spp = fp->f_data;
2675		if (fflagp)
2676			*fflagp = fp->f_flag;
2677		SOCK_LOCK(*spp);
2678		soref(*spp);
2679		SOCK_UNLOCK(*spp);
2680	}
2681	fdrop(fp, td);
2682
2683	return (error);
2684}
2685
2686/*
2687 * Drop the reference count on the socket and XXX release the SX lock in the
2688 * future.  The last reference closes the socket.
2689 *
2690 * Note: fputsock() is deprecated, see comment for fgetsock().
2691 */
2692void
2693fputsock(struct socket *so)
2694{
2695
	/*
	 * Both locks are taken here and never released in this function.
	 * NOTE(review): presumably sorele() drops them -- confirm against
	 * its definition.
	 */
2696	ACCEPT_LOCK();
2697	SOCK_LOCK(so);
	/* Run the release with the socket's own vnet as the current vnet. */
2698	CURVNET_SET(so->so_vnet);
2699	sorele(so);
2700	CURVNET_RESTORE();
2701}
2702
2703/*
2704 * Handle the last reference to a file being closed.
2705 */
2706int
2707_fdrop(struct file *fp, struct thread *td)
2708{
2709	int error;
2710
2711	error = 0;
2712	if (fp->f_count != 0)
2713		panic("fdrop: count %d", fp->f_count);
2714	if (fp->f_ops != &badfileops)
2715		error = fo_close(fp, td);
2716	atomic_subtract_int(&openfiles, 1);
2717	crfree(fp->f_cred);
2718	free(fp->f_advice, M_FADVISE);
2719	uma_zfree(file_zone, fp);
2720
2721	return (error);
2722}
2723
2724/*
2725 * Apply an advisory lock on a file descriptor.
2726 *
2727 * Just attempt to get a record lock of the requested type on the entire file
2728 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2729 */
/* Fallback definition, used only when <sys/sysproto.h> did not supply one. */
2730#ifndef _SYS_SYSPROTO_H_
2731struct flock_args {
	/* Descriptor to operate on; looked up with fget() in sys_flock(). */
2732	int	fd;
	/* Operation bits tested by sys_flock(): LOCK_SH, LOCK_EX, LOCK_NB, LOCK_UN. */
2733	int	how;
2734};
2735#endif
2736/* ARGSUSED */
2737int
2738sys_flock(struct thread *td, struct flock_args *uap)
2739{
2740	struct file *fp;
2741	struct vnode *vp;
2742	struct flock lf;
2743	cap_rights_t rights;
2744	int error;
2745
2746	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
2747	if (error != 0)
2748		return (error);
2749	if (fp->f_type != DTYPE_VNODE) {
2750		fdrop(fp, td);
2751		return (EOPNOTSUPP);
2752	}
2753
2754	vp = fp->f_vnode;
2755	lf.l_whence = SEEK_SET;
2756	lf.l_start = 0;
2757	lf.l_len = 0;
2758	if (uap->how & LOCK_UN) {
2759		lf.l_type = F_UNLCK;
2760		atomic_clear_int(&fp->f_flag, FHASLOCK);
2761		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2762		goto done2;
2763	}
2764	if (uap->how & LOCK_EX)
2765		lf.l_type = F_WRLCK;
2766	else if (uap->how & LOCK_SH)
2767		lf.l_type = F_RDLCK;
2768	else {
2769		error = EBADF;
2770		goto done2;
2771	}
2772	atomic_set_int(&fp->f_flag, FHASLOCK);
2773	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2774	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2775done2:
2776	fdrop(fp, td);
2777	return (error);
2778}
2779/*
2780 * Duplicate the specified descriptor to a free descriptor.
2781 */
2782int
2783dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
2784    int openerror, int *indxp)
2785{
2786	struct filedescent *newfde, *oldfde;
2787	struct file *fp;
2788	int error, indx;
2789
2790	KASSERT(openerror == ENODEV || openerror == ENXIO,
2791	    ("unexpected error %d in %s", openerror, __func__));
2792
2793	/*
2794	 * If the to-be-dup'd fd number is greater than the allowed number
2795	 * of file descriptors, or the fd to be dup'd has already been
2796	 * closed, then reject.
2797	 */
2798	FILEDESC_XLOCK(fdp);
2799	if ((fp = fget_locked(fdp, dfd)) == NULL) {
2800		FILEDESC_XUNLOCK(fdp);
2801		return (EBADF);
2802	}
2803
2804	error = fdalloc(td, 0, &indx);
2805	if (error != 0) {
2806		FILEDESC_XUNLOCK(fdp);
2807		return (error);
2808	}
2809
2810	/*
2811	 * There are two cases of interest here.
2812	 *
2813	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
2814	 *
2815	 * For ENXIO steal away the file structure from (dfd) and store it in
2816	 * (indx).  (dfd) is effectively closed by this operation.
2817	 */
2818	switch (openerror) {
2819	case ENODEV:
2820		/*
2821		 * Check that the mode the file is being opened for is a
2822		 * subset of the mode of the existing descriptor.
2823		 */
2824		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
2825			fdunused(fdp, indx);
2826			FILEDESC_XUNLOCK(fdp);
2827			return (EACCES);
2828		}
2829		fhold(fp);
2830		newfde = &fdp->fd_ofiles[indx];
2831		oldfde = &fdp->fd_ofiles[dfd];
2832#ifdef CAPABILITIES
2833		seq_write_begin(&newfde->fde_seq);
2834#endif
2835		memcpy(newfde, oldfde, fde_change_size);
2836		filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
2837#ifdef CAPABILITIES
2838		seq_write_end(&newfde->fde_seq);
2839#endif
2840		break;
2841	case ENXIO:
2842		/*
2843		 * Steal away the file pointer from dfd and stuff it into indx.
2844		 */
2845		newfde = &fdp->fd_ofiles[indx];
2846		oldfde = &fdp->fd_ofiles[dfd];
2847#ifdef CAPABILITIES
2848		seq_write_begin(&newfde->fde_seq);
2849#endif
2850		memcpy(newfde, oldfde, fde_change_size);
2851		bzero(oldfde, fde_change_size);
2852		fdunused(fdp, dfd);
2853#ifdef CAPABILITIES
2854		seq_write_end(&newfde->fde_seq);
2855#endif
2856		break;
2857	}
2858	FILEDESC_XUNLOCK(fdp);
2859	*indxp = indx;
2860	return (0);
2861}
2862
2863/*
2864 * Scan all active processes and prisons to see if any of them have a current
2865 * or root directory of `olddp'. If so, replace them with the new mount point.
2866 */
2867void
2868mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
2869{
2870	struct filedesc *fdp;
2871	struct prison *pr;
2872	struct proc *p;
2873	int nrele;
2874
2875	if (vrefcnt(olddp) == 1)
2876		return;
2877	nrele = 0;
2878	sx_slock(&allproc_lock);
2879	FOREACH_PROC_IN_SYSTEM(p) {
2880		fdp = fdhold(p);
2881		if (fdp == NULL)
2882			continue;
2883		FILEDESC_XLOCK(fdp);
2884		if (fdp->fd_cdir == olddp) {
2885			vref(newdp);
2886			fdp->fd_cdir = newdp;
2887			nrele++;
2888		}
2889		if (fdp->fd_rdir == olddp) {
2890			vref(newdp);
2891			fdp->fd_rdir = newdp;
2892			nrele++;
2893		}
2894		if (fdp->fd_jdir == olddp) {
2895			vref(newdp);
2896			fdp->fd_jdir = newdp;
2897			nrele++;
2898		}
2899		FILEDESC_XUNLOCK(fdp);
2900		fddrop(fdp);
2901	}
2902	sx_sunlock(&allproc_lock);
2903	if (rootvnode == olddp) {
2904		vref(newdp);
2905		rootvnode = newdp;
2906		nrele++;
2907	}
2908	mtx_lock(&prison0.pr_mtx);
2909	if (prison0.pr_root == olddp) {
2910		vref(newdp);
2911		prison0.pr_root = newdp;
2912		nrele++;
2913	}
2914	mtx_unlock(&prison0.pr_mtx);
2915	sx_slock(&allprison_lock);
2916	TAILQ_FOREACH(pr, &allprison, pr_list) {
2917		mtx_lock(&pr->pr_mtx);
2918		if (pr->pr_root == olddp) {
2919			vref(newdp);
2920			pr->pr_root = newdp;
2921			nrele++;
2922		}
2923		mtx_unlock(&pr->pr_mtx);
2924	}
2925	sx_sunlock(&allprison_lock);
2926	while (nrele--)
2927		vrele(olddp);
2928}
2929
2930struct filedesc_to_leader *
2931filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
2932{
2933	struct filedesc_to_leader *fdtol;
2934
2935	fdtol = malloc(sizeof(struct filedesc_to_leader),
2936	       M_FILEDESC_TO_LEADER,
2937	       M_WAITOK);
2938	fdtol->fdl_refcount = 1;
2939	fdtol->fdl_holdcount = 0;
2940	fdtol->fdl_wakeup = 0;
2941	fdtol->fdl_leader = leader;
2942	if (old != NULL) {
2943		FILEDESC_XLOCK(fdp);
2944		fdtol->fdl_next = old->fdl_next;
2945		fdtol->fdl_prev = old;
2946		old->fdl_next = fdtol;
2947		fdtol->fdl_next->fdl_prev = fdtol;
2948		FILEDESC_XUNLOCK(fdp);
2949	} else {
2950		fdtol->fdl_next = fdtol;
2951		fdtol->fdl_prev = fdtol;
2952	}
2953	return (fdtol);
2954}
2955
2956/*
2957 * Get file structures globally.
2958 */
2959static int
2960sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2961{
2962	struct xfile xf;
2963	struct filedesc *fdp;
2964	struct file *fp;
2965	struct proc *p;
2966	int error, n;
2967
2968	error = sysctl_wire_old_buffer(req, 0);
2969	if (error != 0)
2970		return (error);
2971	if (req->oldptr == NULL) {
2972		n = 0;
2973		sx_slock(&allproc_lock);
2974		FOREACH_PROC_IN_SYSTEM(p) {
2975			if (p->p_state == PRS_NEW)
2976				continue;
2977			fdp = fdhold(p);
2978			if (fdp == NULL)
2979				continue;
2980			/* overestimates sparse tables. */
2981			if (fdp->fd_lastfile > 0)
2982				n += fdp->fd_lastfile;
2983			fddrop(fdp);
2984		}
2985		sx_sunlock(&allproc_lock);
2986		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2987	}
2988	error = 0;
2989	bzero(&xf, sizeof(xf));
2990	xf.xf_size = sizeof(xf);
2991	sx_slock(&allproc_lock);
2992	FOREACH_PROC_IN_SYSTEM(p) {
2993		PROC_LOCK(p);
2994		if (p->p_state == PRS_NEW) {
2995			PROC_UNLOCK(p);
2996			continue;
2997		}
2998		if (p_cansee(req->td, p) != 0) {
2999			PROC_UNLOCK(p);
3000			continue;
3001		}
3002		xf.xf_pid = p->p_pid;
3003		xf.xf_uid = p->p_ucred->cr_uid;
3004		PROC_UNLOCK(p);
3005		fdp = fdhold(p);
3006		if (fdp == NULL)
3007			continue;
3008		FILEDESC_SLOCK(fdp);
3009		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
3010			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
3011				continue;
3012			xf.xf_fd = n;
3013			xf.xf_file = fp;
3014			xf.xf_data = fp->f_data;
3015			xf.xf_vnode = fp->f_vnode;
3016			xf.xf_type = fp->f_type;
3017			xf.xf_count = fp->f_count;
3018			xf.xf_msgcount = 0;
3019			xf.xf_offset = foffset_get(fp);
3020			xf.xf_flag = fp->f_flag;
3021			error = SYSCTL_OUT(req, &xf, sizeof(xf));
3022			if (error)
3023				break;
3024		}
3025		FILEDESC_SUNLOCK(fdp);
3026		fddrop(fdp);
3027		if (error)
3028			break;
3029	}
3030	sx_sunlock(&allproc_lock);
3031	return (error);
3032}
3033
/*
 * Register sysctl_kern_file() as the handler for KERN_FILE under _kern:
 * an opaque, read-only, MPSAFE node described as "Entire file table".
 */
3034SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
3035    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
3036
/*
 * When KINFO_OFILE_SIZE is defined, fail the build unless struct
 * kinfo_ofile has exactly that size.
 */
3037#ifdef KINFO_OFILE_SIZE
3038CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
3039#endif
3040
3041#ifdef COMPAT_FREEBSD7
3042static int
3043export_vnode_for_osysctl(struct vnode *vp, int type,
3044    struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
3045{
3046	int error;
3047	char *fullpath, *freepath;
3048
3049	bzero(kif, sizeof(*kif));
3050	kif->kf_structsize = sizeof(*kif);
3051
3052	vref(vp);
3053	kif->kf_fd = type;
3054	kif->kf_type = KF_TYPE_VNODE;
3055	/* This function only handles directories. */
3056	if (vp->v_type != VDIR) {
3057		vrele(vp);
3058		return (ENOTDIR);
3059	}
3060	kif->kf_vnode_type = KF_VTYPE_VDIR;
3061
3062	/*
3063	 * This is not a true file descriptor, so we set a bogus refcount
3064	 * and offset to indicate these fields should be ignored.
3065	 */
3066	kif->kf_ref_count = -1;
3067	kif->kf_offset = -1;
3068
3069	freepath = NULL;
3070	fullpath = "-";
3071	FILEDESC_SUNLOCK(fdp);
3072	vn_fullpath(curthread, vp, &fullpath, &freepath);
3073	vrele(vp);
3074	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
3075	if (freepath != NULL)
3076		free(freepath, M_TEMP);
3077	error = SYSCTL_OUT(req, kif, sizeof(*kif));
3078	FILEDESC_SLOCK(fdp);
3079	return (error);
3080}
3081
3082/*
3083 * Get per-process file descriptors for use by procstat(1), et al.
3084 */
3085static int
3086sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
3087{
3088	char *fullpath, *freepath;
3089	struct kinfo_ofile *kif;
3090	struct filedesc *fdp;
3091	int error, i, *name;
3092	struct shmfd *shmfd;
3093	struct socket *so;
3094	struct vnode *vp;
3095	struct ksem *ks;
3096	struct file *fp;
3097	struct proc *p;
3098	struct tty *tp;
3099
3100	name = (int *)arg1;
3101	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
3102	if (error != 0)
3103		return (error);
3104	fdp = fdhold(p);
3105	PROC_UNLOCK(p);
3106	if (fdp == NULL)
3107		return (ENOENT);
3108	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
3109	FILEDESC_SLOCK(fdp);
3110	if (fdp->fd_cdir != NULL)
3111		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
3112				fdp, req);
3113	if (fdp->fd_rdir != NULL)
3114		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
3115				fdp, req);
3116	if (fdp->fd_jdir != NULL)
3117		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
3118				fdp, req);
3119	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
3120		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3121			continue;
3122		bzero(kif, sizeof(*kif));
3123		kif->kf_structsize = sizeof(*kif);
3124		ks = NULL;
3125		vp = NULL;
3126		so = NULL;
3127		tp = NULL;
3128		shmfd = NULL;
3129		kif->kf_fd = i;
3130
3131		switch (fp->f_type) {
3132		case DTYPE_VNODE:
3133			kif->kf_type = KF_TYPE_VNODE;
3134			vp = fp->f_vnode;
3135			break;
3136
3137		case DTYPE_SOCKET:
3138			kif->kf_type = KF_TYPE_SOCKET;
3139			so = fp->f_data;
3140			break;
3141
3142		case DTYPE_PIPE:
3143			kif->kf_type = KF_TYPE_PIPE;
3144			break;
3145
3146		case DTYPE_FIFO:
3147			kif->kf_type = KF_TYPE_FIFO;
3148			vp = fp->f_vnode;
3149			break;
3150
3151		case DTYPE_KQUEUE:
3152			kif->kf_type = KF_TYPE_KQUEUE;
3153			break;
3154
3155		case DTYPE_CRYPTO:
3156			kif->kf_type = KF_TYPE_CRYPTO;
3157			break;
3158
3159		case DTYPE_MQUEUE:
3160			kif->kf_type = KF_TYPE_MQUEUE;
3161			break;
3162
3163		case DTYPE_SHM:
3164			kif->kf_type = KF_TYPE_SHM;
3165			shmfd = fp->f_data;
3166			break;
3167
3168		case DTYPE_SEM:
3169			kif->kf_type = KF_TYPE_SEM;
3170			ks = fp->f_data;
3171			break;
3172
3173		case DTYPE_PTS:
3174			kif->kf_type = KF_TYPE_PTS;
3175			tp = fp->f_data;
3176			break;
3177
3178#ifdef PROCDESC
3179		case DTYPE_PROCDESC:
3180			kif->kf_type = KF_TYPE_PROCDESC;
3181			break;
3182#endif
3183
3184		default:
3185			kif->kf_type = KF_TYPE_UNKNOWN;
3186			break;
3187		}
3188		kif->kf_ref_count = fp->f_count;
3189		if (fp->f_flag & FREAD)
3190			kif->kf_flags |= KF_FLAG_READ;
3191		if (fp->f_flag & FWRITE)
3192			kif->kf_flags |= KF_FLAG_WRITE;
3193		if (fp->f_flag & FAPPEND)
3194			kif->kf_flags |= KF_FLAG_APPEND;
3195		if (fp->f_flag & FASYNC)
3196			kif->kf_flags |= KF_FLAG_ASYNC;
3197		if (fp->f_flag & FFSYNC)
3198			kif->kf_flags |= KF_FLAG_FSYNC;
3199		if (fp->f_flag & FNONBLOCK)
3200			kif->kf_flags |= KF_FLAG_NONBLOCK;
3201		if (fp->f_flag & O_DIRECT)
3202			kif->kf_flags |= KF_FLAG_DIRECT;
3203		if (fp->f_flag & FHASLOCK)
3204			kif->kf_flags |= KF_FLAG_HASLOCK;
3205		kif->kf_offset = foffset_get(fp);
3206		if (vp != NULL) {
3207			vref(vp);
3208			switch (vp->v_type) {
3209			case VNON:
3210				kif->kf_vnode_type = KF_VTYPE_VNON;
3211				break;
3212			case VREG:
3213				kif->kf_vnode_type = KF_VTYPE_VREG;
3214				break;
3215			case VDIR:
3216				kif->kf_vnode_type = KF_VTYPE_VDIR;
3217				break;
3218			case VBLK:
3219				kif->kf_vnode_type = KF_VTYPE_VBLK;
3220				break;
3221			case VCHR:
3222				kif->kf_vnode_type = KF_VTYPE_VCHR;
3223				break;
3224			case VLNK:
3225				kif->kf_vnode_type = KF_VTYPE_VLNK;
3226				break;
3227			case VSOCK:
3228				kif->kf_vnode_type = KF_VTYPE_VSOCK;
3229				break;
3230			case VFIFO:
3231				kif->kf_vnode_type = KF_VTYPE_VFIFO;
3232				break;
3233			case VBAD:
3234				kif->kf_vnode_type = KF_VTYPE_VBAD;
3235				break;
3236			default:
3237				kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
3238				break;
3239			}
3240			/*
3241			 * It is OK to drop the filedesc lock here as we will
3242			 * re-validate and re-evaluate its properties when
3243			 * the loop continues.
3244			 */
3245			freepath = NULL;
3246			fullpath = "-";
3247			FILEDESC_SUNLOCK(fdp);
3248			vn_fullpath(curthread, vp, &fullpath, &freepath);
3249			vrele(vp);
3250			strlcpy(kif->kf_path, fullpath,
3251			    sizeof(kif->kf_path));
3252			if (freepath != NULL)
3253				free(freepath, M_TEMP);
3254			FILEDESC_SLOCK(fdp);
3255		}
3256		if (so != NULL) {
3257			struct sockaddr *sa;
3258
3259			if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
3260			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3261				bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3262				free(sa, M_SONAME);
3263			}
3264			if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
3265			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3266				bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3267				free(sa, M_SONAME);
3268			}
3269			kif->kf_sock_domain =
3270			    so->so_proto->pr_domain->dom_family;
3271			kif->kf_sock_type = so->so_type;
3272			kif->kf_sock_protocol = so->so_proto->pr_protocol;
3273		}
3274		if (tp != NULL) {
3275			strlcpy(kif->kf_path, tty_devname(tp),
3276			    sizeof(kif->kf_path));
3277		}
3278		if (shmfd != NULL)
3279			shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
3280		if (ks != NULL && ksem_info != NULL)
3281			ksem_info(ks, kif->kf_path, sizeof(kif->kf_path), NULL);
3282		error = SYSCTL_OUT(req, kif, sizeof(*kif));
3283		if (error)
3284			break;
3285	}
3286	FILEDESC_SUNLOCK(fdp);
3287	fddrop(fdp);
3288	free(kif, M_TEMP);
3289	return (0);
3290}
3291
3292static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
3293    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
3294    "Process ofiledesc entries");
3295#endif	/* COMPAT_FREEBSD7 */
3296
#if defined(KINFO_FILE_SIZE)
/* Fail the build if struct kinfo_file drifts from its declared size. */
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
3300
3301struct export_fd_buf {
3302	struct filedesc		*fdp;
3303	struct sbuf 		*sb;
3304	ssize_t			remainder;
3305	struct kinfo_file	kif;
3306};
3307
3308static int
3309export_fd_to_sb(void *data, int type, int fd, int fflags, int refcnt,
3310    int64_t offset, cap_rights_t *rightsp, struct export_fd_buf *efbuf)
3311{
3312	struct {
3313		int	fflag;
3314		int	kf_fflag;
3315	} fflags_table[] = {
3316		{ FAPPEND, KF_FLAG_APPEND },
3317		{ FASYNC, KF_FLAG_ASYNC },
3318		{ FFSYNC, KF_FLAG_FSYNC },
3319		{ FHASLOCK, KF_FLAG_HASLOCK },
3320		{ FNONBLOCK, KF_FLAG_NONBLOCK },
3321		{ FREAD, KF_FLAG_READ },
3322		{ FWRITE, KF_FLAG_WRITE },
3323		{ O_CREAT, KF_FLAG_CREAT },
3324		{ O_DIRECT, KF_FLAG_DIRECT },
3325		{ O_EXCL, KF_FLAG_EXCL },
3326		{ O_EXEC, KF_FLAG_EXEC },
3327		{ O_EXLOCK, KF_FLAG_EXLOCK },
3328		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
3329		{ O_SHLOCK, KF_FLAG_SHLOCK },
3330		{ O_TRUNC, KF_FLAG_TRUNC }
3331	};
3332#define	NFFLAGS	(sizeof(fflags_table) / sizeof(*fflags_table))
3333	struct kinfo_file *kif;
3334	struct vnode *vp;
3335	int error, locked;
3336	unsigned int i;
3337
3338	if (efbuf->remainder == 0)
3339		return (0);
3340	kif = &efbuf->kif;
3341	bzero(kif, sizeof(*kif));
3342	locked = efbuf->fdp != NULL;
3343	switch (type) {
3344	case KF_TYPE_FIFO:
3345	case KF_TYPE_VNODE:
3346		if (locked) {
3347			FILEDESC_SUNLOCK(efbuf->fdp);
3348			locked = 0;
3349		}
3350		vp = (struct vnode *)data;
3351		error = fill_vnode_info(vp, kif);
3352		vrele(vp);
3353		break;
3354	case KF_TYPE_SOCKET:
3355		error = fill_socket_info((struct socket *)data, kif);
3356		break;
3357	case KF_TYPE_PIPE:
3358		error = fill_pipe_info((struct pipe *)data, kif);
3359		break;
3360	case KF_TYPE_PTS:
3361		error = fill_pts_info((struct tty *)data, kif);
3362		break;
3363	case KF_TYPE_PROCDESC:
3364		error = fill_procdesc_info((struct procdesc *)data, kif);
3365		break;
3366	case KF_TYPE_SEM:
3367		error = fill_sem_info((struct file *)data, kif);
3368		break;
3369	case KF_TYPE_SHM:
3370		error = fill_shm_info((struct file *)data, kif);
3371		break;
3372	default:
3373		error = 0;
3374	}
3375	if (error == 0)
3376		kif->kf_status |= KF_ATTR_VALID;
3377
3378	/*
3379	 * Translate file access flags.
3380	 */
3381	for (i = 0; i < NFFLAGS; i++)
3382		if (fflags & fflags_table[i].fflag)
3383			kif->kf_flags |=  fflags_table[i].kf_fflag;
3384	if (rightsp != NULL)
3385		kif->kf_cap_rights = *rightsp;
3386	else
3387		cap_rights_init(&kif->kf_cap_rights);
3388	kif->kf_fd = fd;
3389	kif->kf_type = type;
3390	kif->kf_ref_count = refcnt;
3391	kif->kf_offset = offset;
3392	/* Pack record size down */
3393	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
3394	    strlen(kif->kf_path) + 1;
3395	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
3396	if (efbuf->remainder != -1) {
3397		if (efbuf->remainder < kif->kf_structsize) {
3398			/* Terminate export. */
3399			efbuf->remainder = 0;
3400			if (efbuf->fdp != NULL && !locked)
3401				FILEDESC_SLOCK(efbuf->fdp);
3402			return (0);
3403		}
3404		efbuf->remainder -= kif->kf_structsize;
3405	}
3406	if (locked)
3407		FILEDESC_SUNLOCK(efbuf->fdp);
3408	error = sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM;
3409	if (efbuf->fdp != NULL)
3410		FILEDESC_SLOCK(efbuf->fdp);
3411	return (error);
3412}
3413
3414/*
3415 * Store a process file descriptor information to sbuf.
3416 *
3417 * Takes a locked proc as argument, and returns with the proc unlocked.
3418 */
3419int
3420kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
3421{
3422	struct file *fp;
3423	struct filedesc *fdp;
3424	struct export_fd_buf *efbuf;
3425	struct vnode *cttyvp, *textvp, *tracevp;
3426	int64_t offset;
3427	void *data;
3428	int error, i;
3429	int type, refcnt, fflags;
3430	cap_rights_t rights;
3431
3432	PROC_LOCK_ASSERT(p, MA_OWNED);
3433
3434	/* ktrace vnode */
3435	tracevp = p->p_tracevp;
3436	if (tracevp != NULL)
3437		vref(tracevp);
3438	/* text vnode */
3439	textvp = p->p_textvp;
3440	if (textvp != NULL)
3441		vref(textvp);
3442	/* Controlling tty. */
3443	cttyvp = NULL;
3444	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
3445		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
3446		if (cttyvp != NULL)
3447			vref(cttyvp);
3448	}
3449	fdp = fdhold(p);
3450	PROC_UNLOCK(p);
3451	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
3452	efbuf->fdp = NULL;
3453	efbuf->sb = sb;
3454	efbuf->remainder = maxlen;
3455	if (tracevp != NULL)
3456		export_fd_to_sb(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
3457		    FREAD | FWRITE, -1, -1, NULL, efbuf);
3458	if (textvp != NULL)
3459		export_fd_to_sb(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
3460		    FREAD, -1, -1, NULL, efbuf);
3461	if (cttyvp != NULL)
3462		export_fd_to_sb(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
3463		    FREAD | FWRITE, -1, -1, NULL, efbuf);
3464	error = 0;
3465	if (fdp == NULL)
3466		goto fail;
3467	efbuf->fdp = fdp;
3468	FILEDESC_SLOCK(fdp);
3469	/* working directory */
3470	if (fdp->fd_cdir != NULL) {
3471		vref(fdp->fd_cdir);
3472		data = fdp->fd_cdir;
3473		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
3474		    FREAD, -1, -1, NULL, efbuf);
3475	}
3476	/* root directory */
3477	if (fdp->fd_rdir != NULL) {
3478		vref(fdp->fd_rdir);
3479		data = fdp->fd_rdir;
3480		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
3481		    FREAD, -1, -1, NULL, efbuf);
3482	}
3483	/* jail directory */
3484	if (fdp->fd_jdir != NULL) {
3485		vref(fdp->fd_jdir);
3486		data = fdp->fd_jdir;
3487		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
3488		    FREAD, -1, -1, NULL, efbuf);
3489	}
3490	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
3491		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3492			continue;
3493		data = NULL;
3494#ifdef CAPABILITIES
3495		rights = *cap_rights(fdp, i);
3496#else /* !CAPABILITIES */
3497		cap_rights_init(&rights);
3498#endif
3499		switch (fp->f_type) {
3500		case DTYPE_VNODE:
3501			type = KF_TYPE_VNODE;
3502			vref(fp->f_vnode);
3503			data = fp->f_vnode;
3504			break;
3505
3506		case DTYPE_SOCKET:
3507			type = KF_TYPE_SOCKET;
3508			data = fp->f_data;
3509			break;
3510
3511		case DTYPE_PIPE:
3512			type = KF_TYPE_PIPE;
3513			data = fp->f_data;
3514			break;
3515
3516		case DTYPE_FIFO:
3517			type = KF_TYPE_FIFO;
3518			vref(fp->f_vnode);
3519			data = fp->f_vnode;
3520			break;
3521
3522		case DTYPE_KQUEUE:
3523			type = KF_TYPE_KQUEUE;
3524			break;
3525
3526		case DTYPE_CRYPTO:
3527			type = KF_TYPE_CRYPTO;
3528			break;
3529
3530		case DTYPE_MQUEUE:
3531			type = KF_TYPE_MQUEUE;
3532			break;
3533
3534		case DTYPE_SHM:
3535			type = KF_TYPE_SHM;
3536			data = fp;
3537			break;
3538
3539		case DTYPE_SEM:
3540			type = KF_TYPE_SEM;
3541			data = fp;
3542			break;
3543
3544		case DTYPE_PTS:
3545			type = KF_TYPE_PTS;
3546			data = fp->f_data;
3547			break;
3548
3549#ifdef PROCDESC
3550		case DTYPE_PROCDESC:
3551			type = KF_TYPE_PROCDESC;
3552			data = fp->f_data;
3553			break;
3554#endif
3555
3556		default:
3557			type = KF_TYPE_UNKNOWN;
3558			break;
3559		}
3560		refcnt = fp->f_count;
3561		fflags = fp->f_flag;
3562		offset = foffset_get(fp);
3563
3564		/*
3565		 * Create sysctl entry.
3566		 * It is OK to drop the filedesc lock here as we will
3567		 * re-validate and re-evaluate its properties when
3568		 * the loop continues.
3569		 */
3570		error = export_fd_to_sb(data, type, i, fflags, refcnt,
3571		    offset, &rights, efbuf);
3572		if (error != 0)
3573			break;
3574	}
3575	FILEDESC_SUNLOCK(fdp);
3576	fddrop(fdp);
3577fail:
3578	free(efbuf, M_TEMP);
3579	return (error);
3580}
3581
/*
 * Length handed to sbuf_new_for_sysctl() by sysctl_kern_proc_filedesc():
 * five times the size of a full struct kinfo_file.
 */
#define FILEDESC_SBUF_SIZE	(5 * sizeof(struct kinfo_file))
3583
3584/*
3585 * Get per-process file descriptors for use by procstat(1), et al.
3586 */
3587static int
3588sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
3589{
3590	struct sbuf sb;
3591	struct proc *p;
3592	ssize_t maxlen;
3593	int error, error2, *name;
3594
3595	name = (int *)arg1;
3596
3597	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
3598	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
3599	if (error != 0) {
3600		sbuf_delete(&sb);
3601		return (error);
3602	}
3603	maxlen = req->oldptr != NULL ? req->oldlen : -1;
3604	error = kern_proc_filedesc_out(p, &sb, maxlen);
3605	error2 = sbuf_finish(&sb);
3606	sbuf_delete(&sb);
3607	return (error != 0 ? error : error2);
3608}
3609
3610int
3611vntype_to_kinfo(int vtype)
3612{
3613	struct {
3614		int	vtype;
3615		int	kf_vtype;
3616	} vtypes_table[] = {
3617		{ VBAD, KF_VTYPE_VBAD },
3618		{ VBLK, KF_VTYPE_VBLK },
3619		{ VCHR, KF_VTYPE_VCHR },
3620		{ VDIR, KF_VTYPE_VDIR },
3621		{ VFIFO, KF_VTYPE_VFIFO },
3622		{ VLNK, KF_VTYPE_VLNK },
3623		{ VNON, KF_VTYPE_VNON },
3624		{ VREG, KF_VTYPE_VREG },
3625		{ VSOCK, KF_VTYPE_VSOCK }
3626	};
3627#define	NVTYPES	(sizeof(vtypes_table) / sizeof(*vtypes_table))
3628	unsigned int i;
3629
3630	/*
3631	 * Perform vtype translation.
3632	 */
3633	for (i = 0; i < NVTYPES; i++)
3634		if (vtypes_table[i].vtype == vtype)
3635			break;
3636	if (i < NVTYPES)
3637		return (vtypes_table[i].kf_vtype);
3638
3639	return (KF_VTYPE_UNKNOWN);
3640}
3641
3642static int
3643fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
3644{
3645	struct vattr va;
3646	char *fullpath, *freepath;
3647	int error;
3648
3649	if (vp == NULL)
3650		return (1);
3651	kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
3652	freepath = NULL;
3653	fullpath = "-";
3654	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
3655	if (error == 0) {
3656		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
3657	}
3658	if (freepath != NULL)
3659		free(freepath, M_TEMP);
3660
3661	/*
3662	 * Retrieve vnode attributes.
3663	 */
3664	va.va_fsid = VNOVAL;
3665	va.va_rdev = NODEV;
3666	vn_lock(vp, LK_SHARED | LK_RETRY);
3667	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
3668	VOP_UNLOCK(vp, 0);
3669	if (error != 0)
3670		return (error);
3671	if (va.va_fsid != VNOVAL)
3672		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
3673	else
3674		kif->kf_un.kf_file.kf_file_fsid =
3675		    vp->v_mount->mnt_stat.f_fsid.val[0];
3676	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
3677	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
3678	kif->kf_un.kf_file.kf_file_size = va.va_size;
3679	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
3680	return (0);
3681}
3682
3683static int
3684fill_socket_info(struct socket *so, struct kinfo_file *kif)
3685{
3686	struct sockaddr *sa;
3687	struct inpcb *inpcb;
3688	struct unpcb *unpcb;
3689	int error;
3690
3691	if (so == NULL)
3692		return (1);
3693	kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
3694	kif->kf_sock_type = so->so_type;
3695	kif->kf_sock_protocol = so->so_proto->pr_protocol;
3696	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
3697	switch(kif->kf_sock_domain) {
3698	case AF_INET:
3699	case AF_INET6:
3700		if (kif->kf_sock_protocol == IPPROTO_TCP) {
3701			if (so->so_pcb != NULL) {
3702				inpcb = (struct inpcb *)(so->so_pcb);
3703				kif->kf_un.kf_sock.kf_sock_inpcb =
3704				    (uintptr_t)inpcb->inp_ppcb;
3705			}
3706		}
3707		break;
3708	case AF_UNIX:
3709		if (so->so_pcb != NULL) {
3710			unpcb = (struct unpcb *)(so->so_pcb);
3711			if (unpcb->unp_conn) {
3712				kif->kf_un.kf_sock.kf_sock_unpconn =
3713				    (uintptr_t)unpcb->unp_conn;
3714				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
3715				    so->so_rcv.sb_state;
3716				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
3717				    so->so_snd.sb_state;
3718			}
3719		}
3720		break;
3721	}
3722	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
3723	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3724		bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3725		free(sa, M_SONAME);
3726	}
3727	error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
3728	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3729		bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3730		free(sa, M_SONAME);
3731	}
3732	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
3733	    sizeof(kif->kf_path));
3734	return (0);
3735}
3736
3737static int
3738fill_pts_info(struct tty *tp, struct kinfo_file *kif)
3739{
3740
3741	if (tp == NULL)
3742		return (1);
3743	kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
3744	strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
3745	return (0);
3746}
3747
3748static int
3749fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
3750{
3751
3752	if (pi == NULL)
3753		return (1);
3754	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
3755	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
3756	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
3757	return (0);
3758}
3759
3760static int
3761fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
3762{
3763
3764	if (pdp == NULL)
3765		return (1);
3766	kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
3767	return (0);
3768}
3769
3770static int
3771fill_sem_info(struct file *fp, struct kinfo_file *kif)
3772{
3773	struct thread *td;
3774	struct stat sb;
3775
3776	td = curthread;
3777	if (fp->f_data == NULL)
3778		return (1);
3779	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3780		return (1);
3781	if (ksem_info == NULL)
3782		return (1);
3783	ksem_info(fp->f_data, kif->kf_path, sizeof(kif->kf_path),
3784	    &kif->kf_un.kf_sem.kf_sem_value);
3785	kif->kf_un.kf_sem.kf_sem_mode = sb.st_mode;
3786	return (0);
3787}
3788
3789static int
3790fill_shm_info(struct file *fp, struct kinfo_file *kif)
3791{
3792	struct thread *td;
3793	struct stat sb;
3794
3795	td = curthread;
3796	if (fp->f_data == NULL)
3797		return (1);
3798	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3799		return (1);
3800	shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path));
3801	kif->kf_un.kf_file.kf_file_mode = sb.st_mode;
3802	kif->kf_un.kf_file.kf_file_size = sb.st_size;
3803	return (0);
3804}
3805
3806static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
3807    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
3808    "Process filedesc entries");
3809
3810#ifdef DDB
3811/*
3812 * For the purposes of debugging, generate a human-readable string for the
3813 * file type.
3814 */
3815static const char *
3816file_type_to_name(short type)
3817{
3818
3819	switch (type) {
3820	case 0:
3821		return ("zero");
3822	case DTYPE_VNODE:
3823		return ("vnod");
3824	case DTYPE_SOCKET:
3825		return ("sock");
3826	case DTYPE_PIPE:
3827		return ("pipe");
3828	case DTYPE_FIFO:
3829		return ("fifo");
3830	case DTYPE_KQUEUE:
3831		return ("kque");
3832	case DTYPE_CRYPTO:
3833		return ("crpt");
3834	case DTYPE_MQUEUE:
3835		return ("mque");
3836	case DTYPE_SHM:
3837		return ("shm");
3838	case DTYPE_SEM:
3839		return ("ksem");
3840	default:
3841		return ("unkn");
3842	}
3843}
3844
3845/*
3846 * For the purposes of debugging, identify a process (if any, perhaps one of
3847 * many) that references the passed file in its file descriptor array. Return
3848 * NULL if none.
3849 */
3850static struct proc *
3851file_to_first_proc(struct file *fp)
3852{
3853	struct filedesc *fdp;
3854	struct proc *p;
3855	int n;
3856
3857	FOREACH_PROC_IN_SYSTEM(p) {
3858		if (p->p_state == PRS_NEW)
3859			continue;
3860		fdp = p->p_fd;
3861		if (fdp == NULL)
3862			continue;
3863		for (n = 0; n <= fdp->fd_lastfile; n++) {
3864			if (fp == fdp->fd_ofiles[n].fde_file)
3865				return (p);
3866		}
3867	}
3868	return (NULL);
3869}
3870
3871static void
3872db_print_file(struct file *fp, int header)
3873{
3874	struct proc *p;
3875
3876	if (header)
3877		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
3878		    "File", "Type", "Data", "Flag", "GCFl", "Count",
3879		    "MCount", "Vnode", "FPID", "FCmd");
3880	p = file_to_first_proc(fp);
3881	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
3882	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
3883	    0, fp->f_count, 0, fp->f_vnode,
3884	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
3885}
3886
3887DB_SHOW_COMMAND(file, db_show_file)
3888{
3889	struct file *fp;
3890
3891	if (!have_addr) {
3892		db_printf("usage: show file <addr>\n");
3893		return;
3894	}
3895	fp = (struct file *)addr;
3896	db_print_file(fp, 1);
3897}
3898
3899DB_SHOW_COMMAND(files, db_show_files)
3900{
3901	struct filedesc *fdp;
3902	struct file *fp;
3903	struct proc *p;
3904	int header;
3905	int n;
3906
3907	header = 1;
3908	FOREACH_PROC_IN_SYSTEM(p) {
3909		if (p->p_state == PRS_NEW)
3910			continue;
3911		if ((fdp = p->p_fd) == NULL)
3912			continue;
3913		for (n = 0; n <= fdp->fd_lastfile; ++n) {
3914			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
3915				continue;
3916			db_print_file(fp, header);
3917			header = 0;
3918		}
3919	}
3920}
3921#endif
3922
3923SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
3924    &maxfilesperproc, 0, "Maximum files allowed open per process");
3925
3926SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
3927    &maxfiles, 0, "Maximum number of files");
3928
3929SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
3930    __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
3931
3932/* ARGSUSED*/
3933static void
3934filelistinit(void *dummy)
3935{
3936
3937	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
3938	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
3939	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
3940	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
3941}
3942SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
3943
3944/*-------------------------------------------------------------------*/
3945
/* Read/write handler for badfileops: always fails with EBADF. */
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	return (EBADF);
}
3953
/* Truncate handler for badfileops: always fails with EINVAL. */
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{
	return (EINVAL);
}
3961
3962static int
3963badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
3964    struct thread *td)
3965{
3966
3967	return (EBADF);
3968}
3969
/* Poll handler for badfileops: always returns 0. */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	return (0);
}
3977
/* Kqueue filter handler for badfileops: always fails with EBADF. */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{
	return (EBADF);
}
3984
/* Stat handler for badfileops: always fails with EBADF. */
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{
	return (EBADF);
}
3992
/* Close handler for badfileops: always fails with EBADF. */
static int
badfo_close(struct file *fp, struct thread *td)
{
	return (EBADF);
}
3998}
3999
/* Chmod handler for badfileops: always fails with EBADF. */
static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	return (EBADF);
}
4007
/* Chown handler for badfileops: always fails with EBADF. */
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	return (EBADF);
}
4015
/* Sendfile handler for badfileops: always fails with EBADF. */
static int
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct thread *td)
{
	return (EBADF);
}
4024
4025struct fileops badfileops = {
4026	.fo_read = badfo_readwrite,
4027	.fo_write = badfo_readwrite,
4028	.fo_truncate = badfo_truncate,
4029	.fo_ioctl = badfo_ioctl,
4030	.fo_poll = badfo_poll,
4031	.fo_kqfilter = badfo_kqfilter,
4032	.fo_stat = badfo_stat,
4033	.fo_close = badfo_close,
4034	.fo_chmod = badfo_chmod,
4035	.fo_chown = badfo_chown,
4036	.fo_sendfile = badfo_sendfile,
4037};
4038
/* Chmod handler that always fails with EINVAL. */
int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	return (EINVAL);
}
4046
/* Chown handler that always fails with EINVAL. */
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	return (EINVAL);
}
4054
/* Sendfile handler that always fails with EINVAL. */
int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct thread *td)
{
	return (EINVAL);
}
4063
4064/*-------------------------------------------------------------------*/
4065
4066/*
4067 * File Descriptor pseudo-device driver (/dev/fd/).
4068 *
4069 * Opening minor device N dup()s the file (if any) connected to file
4070 * descriptor N belonging to the calling process.  Note that this driver
4071 * consists of only the ``open()'' routine, because all subsequent
4072 * references to this file will be direct to the other driver.
4073 *
4074 * XXX: we could give this one a cloning event handler if necessary.
4075 */
4076
4077/* ARGSUSED */
4078static int
4079fdopen(struct cdev *dev, int mode, int type, struct thread *td)
4080{
4081
4082	/*
4083	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
4084	 * the file descriptor being sought for duplication. The error
4085	 * return ensures that the vnode for this device will be released
4086	 * by vn_open. Open will detect this special error and take the
4087	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
4088	 * will simply report the error.
4089	 */
4090	td->td_dupfd = dev2unit(dev);
4091	return (ENODEV);
4092}
4093
4094static struct cdevsw fildesc_cdevsw = {
4095	.d_version =	D_VERSION,
4096	.d_open =	fdopen,
4097	.d_name =	"FD",
4098};
4099
4100static void
4101fildesc_drvinit(void *unused)
4102{
4103	struct cdev *dev;
4104
4105	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
4106	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
4107	make_dev_alias(dev, "stdin");
4108	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
4109	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
4110	make_dev_alias(dev, "stdout");
4111	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
4112	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
4113	make_dev_alias(dev, "stderr");
4114}
4115
4116SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
4117