kern_descrip.c revision 272246
1/*-
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: stable/10/sys/kern/kern_descrip.c 272246 2014-09-28 11:08:32Z kib $");
39
40#include "opt_capsicum.h"
41#include "opt_compat.h"
42#include "opt_ddb.h"
43#include "opt_ktrace.h"
44#include "opt_procdesc.h"
45
46#include <sys/param.h>
47#include <sys/systm.h>
48
49#include <sys/capability.h>
50#include <sys/conf.h>
51#include <sys/domain.h>
52#include <sys/fcntl.h>
53#include <sys/file.h>
54#include <sys/filedesc.h>
55#include <sys/filio.h>
56#include <sys/jail.h>
57#include <sys/kernel.h>
58#include <sys/ksem.h>
59#include <sys/limits.h>
60#include <sys/lock.h>
61#include <sys/malloc.h>
62#include <sys/mman.h>
63#include <sys/mount.h>
64#include <sys/mqueue.h>
65#include <sys/mutex.h>
66#include <sys/namei.h>
67#include <sys/selinfo.h>
68#include <sys/pipe.h>
69#include <sys/priv.h>
70#include <sys/proc.h>
71#include <sys/procdesc.h>
72#include <sys/protosw.h>
73#include <sys/racct.h>
74#include <sys/resourcevar.h>
75#include <sys/sbuf.h>
76#include <sys/signalvar.h>
77#include <sys/socketvar.h>
78#include <sys/stat.h>
79#include <sys/sx.h>
80#include <sys/syscallsubr.h>
81#include <sys/sysctl.h>
82#include <sys/sysproto.h>
83#include <sys/tty.h>
84#include <sys/unistd.h>
85#include <sys/un.h>
86#include <sys/unpcb.h>
87#include <sys/user.h>
88#include <sys/vnode.h>
89#ifdef KTRACE
90#include <sys/ktrace.h>
91#endif
92
93#include <net/vnet.h>
94
95#include <netinet/in.h>
96#include <netinet/in_pcb.h>
97
98#include <security/audit/audit.h>
99
100#include <vm/uma.h>
101#include <vm/vm.h>
102
103#include <ddb/ddb.h>
104
105static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
106static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
107    "file desc to leader structures");
108static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
109MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
110
111MALLOC_DECLARE(M_FADVISE);
112
113static uma_zone_t file_zone;
114
115void	(*ksem_info)(struct ksem *ks, char *path, size_t size, uint32_t *value);
116
/* File-local helpers: descriptor table and in-use bitmap management. */
static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
		    struct thread *td, int holdleaders);
static int	fd_first_free(struct filedesc *fdp, int low, int size);
static int	fd_last_used(struct filedesc *fdp, int size);
static void	fdgrowtable(struct filedesc *fdp, int nfd);
static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
static void	fdunused(struct filedesc *fdp, int fd);
static void	fdused(struct filedesc *fdp, int fd);
static int	getmaxfd(struct proc *p);

/* File-local helpers: per-file-type struct kinfo_file fillers. */
static int	fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
static int	fill_procdesc_info(struct procdesc *pdp,
		    struct kinfo_file *kif);
static int	fill_pts_info(struct tty *tp, struct kinfo_file *kif);
static int	fill_sem_info(struct file *fp, struct kinfo_file *kif);
static int	fill_shm_info(struct file *fp, struct kinfo_file *kif);
static int	fill_socket_info(struct socket *so, struct kinfo_file *kif);
static int	fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
134
135/*
136 * Each process has:
137 *
138 * - An array of open file descriptors (fd_ofiles)
139 * - An array of file flags (fd_ofileflags)
140 * - A bitmap recording which descriptors are in use (fd_map)
141 *
 * A process starts out with NDFILE descriptors.  The value of NDFILE has
 * been selected based on the historical limit of 20 open files, and an
 * assumption that the majority of processes, especially short-lived
 * processes like shells, will never need more.
146 *
147 * If this initial allocation is exhausted, a larger descriptor table and
148 * map are allocated dynamically, and the pointers in the process's struct
149 * filedesc are updated to point to those.  This is repeated every time
150 * the process runs out of file descriptors (provided it hasn't hit its
151 * resource limit).
152 *
153 * Since threads may hold references to individual descriptor table
154 * entries, the tables are never freed.  Instead, they are placed on a
155 * linked list and freed only when the struct filedesc is released.
156 */
/* Number of descriptor slots embedded in the initial allocation. */
#define	NDFILE		20
/* Size in bytes of one word of the in-use bitmap. */
#define	NDSLOTSIZE	sizeof(NDSLOTTYPE)
/* Number of descriptors tracked by one bitmap word. */
#define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
/* Index of the bitmap word holding descriptor x. */
#define	NDSLOT(x)	((x) / NDENTRIES)
/* Mask selecting descriptor x inside its bitmap word. */
#define	NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
/* Number of bitmap words needed to track x descriptors (rounded up). */
#define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
163
164/*
165 * SLIST entry used to keep track of ofiles which must be reclaimed when
166 * the process exits.
167 */
168struct freetable {
169	struct filedescent *ft_table;
170	SLIST_ENTRY(freetable) ft_next;
171};
172
173/*
174 * Initial allocation: a filedesc structure + the head of SLIST used to
175 * keep track of old ofiles + enough space for NDFILE descriptors.
176 */
177struct filedesc0 {
178	struct filedesc fd_fd;
179	SLIST_HEAD(, freetable) fd_free;
180	struct	filedescent fd_dfiles[NDFILE];
181	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
182};
183
184/*
185 * Descriptor management.
186 */
187volatile int openfiles;			/* actual number of open files */
188struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
189void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
190
191/* A mutex to protect the association between a proc and filedesc. */
192static struct mtx fdesc_mtx;
193
194/*
195 * If low >= size, just return low. Otherwise find the first zero bit in the
196 * given bitmap, starting at low and not exceeding size - 1. Return size if
197 * not found.
198 */
199static int
200fd_first_free(struct filedesc *fdp, int low, int size)
201{
202	NDSLOTTYPE *map = fdp->fd_map;
203	NDSLOTTYPE mask;
204	int off, maxoff;
205
206	if (low >= size)
207		return (low);
208
209	off = NDSLOT(low);
210	if (low % NDENTRIES) {
211		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
212		if ((mask &= ~map[off]) != 0UL)
213			return (off * NDENTRIES + ffsl(mask) - 1);
214		++off;
215	}
216	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
217		if (map[off] != ~0UL)
218			return (off * NDENTRIES + ffsl(~map[off]) - 1);
219	return (size);
220}
221
222/*
223 * Find the highest non-zero bit in the given bitmap, starting at 0 and
224 * not exceeding size - 1. Return -1 if not found.
225 */
226static int
227fd_last_used(struct filedesc *fdp, int size)
228{
229	NDSLOTTYPE *map = fdp->fd_map;
230	NDSLOTTYPE mask;
231	int off, minoff;
232
233	off = NDSLOT(size);
234	if (size % NDENTRIES) {
235		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
236		if ((mask &= map[off]) != 0)
237			return (off * NDENTRIES + flsl(mask) - 1);
238		--off;
239	}
240	for (minoff = NDSLOT(0); off >= minoff; --off)
241		if (map[off] != 0)
242			return (off * NDENTRIES + flsl(map[off]) - 1);
243	return (-1);
244}
245
246static int
247fdisused(struct filedesc *fdp, int fd)
248{
249
250	FILEDESC_LOCK_ASSERT(fdp);
251
252	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
253	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
254
255	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
256}
257
258/*
259 * Mark a file descriptor as used.
260 */
261static void
262fdused(struct filedesc *fdp, int fd)
263{
264
265	FILEDESC_XLOCK_ASSERT(fdp);
266
267	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
268
269	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
270	if (fd > fdp->fd_lastfile)
271		fdp->fd_lastfile = fd;
272	if (fd == fdp->fd_freefile)
273		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
274}
275
276/*
277 * Mark a file descriptor as unused.
278 */
279static void
280fdunused(struct filedesc *fdp, int fd)
281{
282
283	FILEDESC_XLOCK_ASSERT(fdp);
284
285	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
286	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
287	    ("fd=%d is still in use", fd));
288
289	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
290	if (fd < fdp->fd_freefile)
291		fdp->fd_freefile = fd;
292	if (fd == fdp->fd_lastfile)
293		fdp->fd_lastfile = fd_last_used(fdp, fd);
294}
295
296/*
297 * Free a file descriptor.
298 *
299 * Avoid some work if fdp is about to be destroyed.
300 */
301static inline void
302_fdfree(struct filedesc *fdp, int fd, int last)
303{
304	struct filedescent *fde;
305
306	fde = &fdp->fd_ofiles[fd];
307	filecaps_free(&fde->fde_caps);
308	if (last)
309		return;
310	bzero(fde, sizeof(*fde));
311	fdunused(fdp, fd);
312}
313
/*
 * Free descriptor fd in a table that stays alive: the entry is cleared and
 * the bitmap updated (the last == 0 case of _fdfree()).
 */
static inline void
fdfree(struct filedesc *fdp, int fd)
{
	const int last = 0;

	_fdfree(fdp, fd, last);
}
320
/*
 * Free descriptor fd in a table that is about to be destroyed: only the
 * capability data is released (the last == 1 case of _fdfree()).
 */
static inline void
fdfree_last(struct filedesc *fdp, int fd)
{
	const int last = 1;

	_fdfree(fdp, fd, last);
}
327
328/*
329 * System calls on descriptors.
330 */
331#ifndef _SYS_SYSPROTO_H_
332struct getdtablesize_args {
333	int	dummy;
334};
335#endif
336/* ARGSUSED */
337int
338sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
339{
340	struct proc *p = td->td_proc;
341	uint64_t lim;
342
343	PROC_LOCK(p);
344	td->td_retval[0] =
345	    min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
346	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
347	PROC_UNLOCK(p);
348	if (lim < td->td_retval[0])
349		td->td_retval[0] = lim;
350	return (0);
351}
352
353/*
354 * Duplicate a file descriptor to a particular value.
355 *
356 * Note: keep in mind that a potential race condition exists when closing
357 * descriptors from a shared descriptor table (via rfork).
358 */
359#ifndef _SYS_SYSPROTO_H_
360struct dup2_args {
361	u_int	from;
362	u_int	to;
363};
364#endif
365/* ARGSUSED */
366int
367sys_dup2(struct thread *td, struct dup2_args *uap)
368{
369
370	return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
371		    td->td_retval));
372}
373
374/*
375 * Duplicate a file descriptor.
376 */
377#ifndef _SYS_SYSPROTO_H_
378struct dup_args {
379	u_int	fd;
380};
381#endif
382/* ARGSUSED */
383int
384sys_dup(struct thread *td, struct dup_args *uap)
385{
386
387	return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
388}
389
/*
 * fcntl system call entry point.  Unpacks the syscall arguments and hands
 * them to kern_fcntl_freebsd(), whose result is returned unchanged.
 */
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
	int	fd;
	int	cmd;
	long	arg;
};
#endif
/* ARGSUSED */
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{
	int error;

	error = kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg);
	return (error);
}
407
408int
409kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
410{
411	struct flock fl;
412	struct __oflock ofl;
413	intptr_t arg1;
414	int error;
415
416	error = 0;
417	switch (cmd) {
418	case F_OGETLK:
419	case F_OSETLK:
420	case F_OSETLKW:
421		/*
422		 * Convert old flock structure to new.
423		 */
424		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
425		fl.l_start = ofl.l_start;
426		fl.l_len = ofl.l_len;
427		fl.l_pid = ofl.l_pid;
428		fl.l_type = ofl.l_type;
429		fl.l_whence = ofl.l_whence;
430		fl.l_sysid = 0;
431
432		switch (cmd) {
433		case F_OGETLK:
434		    cmd = F_GETLK;
435		    break;
436		case F_OSETLK:
437		    cmd = F_SETLK;
438		    break;
439		case F_OSETLKW:
440		    cmd = F_SETLKW;
441		    break;
442		}
443		arg1 = (intptr_t)&fl;
444		break;
445        case F_GETLK:
446        case F_SETLK:
447        case F_SETLKW:
448	case F_SETLK_REMOTE:
449                error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
450                arg1 = (intptr_t)&fl;
451                break;
452	default:
453		arg1 = arg;
454		break;
455	}
456	if (error)
457		return (error);
458	error = kern_fcntl(td, fd, cmd, arg1);
459	if (error)
460		return (error);
461	if (cmd == F_OGETLK) {
462		ofl.l_start = fl.l_start;
463		ofl.l_len = fl.l_len;
464		ofl.l_pid = fl.l_pid;
465		ofl.l_type = fl.l_type;
466		ofl.l_whence = fl.l_whence;
467		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
468	} else if (cmd == F_GETLK) {
469		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
470	}
471	return (error);
472}
473
474int
475kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
476{
477	struct filedesc *fdp;
478	struct flock *flp;
479	struct file *fp, *fp2;
480	struct filedescent *fde;
481	struct proc *p;
482	struct vnode *vp;
483	cap_rights_t rights;
484	int error, flg, tmp;
485	uint64_t bsize;
486	off_t foffset;
487
488	error = 0;
489	flg = F_POSIX;
490	p = td->td_proc;
491	fdp = p->p_fd;
492
493	switch (cmd) {
494	case F_DUPFD:
495		tmp = arg;
496		error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
497		break;
498
499	case F_DUPFD_CLOEXEC:
500		tmp = arg;
501		error = do_dup(td, DUP_FCNTL | DUP_CLOEXEC, fd, tmp,
502		    td->td_retval);
503		break;
504
505	case F_DUP2FD:
506		tmp = arg;
507		error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
508		break;
509
510	case F_DUP2FD_CLOEXEC:
511		tmp = arg;
512		error = do_dup(td, DUP_FIXED | DUP_CLOEXEC, fd, tmp,
513		    td->td_retval);
514		break;
515
516	case F_GETFD:
517		FILEDESC_SLOCK(fdp);
518		if ((fp = fget_locked(fdp, fd)) == NULL) {
519			FILEDESC_SUNLOCK(fdp);
520			error = EBADF;
521			break;
522		}
523		fde = &fdp->fd_ofiles[fd];
524		td->td_retval[0] =
525		    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
526		FILEDESC_SUNLOCK(fdp);
527		break;
528
529	case F_SETFD:
530		FILEDESC_XLOCK(fdp);
531		if ((fp = fget_locked(fdp, fd)) == NULL) {
532			FILEDESC_XUNLOCK(fdp);
533			error = EBADF;
534			break;
535		}
536		fde = &fdp->fd_ofiles[fd];
537		fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
538		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
539		FILEDESC_XUNLOCK(fdp);
540		break;
541
542	case F_GETFL:
543		error = fget_unlocked(fdp, fd,
544		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
545		if (error != 0)
546			break;
547		td->td_retval[0] = OFLAGS(fp->f_flag);
548		fdrop(fp, td);
549		break;
550
551	case F_SETFL:
552		error = fget_unlocked(fdp, fd,
553		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
554		if (error != 0)
555			break;
556		do {
557			tmp = flg = fp->f_flag;
558			tmp &= ~FCNTLFLAGS;
559			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
560		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
561		tmp = fp->f_flag & FNONBLOCK;
562		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
563		if (error != 0) {
564			fdrop(fp, td);
565			break;
566		}
567		tmp = fp->f_flag & FASYNC;
568		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
569		if (error == 0) {
570			fdrop(fp, td);
571			break;
572		}
573		atomic_clear_int(&fp->f_flag, FNONBLOCK);
574		tmp = 0;
575		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
576		fdrop(fp, td);
577		break;
578
579	case F_GETOWN:
580		error = fget_unlocked(fdp, fd,
581		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
582		if (error != 0)
583			break;
584		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
585		if (error == 0)
586			td->td_retval[0] = tmp;
587		fdrop(fp, td);
588		break;
589
590	case F_SETOWN:
591		error = fget_unlocked(fdp, fd,
592		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
593		if (error != 0)
594			break;
595		tmp = arg;
596		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
597		fdrop(fp, td);
598		break;
599
600	case F_SETLK_REMOTE:
601		error = priv_check(td, PRIV_NFS_LOCKD);
602		if (error)
603			return (error);
604		flg = F_REMOTE;
605		goto do_setlk;
606
607	case F_SETLKW:
608		flg |= F_WAIT;
609		/* FALLTHROUGH F_SETLK */
610
611	case F_SETLK:
612	do_setlk:
613		cap_rights_init(&rights, CAP_FLOCK);
614		error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
615		if (error != 0)
616			break;
617		if (fp->f_type != DTYPE_VNODE) {
618			error = EBADF;
619			fdrop(fp, td);
620			break;
621		}
622
623		flp = (struct flock *)arg;
624		if (flp->l_whence == SEEK_CUR) {
625			foffset = foffset_get(fp);
626			if (foffset < 0 ||
627			    (flp->l_start > 0 &&
628			     foffset > OFF_MAX - flp->l_start)) {
629				FILEDESC_SUNLOCK(fdp);
630				error = EOVERFLOW;
631				fdrop(fp, td);
632				break;
633			}
634			flp->l_start += foffset;
635		}
636
637		vp = fp->f_vnode;
638		switch (flp->l_type) {
639		case F_RDLCK:
640			if ((fp->f_flag & FREAD) == 0) {
641				error = EBADF;
642				break;
643			}
644			PROC_LOCK(p->p_leader);
645			p->p_leader->p_flag |= P_ADVLOCK;
646			PROC_UNLOCK(p->p_leader);
647			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
648			    flp, flg);
649			break;
650		case F_WRLCK:
651			if ((fp->f_flag & FWRITE) == 0) {
652				error = EBADF;
653				break;
654			}
655			PROC_LOCK(p->p_leader);
656			p->p_leader->p_flag |= P_ADVLOCK;
657			PROC_UNLOCK(p->p_leader);
658			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
659			    flp, flg);
660			break;
661		case F_UNLCK:
662			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
663			    flp, flg);
664			break;
665		case F_UNLCKSYS:
666			/*
667			 * Temporary api for testing remote lock
668			 * infrastructure.
669			 */
670			if (flg != F_REMOTE) {
671				error = EINVAL;
672				break;
673			}
674			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
675			    F_UNLCKSYS, flp, flg);
676			break;
677		default:
678			error = EINVAL;
679			break;
680		}
681		if (error != 0 || flp->l_type == F_UNLCK ||
682		    flp->l_type == F_UNLCKSYS) {
683			fdrop(fp, td);
684			break;
685		}
686
687		/*
688		 * Check for a race with close.
689		 *
690		 * The vnode is now advisory locked (or unlocked, but this case
691		 * is not really important) as the caller requested.
692		 * We had to drop the filedesc lock, so we need to recheck if
693		 * the descriptor is still valid, because if it was closed
694		 * in the meantime we need to remove advisory lock from the
695		 * vnode - close on any descriptor leading to an advisory
696		 * locked vnode, removes that lock.
697		 * We will return 0 on purpose in that case, as the result of
698		 * successful advisory lock might have been externally visible
699		 * already. This is fine - effectively we pretend to the caller
700		 * that the closing thread was a bit slower and that the
701		 * advisory lock succeeded before the close.
702		 */
703		error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
704		if (error != 0) {
705			fdrop(fp, td);
706			break;
707		}
708		if (fp != fp2) {
709			flp->l_whence = SEEK_SET;
710			flp->l_start = 0;
711			flp->l_len = 0;
712			flp->l_type = F_UNLCK;
713			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
714			    F_UNLCK, flp, F_POSIX);
715		}
716		fdrop(fp, td);
717		fdrop(fp2, td);
718		break;
719
720	case F_GETLK:
721		error = fget_unlocked(fdp, fd,
722		    cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
723		if (error != 0)
724			break;
725		if (fp->f_type != DTYPE_VNODE) {
726			error = EBADF;
727			fdrop(fp, td);
728			break;
729		}
730		flp = (struct flock *)arg;
731		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
732		    flp->l_type != F_UNLCK) {
733			error = EINVAL;
734			fdrop(fp, td);
735			break;
736		}
737		if (flp->l_whence == SEEK_CUR) {
738			foffset = foffset_get(fp);
739			if ((flp->l_start > 0 &&
740			    foffset > OFF_MAX - flp->l_start) ||
741			    (flp->l_start < 0 &&
742			     foffset < OFF_MIN - flp->l_start)) {
743				FILEDESC_SUNLOCK(fdp);
744				error = EOVERFLOW;
745				fdrop(fp, td);
746				break;
747			}
748			flp->l_start += foffset;
749		}
750		vp = fp->f_vnode;
751		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
752		    F_POSIX);
753		fdrop(fp, td);
754		break;
755
756	case F_RDAHEAD:
757		arg = arg ? 128 * 1024: 0;
758		/* FALLTHROUGH */
759	case F_READAHEAD:
760		error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
761		if (error != 0)
762			break;
763		if (fp->f_type != DTYPE_VNODE) {
764			fdrop(fp, td);
765			error = EBADF;
766			break;
767		}
768		vp = fp->f_vnode;
769		/*
770		 * Exclusive lock synchronizes against f_seqcount reads and
771		 * writes in sequential_heuristic().
772		 */
773		error = vn_lock(vp, LK_EXCLUSIVE);
774		if (error != 0) {
775			fdrop(fp, td);
776			break;
777		}
778		if (arg >= 0) {
779			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
780			fp->f_seqcount = (arg + bsize - 1) / bsize;
781			atomic_set_int(&fp->f_flag, FRDAHEAD);
782		} else {
783			atomic_clear_int(&fp->f_flag, FRDAHEAD);
784		}
785		VOP_UNLOCK(vp, 0);
786		fdrop(fp, td);
787		break;
788
789	default:
790		error = EINVAL;
791		break;
792	}
793	return (error);
794}
795
796static int
797getmaxfd(struct proc *p)
798{
799	int maxfd;
800
801	PROC_LOCK(p);
802	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
803	PROC_UNLOCK(p);
804
805	return (maxfd);
806}
807
808/*
809 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
810 */
811int
812do_dup(struct thread *td, int flags, int old, int new,
813    register_t *retval)
814{
815	struct filedesc *fdp;
816	struct filedescent *oldfde, *newfde;
817	struct proc *p;
818	struct file *fp;
819	struct file *delfp;
820	int error, maxfd;
821
822	p = td->td_proc;
823	fdp = p->p_fd;
824
825	/*
826	 * Verify we have a valid descriptor to dup from and possibly to
827	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
828	 * return EINVAL when the new descriptor is out of bounds.
829	 */
830	if (old < 0)
831		return (EBADF);
832	if (new < 0)
833		return (flags & DUP_FCNTL ? EINVAL : EBADF);
834	maxfd = getmaxfd(p);
835	if (new >= maxfd)
836		return (flags & DUP_FCNTL ? EINVAL : EBADF);
837
838	FILEDESC_XLOCK(fdp);
839	if (fget_locked(fdp, old) == NULL) {
840		FILEDESC_XUNLOCK(fdp);
841		return (EBADF);
842	}
843	oldfde = &fdp->fd_ofiles[old];
844	if (flags & DUP_FIXED && old == new) {
845		*retval = new;
846		if (flags & DUP_CLOEXEC)
847			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
848		FILEDESC_XUNLOCK(fdp);
849		return (0);
850	}
851	fp = oldfde->fde_file;
852	fhold(fp);
853
854	/*
855	 * If the caller specified a file descriptor, make sure the file
856	 * table is large enough to hold it, and grab it.  Otherwise, just
857	 * allocate a new descriptor the usual way.
858	 */
859	if (flags & DUP_FIXED) {
860		if (new >= fdp->fd_nfiles) {
861			/*
862			 * The resource limits are here instead of e.g.
863			 * fdalloc(), because the file descriptor table may be
864			 * shared between processes, so we can't really use
865			 * racct_add()/racct_sub().  Instead of counting the
866			 * number of actually allocated descriptors, just put
867			 * the limit on the size of the file descriptor table.
868			 */
869#ifdef RACCT
870			PROC_LOCK(p);
871			error = racct_set(p, RACCT_NOFILE, new + 1);
872			PROC_UNLOCK(p);
873			if (error != 0) {
874				FILEDESC_XUNLOCK(fdp);
875				fdrop(fp, td);
876				return (EMFILE);
877			}
878#endif
879			fdgrowtable_exp(fdp, new + 1);
880			oldfde = &fdp->fd_ofiles[old];
881		}
882		newfde = &fdp->fd_ofiles[new];
883		if (newfde->fde_file == NULL)
884			fdused(fdp, new);
885	} else {
886		if ((error = fdalloc(td, new, &new)) != 0) {
887			FILEDESC_XUNLOCK(fdp);
888			fdrop(fp, td);
889			return (error);
890		}
891		newfde = &fdp->fd_ofiles[new];
892	}
893
894	KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
895	KASSERT(old != new, ("new fd is same as old"));
896
897	delfp = newfde->fde_file;
898
899	/*
900	 * Duplicate the source descriptor.
901	 */
902	filecaps_free(&newfde->fde_caps);
903	*newfde = *oldfde;
904	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
905	if ((flags & DUP_CLOEXEC) != 0)
906		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
907	else
908		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
909	*retval = new;
910
911	if (delfp != NULL) {
912		(void) closefp(fdp, new, delfp, td, 1);
913		/* closefp() drops the FILEDESC lock for us. */
914	} else {
915		FILEDESC_XUNLOCK(fdp);
916	}
917
918	return (0);
919}
920
921/*
922 * If sigio is on the list associated with a process or process group,
923 * disable signalling from the device, remove sigio from the list and
924 * free sigio.
925 */
926void
927funsetown(struct sigio **sigiop)
928{
929	struct sigio *sigio;
930
931	SIGIO_LOCK();
932	sigio = *sigiop;
933	if (sigio == NULL) {
934		SIGIO_UNLOCK();
935		return;
936	}
937	*(sigio->sio_myref) = NULL;
938	if ((sigio)->sio_pgid < 0) {
939		struct pgrp *pg = (sigio)->sio_pgrp;
940		PGRP_LOCK(pg);
941		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
942			     sigio, sio_pgsigio);
943		PGRP_UNLOCK(pg);
944	} else {
945		struct proc *p = (sigio)->sio_proc;
946		PROC_LOCK(p);
947		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
948			     sigio, sio_pgsigio);
949		PROC_UNLOCK(p);
950	}
951	SIGIO_UNLOCK();
952	crfree(sigio->sio_ucred);
953	free(sigio, M_SIGIO);
954}
955
956/*
957 * Free a list of sigio structures.
958 * We only need to lock the SIGIO_LOCK because we have made ourselves
959 * inaccessible to callers of fsetown and therefore do not need to lock
960 * the proc or pgrp struct for the list manipulation.
961 */
962void
963funsetownlst(struct sigiolst *sigiolst)
964{
965	struct proc *p;
966	struct pgrp *pg;
967	struct sigio *sigio;
968
969	sigio = SLIST_FIRST(sigiolst);
970	if (sigio == NULL)
971		return;
972	p = NULL;
973	pg = NULL;
974
975	/*
976	 * Every entry of the list should belong
977	 * to a single proc or pgrp.
978	 */
979	if (sigio->sio_pgid < 0) {
980		pg = sigio->sio_pgrp;
981		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
982	} else /* if (sigio->sio_pgid > 0) */ {
983		p = sigio->sio_proc;
984		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
985	}
986
987	SIGIO_LOCK();
988	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
989		*(sigio->sio_myref) = NULL;
990		if (pg != NULL) {
991			KASSERT(sigio->sio_pgid < 0,
992			    ("Proc sigio in pgrp sigio list"));
993			KASSERT(sigio->sio_pgrp == pg,
994			    ("Bogus pgrp in sigio list"));
995			PGRP_LOCK(pg);
996			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
997			    sio_pgsigio);
998			PGRP_UNLOCK(pg);
999		} else /* if (p != NULL) */ {
1000			KASSERT(sigio->sio_pgid > 0,
1001			    ("Pgrp sigio in proc sigio list"));
1002			KASSERT(sigio->sio_proc == p,
1003			    ("Bogus proc in sigio list"));
1004			PROC_LOCK(p);
1005			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
1006			    sio_pgsigio);
1007			PROC_UNLOCK(p);
1008		}
1009		SIGIO_UNLOCK();
1010		crfree(sigio->sio_ucred);
1011		free(sigio, M_SIGIO);
1012		SIGIO_LOCK();
1013	}
1014	SIGIO_UNLOCK();
1015}
1016
1017/*
1018 * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
1019 *
1020 * After permission checking, add a sigio structure to the sigio list for
1021 * the process or process group.
1022 */
1023int
1024fsetown(pid_t pgid, struct sigio **sigiop)
1025{
1026	struct proc *proc;
1027	struct pgrp *pgrp;
1028	struct sigio *sigio;
1029	int ret;
1030
1031	if (pgid == 0) {
1032		funsetown(sigiop);
1033		return (0);
1034	}
1035
1036	ret = 0;
1037
1038	/* Allocate and fill in the new sigio out of locks. */
1039	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
1040	sigio->sio_pgid = pgid;
1041	sigio->sio_ucred = crhold(curthread->td_ucred);
1042	sigio->sio_myref = sigiop;
1043
1044	sx_slock(&proctree_lock);
1045	if (pgid > 0) {
1046		proc = pfind(pgid);
1047		if (proc == NULL) {
1048			ret = ESRCH;
1049			goto fail;
1050		}
1051
1052		/*
1053		 * Policy - Don't allow a process to FSETOWN a process
1054		 * in another session.
1055		 *
1056		 * Remove this test to allow maximum flexibility or
1057		 * restrict FSETOWN to the current process or process
1058		 * group for maximum safety.
1059		 */
1060		PROC_UNLOCK(proc);
1061		if (proc->p_session != curthread->td_proc->p_session) {
1062			ret = EPERM;
1063			goto fail;
1064		}
1065
1066		pgrp = NULL;
1067	} else /* if (pgid < 0) */ {
1068		pgrp = pgfind(-pgid);
1069		if (pgrp == NULL) {
1070			ret = ESRCH;
1071			goto fail;
1072		}
1073		PGRP_UNLOCK(pgrp);
1074
1075		/*
1076		 * Policy - Don't allow a process to FSETOWN a process
1077		 * in another session.
1078		 *
1079		 * Remove this test to allow maximum flexibility or
1080		 * restrict FSETOWN to the current process or process
1081		 * group for maximum safety.
1082		 */
1083		if (pgrp->pg_session != curthread->td_proc->p_session) {
1084			ret = EPERM;
1085			goto fail;
1086		}
1087
1088		proc = NULL;
1089	}
1090	funsetown(sigiop);
1091	if (pgid > 0) {
1092		PROC_LOCK(proc);
1093		/*
1094		 * Since funsetownlst() is called without the proctree
1095		 * locked, we need to check for P_WEXIT.
1096		 * XXX: is ESRCH correct?
1097		 */
1098		if ((proc->p_flag & P_WEXIT) != 0) {
1099			PROC_UNLOCK(proc);
1100			ret = ESRCH;
1101			goto fail;
1102		}
1103		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
1104		sigio->sio_proc = proc;
1105		PROC_UNLOCK(proc);
1106	} else {
1107		PGRP_LOCK(pgrp);
1108		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
1109		sigio->sio_pgrp = pgrp;
1110		PGRP_UNLOCK(pgrp);
1111	}
1112	sx_sunlock(&proctree_lock);
1113	SIGIO_LOCK();
1114	*sigiop = sigio;
1115	SIGIO_UNLOCK();
1116	return (0);
1117
1118fail:
1119	sx_sunlock(&proctree_lock);
1120	crfree(sigio->sio_ucred);
1121	free(sigio, M_SIGIO);
1122	return (ret);
1123}
1124
1125/*
1126 * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
1127 */
1128pid_t
1129fgetown(sigiop)
1130	struct sigio **sigiop;
1131{
1132	pid_t pgid;
1133
1134	SIGIO_LOCK();
1135	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
1136	SIGIO_UNLOCK();
1137	return (pgid);
1138}
1139
1140/*
1141 * Function drops the filedesc lock on return.
1142 */
1143static int
1144closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
1145    int holdleaders)
1146{
1147	int error;
1148
1149	FILEDESC_XLOCK_ASSERT(fdp);
1150
1151	if (holdleaders) {
1152		if (td->td_proc->p_fdtol != NULL) {
1153			/*
1154			 * Ask fdfree() to sleep to ensure that all relevant
1155			 * process leaders can be traversed in closef().
1156			 */
1157			fdp->fd_holdleaderscount++;
1158		} else {
1159			holdleaders = 0;
1160		}
1161	}
1162
1163	/*
1164	 * We now hold the fp reference that used to be owned by the
1165	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
1166	 * knote_fdclose to prevent a race of the fd getting opened, a knote
1167	 * added, and deleteing a knote for the new fd.
1168	 */
1169	knote_fdclose(td, fd);
1170
1171	/*
1172	 * We need to notify mqueue if the object is of type mqueue.
1173	 */
1174	if (fp->f_type == DTYPE_MQUEUE)
1175		mq_fdclose(td, fd, fp);
1176	FILEDESC_XUNLOCK(fdp);
1177
1178	error = closef(fp, td);
1179	if (holdleaders) {
1180		FILEDESC_XLOCK(fdp);
1181		fdp->fd_holdleaderscount--;
1182		if (fdp->fd_holdleaderscount == 0 &&
1183		    fdp->fd_holdleaderswakeup != 0) {
1184			fdp->fd_holdleaderswakeup = 0;
1185			wakeup(&fdp->fd_holdleaderscount);
1186		}
1187		FILEDESC_XUNLOCK(fdp);
1188	}
1189	return (error);
1190}
1191
/*
 * close(2) system call entry point: close the file descriptor uap->fd.
 * All of the work is done by kern_close(), whose result is returned.
 */
#ifndef _SYS_SYSPROTO_H_
struct close_args {
	int     fd;
};
#endif
/* ARGSUSED */
int
sys_close(struct thread *td, struct close_args *uap)
{

	return (kern_close(td, uap->fd));
}
1209
1210int
1211kern_close(td, fd)
1212	struct thread *td;
1213	int fd;
1214{
1215	struct filedesc *fdp;
1216	struct file *fp;
1217
1218	fdp = td->td_proc->p_fd;
1219
1220	AUDIT_SYSCLOSE(td, fd);
1221
1222	FILEDESC_XLOCK(fdp);
1223	if ((fp = fget_locked(fdp, fd)) == NULL) {
1224		FILEDESC_XUNLOCK(fdp);
1225		return (EBADF);
1226	}
1227	fdfree(fdp, fd);
1228
1229	/* closefp() drops the FILEDESC lock for us. */
1230	return (closefp(fdp, fd, fp, td, 1));
1231}
1232
1233/*
1234 * Close open file descriptors.
1235 */
1236#ifndef _SYS_SYSPROTO_H_
1237struct closefrom_args {
1238	int	lowfd;
1239};
1240#endif
1241/* ARGSUSED */
1242int
1243sys_closefrom(struct thread *td, struct closefrom_args *uap)
1244{
1245	struct filedesc *fdp;
1246	int fd;
1247
1248	fdp = td->td_proc->p_fd;
1249	AUDIT_ARG_FD(uap->lowfd);
1250
1251	/*
1252	 * Treat negative starting file descriptor values identical to
1253	 * closefrom(0) which closes all files.
1254	 */
1255	if (uap->lowfd < 0)
1256		uap->lowfd = 0;
1257	FILEDESC_SLOCK(fdp);
1258	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
1259		if (fdp->fd_ofiles[fd].fde_file != NULL) {
1260			FILEDESC_SUNLOCK(fdp);
1261			(void)kern_close(td, fd);
1262			FILEDESC_SLOCK(fdp);
1263		}
1264	}
1265	FILEDESC_SUNLOCK(fdp);
1266	return (0);
1267}
1268
#if defined(COMPAT_43)
/*
 * Old-style fstat: obtain the status of descriptor uap->fd with
 * kern_fstat(), convert it to a struct ostat with cvtstat() and copy the
 * result out to uap->sb.
 */
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
	int	fd;
	struct	ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
	struct stat sb;
	struct ostat osb;
	int error;

	error = kern_fstat(td, uap->fd, &sb);
	if (error != 0)
		return (error);
	cvtstat(&sb, &osb);
	return (copyout(&osb, uap->sb, sizeof(osb)));
}
#endif /* COMPAT_43 */
1295
1296/*
1297 * Return status information about a file descriptor.
1298 */
1299#ifndef _SYS_SYSPROTO_H_
1300struct fstat_args {
1301	int	fd;
1302	struct	stat *sb;
1303};
1304#endif
1305/* ARGSUSED */
1306int
1307sys_fstat(struct thread *td, struct fstat_args *uap)
1308{
1309	struct stat ub;
1310	int error;
1311
1312	error = kern_fstat(td, uap->fd, &ub);
1313	if (error == 0)
1314		error = copyout(&ub, uap->sb, sizeof(ub));
1315	return (error);
1316}
1317
1318int
1319kern_fstat(struct thread *td, int fd, struct stat *sbp)
1320{
1321	struct file *fp;
1322	cap_rights_t rights;
1323	int error;
1324
1325	AUDIT_ARG_FD(fd);
1326
1327	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
1328	if (error != 0)
1329		return (error);
1330
1331	AUDIT_ARG_FILE(td->td_proc, fp);
1332
1333	error = fo_stat(fp, sbp, td->td_ucred, td);
1334	fdrop(fp, td);
1335#ifdef KTRACE
1336	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
1337		ktrstat(sbp);
1338#endif
1339	return (error);
1340}
1341
1342/*
1343 * Return status information about a file descriptor.
1344 */
1345#ifndef _SYS_SYSPROTO_H_
1346struct nfstat_args {
1347	int	fd;
1348	struct	nstat *sb;
1349};
1350#endif
1351/* ARGSUSED */
1352int
1353sys_nfstat(struct thread *td, struct nfstat_args *uap)
1354{
1355	struct nstat nub;
1356	struct stat ub;
1357	int error;
1358
1359	error = kern_fstat(td, uap->fd, &ub);
1360	if (error == 0) {
1361		cvtnstat(&ub, &nub);
1362		error = copyout(&nub, uap->sb, sizeof(nub));
1363	}
1364	return (error);
1365}
1366
1367/*
1368 * Return pathconf information about a file descriptor.
1369 */
1370#ifndef _SYS_SYSPROTO_H_
1371struct fpathconf_args {
1372	int	fd;
1373	int	name;
1374};
1375#endif
1376/* ARGSUSED */
1377int
1378sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
1379{
1380	struct file *fp;
1381	struct vnode *vp;
1382	cap_rights_t rights;
1383	int error;
1384
1385	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
1386	if (error != 0)
1387		return (error);
1388
1389	/* If asynchronous I/O is available, it works for all descriptors. */
1390	if (uap->name == _PC_ASYNC_IO) {
1391		td->td_retval[0] = async_io_version;
1392		goto out;
1393	}
1394	vp = fp->f_vnode;
1395	if (vp != NULL) {
1396		vn_lock(vp, LK_SHARED | LK_RETRY);
1397		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
1398		VOP_UNLOCK(vp, 0);
1399	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
1400		if (uap->name != _PC_PIPE_BUF) {
1401			error = EINVAL;
1402		} else {
1403			td->td_retval[0] = PIPE_BUF;
1404			error = 0;
1405		}
1406	} else {
1407		error = EOPNOTSUPP;
1408	}
1409out:
1410	fdrop(fp, td);
1411	return (error);
1412}
1413
1414/*
1415 * Initialize filecaps structure.
1416 */
1417void
1418filecaps_init(struct filecaps *fcaps)
1419{
1420
1421	bzero(fcaps, sizeof(*fcaps));
1422	fcaps->fc_nioctls = -1;
1423}
1424
1425/*
1426 * Copy filecaps structure allocating memory for ioctls array if needed.
1427 */
1428void
1429filecaps_copy(const struct filecaps *src, struct filecaps *dst)
1430{
1431	size_t size;
1432
1433	*dst = *src;
1434	if (src->fc_ioctls != NULL) {
1435		KASSERT(src->fc_nioctls > 0,
1436		    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
1437
1438		size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
1439		dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
1440		bcopy(src->fc_ioctls, dst->fc_ioctls, size);
1441	}
1442}
1443
1444/*
1445 * Move filecaps structure to the new place and clear the old place.
1446 */
1447void
1448filecaps_move(struct filecaps *src, struct filecaps *dst)
1449{
1450
1451	*dst = *src;
1452	bzero(src, sizeof(*src));
1453}
1454
1455/*
1456 * Fill the given filecaps structure with full rights.
1457 */
1458static void
1459filecaps_fill(struct filecaps *fcaps)
1460{
1461
1462	CAP_ALL(&fcaps->fc_rights);
1463	fcaps->fc_ioctls = NULL;
1464	fcaps->fc_nioctls = -1;
1465	fcaps->fc_fcntls = CAP_FCNTL_ALL;
1466}
1467
1468/*
1469 * Free memory allocated within filecaps structure.
1470 */
1471void
1472filecaps_free(struct filecaps *fcaps)
1473{
1474
1475	free(fcaps->fc_ioctls, M_FILECAPS);
1476	bzero(fcaps, sizeof(*fcaps));
1477}
1478
1479/*
1480 * Validate the given filecaps structure.
1481 */
1482static void
1483filecaps_validate(const struct filecaps *fcaps, const char *func)
1484{
1485
1486	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
1487	    ("%s: invalid rights", func));
1488	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
1489	    ("%s: invalid fcntls", func));
1490	KASSERT(fcaps->fc_fcntls == 0 ||
1491	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
1492	    ("%s: fcntls without CAP_FCNTL", func));
1493	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
1494	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
1495	    ("%s: invalid ioctls", func));
1496	KASSERT(fcaps->fc_nioctls == 0 ||
1497	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
1498	    ("%s: ioctls without CAP_IOCTL", func));
1499}
1500
1501static void
1502fdgrowtable_exp(struct filedesc *fdp, int nfd)
1503{
1504	int nfd1;
1505
1506	FILEDESC_XLOCK_ASSERT(fdp);
1507
1508	nfd1 = fdp->fd_nfiles * 2;
1509	if (nfd1 < nfd)
1510		nfd1 = nfd;
1511	fdgrowtable(fdp, nfd1);
1512}
1513
1514/*
1515 * Grow the file table to accomodate (at least) nfd descriptors.
1516 */
1517static void
1518fdgrowtable(struct filedesc *fdp, int nfd)
1519{
1520	struct filedesc0 *fdp0;
1521	struct freetable *ft;
1522	struct filedescent *ntable;
1523	struct filedescent *otable;
1524	int nnfiles, onfiles;
1525	NDSLOTTYPE *nmap, *omap;
1526
1527	FILEDESC_XLOCK_ASSERT(fdp);
1528
1529	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
1530
1531	/* save old values */
1532	onfiles = fdp->fd_nfiles;
1533	otable = fdp->fd_ofiles;
1534	omap = fdp->fd_map;
1535
1536	/* compute the size of the new table */
1537	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
1538	if (nnfiles <= onfiles)
1539		/* the table is already large enough */
1540		return;
1541
1542	/*
1543	 * Allocate a new table.  We need enough space for the
1544	 * file entries themselves and the struct freetable we will use
1545	 * when we decommission the table and place it on the freelist.
1546	 * We place the struct freetable in the middle so we don't have
1547	 * to worry about padding.
1548	 */
1549	ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable),
1550	    M_FILEDESC, M_ZERO | M_WAITOK);
1551	/* copy the old data over and point at the new tables */
1552	memcpy(ntable, otable, onfiles * sizeof(*otable));
1553	fdp->fd_ofiles = ntable;
1554
1555	/*
1556	 * Allocate a new map only if the old is not large enough.  It will
1557	 * grow at a slower rate than the table as it can map more
1558	 * entries than the table can hold.
1559	 */
1560	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
1561		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
1562		    M_ZERO | M_WAITOK);
1563		/* copy over the old data and update the pointer */
1564		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
1565		fdp->fd_map = nmap;
1566	}
1567
1568	/*
1569	 * In order to have a valid pattern for fget_unlocked()
1570	 * fdp->fd_nfiles must be the last member to be updated, otherwise
1571	 * fget_unlocked() consumers may reference a new, higher value for
1572	 * fdp->fd_nfiles before to access the fdp->fd_ofiles array,
1573	 * resulting in OOB accesses.
1574	 */
1575	atomic_store_rel_int(&fdp->fd_nfiles, nnfiles);
1576
1577	/*
1578	 * Do not free the old file table, as some threads may still
1579	 * reference entries within it.  Instead, place it on a freelist
1580	 * which will be processed when the struct filedesc is released.
1581	 *
1582	 * Note that if onfiles == NDFILE, we're dealing with the original
1583	 * static allocation contained within (struct filedesc0 *)fdp,
1584	 * which must not be freed.
1585	 */
1586	if (onfiles > NDFILE) {
1587		ft = (struct freetable *)&otable[onfiles];
1588		fdp0 = (struct filedesc0 *)fdp;
1589		ft->ft_table = otable;
1590		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
1591	}
1592	/*
1593	 * The map does not have the same possibility of threads still
1594	 * holding references to it.  So always free it as long as it
1595	 * does not reference the original static allocation.
1596	 */
1597	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
1598		free(omap, M_FILEDESC);
1599}
1600
1601/*
1602 * Allocate a file descriptor for the process.
1603 */
1604int
1605fdalloc(struct thread *td, int minfd, int *result)
1606{
1607	struct proc *p = td->td_proc;
1608	struct filedesc *fdp = p->p_fd;
1609	int fd = -1, maxfd, allocfd;
1610#ifdef RACCT
1611	int error;
1612#endif
1613
1614	FILEDESC_XLOCK_ASSERT(fdp);
1615
1616	if (fdp->fd_freefile > minfd)
1617		minfd = fdp->fd_freefile;
1618
1619	maxfd = getmaxfd(p);
1620
1621	/*
1622	 * Search the bitmap for a free descriptor starting at minfd.
1623	 * If none is found, grow the file table.
1624	 */
1625	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
1626	if (fd >= maxfd)
1627		return (EMFILE);
1628	if (fd >= fdp->fd_nfiles) {
1629		allocfd = min(fd * 2, maxfd);
1630#ifdef RACCT
1631		PROC_LOCK(p);
1632		error = racct_set(p, RACCT_NOFILE, allocfd);
1633		PROC_UNLOCK(p);
1634		if (error != 0)
1635			return (EMFILE);
1636#endif
1637		/*
1638		 * fd is already equal to first free descriptor >= minfd, so
1639		 * we only need to grow the table and we are done.
1640		 */
1641		fdgrowtable_exp(fdp, allocfd);
1642	}
1643
1644	/*
1645	 * Perform some sanity checks, then mark the file descriptor as
1646	 * used and return it to the caller.
1647	 */
1648	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
1649	    ("invalid descriptor %d", fd));
1650	KASSERT(!fdisused(fdp, fd),
1651	    ("fd_first_free() returned non-free descriptor"));
1652	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
1653	    ("file descriptor isn't free"));
1654	KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
1655	fdused(fdp, fd);
1656	*result = fd;
1657	return (0);
1658}
1659
1660/*
1661 * Allocate n file descriptors for the process.
1662 */
1663int
1664fdallocn(struct thread *td, int minfd, int *fds, int n)
1665{
1666	struct proc *p = td->td_proc;
1667	struct filedesc *fdp = p->p_fd;
1668	int i;
1669
1670	FILEDESC_XLOCK_ASSERT(fdp);
1671
1672	if (!fdavail(td, n))
1673		return (EMFILE);
1674
1675	for (i = 0; i < n; i++)
1676		if (fdalloc(td, 0, &fds[i]) != 0)
1677			break;
1678
1679	if (i < n) {
1680		for (i--; i >= 0; i--)
1681			fdunused(fdp, fds[i]);
1682		return (EMFILE);
1683	}
1684
1685	return (0);
1686}
1687
1688/*
1689 * Check to see whether n user file descriptors are available to the process
1690 * p.
1691 */
1692int
1693fdavail(struct thread *td, int n)
1694{
1695	struct proc *p = td->td_proc;
1696	struct filedesc *fdp = td->td_proc->p_fd;
1697	int i, lim, last;
1698
1699	FILEDESC_LOCK_ASSERT(fdp);
1700
1701	/*
1702	 * XXX: This is only called from uipc_usrreq.c:unp_externalize();
1703	 *      call racct_add() from there instead of dealing with containers
1704	 *      here.
1705	 */
1706	lim = getmaxfd(p);
1707	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
1708		return (1);
1709	last = min(fdp->fd_nfiles, lim);
1710	for (i = fdp->fd_freefile; i < last; i++) {
1711		if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0)
1712			return (1);
1713	}
1714	return (0);
1715}
1716
1717/*
1718 * Create a new open file structure and allocate a file decriptor for the
1719 * process that refers to it.  We add one reference to the file for the
1720 * descriptor table and one reference for resultfp. This is to prevent us
1721 * being preempted and the entry in the descriptor table closed after we
1722 * release the FILEDESC lock.
1723 */
1724int
1725falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
1726{
1727	struct file *fp;
1728	int error, fd;
1729
1730	error = falloc_noinstall(td, &fp);
1731	if (error)
1732		return (error);		/* no reference held on error */
1733
1734	error = finstall(td, fp, &fd, flags, NULL);
1735	if (error) {
1736		fdrop(fp, td);		/* one reference (fp only) */
1737		return (error);
1738	}
1739
1740	if (resultfp != NULL)
1741		*resultfp = fp;		/* copy out result */
1742	else
1743		fdrop(fp, td);		/* release local reference */
1744
1745	if (resultfd != NULL)
1746		*resultfd = fd;
1747
1748	return (0);
1749}
1750
1751/*
1752 * Create a new open file structure without allocating a file descriptor.
1753 */
1754int
1755falloc_noinstall(struct thread *td, struct file **resultfp)
1756{
1757	struct file *fp;
1758	int maxuserfiles = maxfiles - (maxfiles / 20);
1759	static struct timeval lastfail;
1760	static int curfail;
1761
1762	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
1763
1764	if ((openfiles >= maxuserfiles &&
1765	    priv_check(td, PRIV_MAXFILES) != 0) ||
1766	    openfiles >= maxfiles) {
1767		if (ppsratecheck(&lastfail, &curfail, 1)) {
1768			printf("kern.maxfiles limit exceeded by uid %i, "
1769			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
1770		}
1771		return (ENFILE);
1772	}
1773	atomic_add_int(&openfiles, 1);
1774	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
1775	refcount_init(&fp->f_count, 1);
1776	fp->f_cred = crhold(td->td_ucred);
1777	fp->f_ops = &badfileops;
1778	fp->f_data = NULL;
1779	fp->f_vnode = NULL;
1780	*resultfp = fp;
1781	return (0);
1782}
1783
1784/*
1785 * Install a file in a file descriptor table.
1786 */
1787int
1788finstall(struct thread *td, struct file *fp, int *fd, int flags,
1789    struct filecaps *fcaps)
1790{
1791	struct filedesc *fdp = td->td_proc->p_fd;
1792	struct filedescent *fde;
1793	int error;
1794
1795	KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
1796	KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
1797	if (fcaps != NULL)
1798		filecaps_validate(fcaps, __func__);
1799
1800	FILEDESC_XLOCK(fdp);
1801	if ((error = fdalloc(td, 0, fd))) {
1802		FILEDESC_XUNLOCK(fdp);
1803		return (error);
1804	}
1805	fhold(fp);
1806	fde = &fdp->fd_ofiles[*fd];
1807	fde->fde_file = fp;
1808	if ((flags & O_CLOEXEC) != 0)
1809		fde->fde_flags |= UF_EXCLOSE;
1810	if (fcaps != NULL)
1811		filecaps_move(fcaps, &fde->fde_caps);
1812	else
1813		filecaps_fill(&fde->fde_caps);
1814	FILEDESC_XUNLOCK(fdp);
1815	return (0);
1816}
1817
1818/*
1819 * Build a new filedesc structure from another.
1820 * Copy the current, root, and jail root vnode references.
1821 */
1822struct filedesc *
1823fdinit(struct filedesc *fdp)
1824{
1825	struct filedesc0 *newfdp;
1826
1827	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
1828	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
1829	if (fdp != NULL) {
1830		FILEDESC_SLOCK(fdp);
1831		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
1832		if (newfdp->fd_fd.fd_cdir)
1833			VREF(newfdp->fd_fd.fd_cdir);
1834		newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
1835		if (newfdp->fd_fd.fd_rdir)
1836			VREF(newfdp->fd_fd.fd_rdir);
1837		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
1838		if (newfdp->fd_fd.fd_jdir)
1839			VREF(newfdp->fd_fd.fd_jdir);
1840		FILEDESC_SUNLOCK(fdp);
1841	}
1842
1843	/* Create the file descriptor table. */
1844	newfdp->fd_fd.fd_refcnt = 1;
1845	newfdp->fd_fd.fd_holdcnt = 1;
1846	newfdp->fd_fd.fd_cmask = CMASK;
1847	newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
1848	newfdp->fd_fd.fd_nfiles = NDFILE;
1849	newfdp->fd_fd.fd_map = newfdp->fd_dmap;
1850	newfdp->fd_fd.fd_lastfile = -1;
1851	return (&newfdp->fd_fd);
1852}
1853
1854static struct filedesc *
1855fdhold(struct proc *p)
1856{
1857	struct filedesc *fdp;
1858
1859	mtx_lock(&fdesc_mtx);
1860	fdp = p->p_fd;
1861	if (fdp != NULL)
1862		fdp->fd_holdcnt++;
1863	mtx_unlock(&fdesc_mtx);
1864	return (fdp);
1865}
1866
1867static void
1868fddrop(struct filedesc *fdp)
1869{
1870	struct filedesc0 *fdp0;
1871	struct freetable *ft;
1872	int i;
1873
1874	mtx_lock(&fdesc_mtx);
1875	i = --fdp->fd_holdcnt;
1876	mtx_unlock(&fdesc_mtx);
1877	if (i > 0)
1878		return;
1879
1880	FILEDESC_LOCK_DESTROY(fdp);
1881	fdp0 = (struct filedesc0 *)fdp;
1882	while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
1883		SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
1884		free(ft->ft_table, M_FILEDESC);
1885	}
1886	free(fdp, M_FILEDESC);
1887}
1888
1889/*
1890 * Share a filedesc structure.
1891 */
1892struct filedesc *
1893fdshare(struct filedesc *fdp)
1894{
1895
1896	FILEDESC_XLOCK(fdp);
1897	fdp->fd_refcnt++;
1898	FILEDESC_XUNLOCK(fdp);
1899	return (fdp);
1900}
1901
1902/*
1903 * Unshare a filedesc structure, if necessary by making a copy
1904 */
1905void
1906fdunshare(struct thread *td)
1907{
1908	struct filedesc *tmp;
1909	struct proc *p = td->td_proc;
1910
1911	if (p->p_fd->fd_refcnt == 1)
1912		return;
1913
1914	tmp = fdcopy(p->p_fd);
1915	fdescfree(td);
1916	p->p_fd = tmp;
1917}
1918
1919/*
1920 * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
1921 * this is to ease callers, not catch errors.
1922 */
1923struct filedesc *
1924fdcopy(struct filedesc *fdp)
1925{
1926	struct filedesc *newfdp;
1927	struct filedescent *nfde, *ofde;
1928	int i;
1929
1930	/* Certain daemons might not have file descriptors. */
1931	if (fdp == NULL)
1932		return (NULL);
1933
1934	newfdp = fdinit(fdp);
1935	FILEDESC_SLOCK(fdp);
1936	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
1937		FILEDESC_SUNLOCK(fdp);
1938		FILEDESC_XLOCK(newfdp);
1939		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
1940		FILEDESC_XUNLOCK(newfdp);
1941		FILEDESC_SLOCK(fdp);
1942	}
1943	/* copy all passable descriptors (i.e. not kqueue) */
1944	newfdp->fd_freefile = -1;
1945	for (i = 0; i <= fdp->fd_lastfile; ++i) {
1946		ofde = &fdp->fd_ofiles[i];
1947		if (fdisused(fdp, i) &&
1948		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
1949		    ofde->fde_file->f_ops != &badfileops) {
1950			nfde = &newfdp->fd_ofiles[i];
1951			*nfde = *ofde;
1952			filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
1953			fhold(nfde->fde_file);
1954			newfdp->fd_lastfile = i;
1955		} else {
1956			if (newfdp->fd_freefile == -1)
1957				newfdp->fd_freefile = i;
1958		}
1959	}
1960	newfdp->fd_cmask = fdp->fd_cmask;
1961	FILEDESC_SUNLOCK(fdp);
1962	FILEDESC_XLOCK(newfdp);
1963	for (i = 0; i <= newfdp->fd_lastfile; ++i) {
1964		if (newfdp->fd_ofiles[i].fde_file != NULL)
1965			fdused(newfdp, i);
1966	}
1967	if (newfdp->fd_freefile == -1)
1968		newfdp->fd_freefile = i;
1969	FILEDESC_XUNLOCK(newfdp);
1970	return (newfdp);
1971}
1972
1973/*
1974 * Release a filedesc structure.
1975 */
1976void
1977fdescfree(struct thread *td)
1978{
1979	struct filedesc *fdp;
1980	int i;
1981	struct filedesc_to_leader *fdtol;
1982	struct file *fp;
1983	struct vnode *cdir, *jdir, *rdir, *vp;
1984	struct flock lf;
1985
1986	/* Certain daemons might not have file descriptors. */
1987	fdp = td->td_proc->p_fd;
1988	if (fdp == NULL)
1989		return;
1990
1991#ifdef RACCT
1992	PROC_LOCK(td->td_proc);
1993	racct_set(td->td_proc, RACCT_NOFILE, 0);
1994	PROC_UNLOCK(td->td_proc);
1995#endif
1996
1997	/* Check for special need to clear POSIX style locks */
1998	fdtol = td->td_proc->p_fdtol;
1999	if (fdtol != NULL) {
2000		FILEDESC_XLOCK(fdp);
2001		KASSERT(fdtol->fdl_refcount > 0,
2002		    ("filedesc_to_refcount botch: fdl_refcount=%d",
2003		    fdtol->fdl_refcount));
2004		if (fdtol->fdl_refcount == 1 &&
2005		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2006			for (i = 0; i <= fdp->fd_lastfile; i++) {
2007				fp = fdp->fd_ofiles[i].fde_file;
2008				if (fp == NULL || fp->f_type != DTYPE_VNODE)
2009					continue;
2010				fhold(fp);
2011				FILEDESC_XUNLOCK(fdp);
2012				lf.l_whence = SEEK_SET;
2013				lf.l_start = 0;
2014				lf.l_len = 0;
2015				lf.l_type = F_UNLCK;
2016				vp = fp->f_vnode;
2017				(void) VOP_ADVLOCK(vp,
2018				    (caddr_t)td->td_proc->p_leader, F_UNLCK,
2019				    &lf, F_POSIX);
2020				FILEDESC_XLOCK(fdp);
2021				fdrop(fp, td);
2022			}
2023		}
2024	retry:
2025		if (fdtol->fdl_refcount == 1) {
2026			if (fdp->fd_holdleaderscount > 0 &&
2027			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2028				/*
2029				 * close() or do_dup() has cleared a reference
2030				 * in a shared file descriptor table.
2031				 */
2032				fdp->fd_holdleaderswakeup = 1;
2033				sx_sleep(&fdp->fd_holdleaderscount,
2034				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
2035				goto retry;
2036			}
2037			if (fdtol->fdl_holdcount > 0) {
2038				/*
2039				 * Ensure that fdtol->fdl_leader remains
2040				 * valid in closef().
2041				 */
2042				fdtol->fdl_wakeup = 1;
2043				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
2044				    "fdlhold", 0);
2045				goto retry;
2046			}
2047		}
2048		fdtol->fdl_refcount--;
2049		if (fdtol->fdl_refcount == 0 &&
2050		    fdtol->fdl_holdcount == 0) {
2051			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
2052			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
2053		} else
2054			fdtol = NULL;
2055		td->td_proc->p_fdtol = NULL;
2056		FILEDESC_XUNLOCK(fdp);
2057		if (fdtol != NULL)
2058			free(fdtol, M_FILEDESC_TO_LEADER);
2059	}
2060
2061	mtx_lock(&fdesc_mtx);
2062	td->td_proc->p_fd = NULL;
2063	mtx_unlock(&fdesc_mtx);
2064
2065	FILEDESC_XLOCK(fdp);
2066	i = --fdp->fd_refcnt;
2067	if (i > 0) {
2068		FILEDESC_XUNLOCK(fdp);
2069		return;
2070	}
2071
2072	cdir = fdp->fd_cdir;
2073	fdp->fd_cdir = NULL;
2074	rdir = fdp->fd_rdir;
2075	fdp->fd_rdir = NULL;
2076	jdir = fdp->fd_jdir;
2077	fdp->fd_jdir = NULL;
2078	FILEDESC_XUNLOCK(fdp);
2079
2080	for (i = 0; i <= fdp->fd_lastfile; i++) {
2081		fp = fdp->fd_ofiles[i].fde_file;
2082		if (fp != NULL) {
2083			fdfree_last(fdp, i);
2084			(void) closef(fp, td);
2085		}
2086	}
2087
2088	if (fdp->fd_nfiles > NDFILE)
2089		free(fdp->fd_ofiles, M_FILEDESC);
2090	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
2091		free(fdp->fd_map, M_FILEDESC);
2092
2093	if (cdir != NULL)
2094		vrele(cdir);
2095	if (rdir != NULL)
2096		vrele(rdir);
2097	if (jdir != NULL)
2098		vrele(jdir);
2099
2100	fddrop(fdp);
2101}
2102
2103/*
2104 * For setugid programs, we don't want to people to use that setugidness
2105 * to generate error messages which write to a file which otherwise would
2106 * otherwise be off-limits to the process.  We check for filesystems where
2107 * the vnode can change out from under us after execve (like [lin]procfs).
2108 *
2109 * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
2110 * sufficient.  We also don't check for setugidness since we know we are.
2111 */
2112static int
2113is_unsafe(struct file *fp)
2114{
2115	if (fp->f_type == DTYPE_VNODE) {
2116		struct vnode *vp = fp->f_vnode;
2117
2118		if ((vp->v_vflag & VV_PROCDEP) != 0)
2119			return (1);
2120	}
2121	return (0);
2122}
2123
2124/*
2125 * Make this setguid thing safe, if at all possible.
2126 */
2127void
2128setugidsafety(struct thread *td)
2129{
2130	struct filedesc *fdp;
2131	struct file *fp;
2132	int i;
2133
2134	fdp = td->td_proc->p_fd;
2135	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2136	FILEDESC_XLOCK(fdp);
2137	for (i = 0; i <= fdp->fd_lastfile; i++) {
2138		if (i > 2)
2139			break;
2140		fp = fdp->fd_ofiles[i].fde_file;
2141		if (fp != NULL && is_unsafe(fp)) {
2142			knote_fdclose(td, i);
2143			/*
2144			 * NULL-out descriptor prior to close to avoid
2145			 * a race while close blocks.
2146			 */
2147			fdfree(fdp, i);
2148			FILEDESC_XUNLOCK(fdp);
2149			(void) closef(fp, td);
2150			FILEDESC_XLOCK(fdp);
2151		}
2152	}
2153	FILEDESC_XUNLOCK(fdp);
2154}
2155
2156/*
2157 * If a specific file object occupies a specific file descriptor, close the
2158 * file descriptor entry and drop a reference on the file object.  This is a
2159 * convenience function to handle a subsequent error in a function that calls
2160 * falloc() that handles the race that another thread might have closed the
2161 * file descriptor out from under the thread creating the file object.
2162 */
2163void
2164fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
2165{
2166
2167	FILEDESC_XLOCK(fdp);
2168	if (fdp->fd_ofiles[idx].fde_file == fp) {
2169		fdfree(fdp, idx);
2170		FILEDESC_XUNLOCK(fdp);
2171		fdrop(fp, td);
2172	} else
2173		FILEDESC_XUNLOCK(fdp);
2174}
2175
2176/*
2177 * Close any files on exec?
2178 */
2179void
2180fdcloseexec(struct thread *td)
2181{
2182	struct filedesc *fdp;
2183	struct filedescent *fde;
2184	struct file *fp;
2185	int i;
2186
2187	fdp = td->td_proc->p_fd;
2188	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2189	FILEDESC_XLOCK(fdp);
2190	for (i = 0; i <= fdp->fd_lastfile; i++) {
2191		fde = &fdp->fd_ofiles[i];
2192		fp = fde->fde_file;
2193		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
2194		    (fde->fde_flags & UF_EXCLOSE))) {
2195			fdfree(fdp, i);
2196			(void) closefp(fdp, i, fp, td, 0);
2197			/* closefp() drops the FILEDESC lock. */
2198			FILEDESC_XLOCK(fdp);
2199		}
2200	}
2201	FILEDESC_XUNLOCK(fdp);
2202}
2203
2204/*
2205 * It is unsafe for set[ug]id processes to be started with file
2206 * descriptors 0..2 closed, as these descriptors are given implicit
2207 * significance in the Standard C library.  fdcheckstd() will create a
2208 * descriptor referencing /dev/null for each of stdin, stdout, and
2209 * stderr that is not already open.
2210 */
2211int
2212fdcheckstd(struct thread *td)
2213{
2214	struct filedesc *fdp;
2215	register_t retval, save;
2216	int i, error, devnull;
2217
2218	fdp = td->td_proc->p_fd;
2219	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
2220	devnull = -1;
2221	error = 0;
2222	for (i = 0; i < 3; i++) {
2223		if (fdp->fd_ofiles[i].fde_file != NULL)
2224			continue;
2225		if (devnull < 0) {
2226			save = td->td_retval[0];
2227			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
2228			    O_RDWR, 0);
2229			devnull = td->td_retval[0];
2230			td->td_retval[0] = save;
2231			if (error)
2232				break;
2233			KASSERT(devnull == i, ("oof, we didn't get our fd"));
2234		} else {
2235			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
2236			if (error != 0)
2237				break;
2238		}
2239	}
2240	return (error);
2241}
2242
2243/*
2244 * Internal form of close.  Decrement reference count on file structure.
2245 * Note: td may be NULL when closing a file that was being passed in a
2246 * message.
2247 *
2248 * XXXRW: Giant is not required for the caller, but often will be held; this
2249 * makes it moderately likely the Giant will be recursed in the VFS case.
2250 */
2251int
2252closef(struct file *fp, struct thread *td)
2253{
2254	struct vnode *vp;
2255	struct flock lf;
2256	struct filedesc_to_leader *fdtol;
2257	struct filedesc *fdp;
2258
2259	/*
2260	 * POSIX record locking dictates that any close releases ALL
2261	 * locks owned by this process.  This is handled by setting
2262	 * a flag in the unlock to free ONLY locks obeying POSIX
2263	 * semantics, and not to free BSD-style file locks.
2264	 * If the descriptor was in a message, POSIX-style locks
2265	 * aren't passed with the descriptor, and the thread pointer
2266	 * will be NULL.  Callers should be careful only to pass a
2267	 * NULL thread pointer when there really is no owning
2268	 * context that might have locks, or the locks will be
2269	 * leaked.
2270	 */
2271	if (fp->f_type == DTYPE_VNODE && td != NULL) {
2272		vp = fp->f_vnode;
2273		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
2274			lf.l_whence = SEEK_SET;
2275			lf.l_start = 0;
2276			lf.l_len = 0;
2277			lf.l_type = F_UNLCK;
2278			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
2279			    F_UNLCK, &lf, F_POSIX);
2280		}
2281		fdtol = td->td_proc->p_fdtol;
2282		if (fdtol != NULL) {
2283			/*
2284			 * Handle special case where file descriptor table is
2285			 * shared between multiple process leaders.
2286			 */
2287			fdp = td->td_proc->p_fd;
2288			FILEDESC_XLOCK(fdp);
2289			for (fdtol = fdtol->fdl_next;
2290			     fdtol != td->td_proc->p_fdtol;
2291			     fdtol = fdtol->fdl_next) {
2292				if ((fdtol->fdl_leader->p_flag &
2293				     P_ADVLOCK) == 0)
2294					continue;
2295				fdtol->fdl_holdcount++;
2296				FILEDESC_XUNLOCK(fdp);
2297				lf.l_whence = SEEK_SET;
2298				lf.l_start = 0;
2299				lf.l_len = 0;
2300				lf.l_type = F_UNLCK;
2301				vp = fp->f_vnode;
2302				(void) VOP_ADVLOCK(vp,
2303				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
2304				    F_POSIX);
2305				FILEDESC_XLOCK(fdp);
2306				fdtol->fdl_holdcount--;
2307				if (fdtol->fdl_holdcount == 0 &&
2308				    fdtol->fdl_wakeup != 0) {
2309					fdtol->fdl_wakeup = 0;
2310					wakeup(fdtol);
2311				}
2312			}
2313			FILEDESC_XUNLOCK(fdp);
2314		}
2315	}
2316	return (fdrop(fp, td));
2317}
2318
2319/*
2320 * Initialize the file pointer with the specified properties.
2321 *
2322 * The ops are set with release semantics to be certain that the flags, type,
2323 * and data are visible when ops is.  This is to prevent ops methods from being
2324 * called with bad data.
2325 */
2326void
2327finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
2328{
2329	fp->f_data = data;
2330	fp->f_flag = flag;
2331	fp->f_type = type;
2332	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
2333}
2334
2335int
2336fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
2337    int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
2338{
2339	struct file *fp;
2340	u_int count;
2341#ifdef CAPABILITIES
2342	cap_rights_t haverights;
2343	int error;
2344#endif
2345
2346	/*
2347	 * Avoid reads reordering and then a first access to the
2348	 * fdp->fd_ofiles table which could result in OOB operation.
2349	 */
2350	if (fd < 0 || fd >= atomic_load_acq_int(&fdp->fd_nfiles))
2351		return (EBADF);
2352	/*
2353	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
2354	 * never raising a refcount above 0.  To accomplish this we have
2355	 * to use a cmpset loop rather than an atomic_add.  The descriptor
2356	 * must be re-verified once we acquire a reference to be certain
2357	 * that the identity is still correct and we did not lose a race
2358	 * due to preemption.
2359	 */
2360	for (;;) {
2361		fp = fdp->fd_ofiles[fd].fde_file;
2362		if (fp == NULL)
2363			return (EBADF);
2364#ifdef CAPABILITIES
2365		haverights = *cap_rights(fdp, fd);
2366		if (needrightsp != NULL) {
2367			error = cap_check(&haverights, needrightsp);
2368			if (error != 0)
2369				return (error);
2370			if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
2371				error = cap_fcntl_check(fdp, fd, needfcntl);
2372				if (error != 0)
2373					return (error);
2374			}
2375		}
2376#endif
2377		count = fp->f_count;
2378		if (count == 0)
2379			continue;
2380		/*
2381		 * Use an acquire barrier to prevent caching of fd_ofiles
2382		 * so it is refreshed for verification.
2383		 */
2384		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
2385			continue;
2386		if (fp == fdp->fd_ofiles[fd].fde_file)
2387			break;
2388		fdrop(fp, curthread);
2389	}
2390	*fpp = fp;
2391	if (haverightsp != NULL) {
2392#ifdef CAPABILITIES
2393		*haverightsp = haverights;
2394#else
2395		CAP_ALL(haverightsp);
2396#endif
2397	}
2398	return (0);
2399}
2400
2401/*
2402 * Extract the file pointer associated with the specified descriptor for the
2403 * current user process.
2404 *
2405 * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
2406 * returned.
2407 *
2408 * File's rights will be checked against the capability rights mask.
2409 *
2410 * If an error occured the non-zero error is returned and *fpp is set to
2411 * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
2412 * responsible for fdrop().
2413 */
2414static __inline int
2415_fget(struct thread *td, int fd, struct file **fpp, int flags,
2416    cap_rights_t *needrightsp, u_char *maxprotp)
2417{
2418	struct filedesc *fdp;
2419	struct file *fp;
2420	cap_rights_t haverights, needrights;
2421	int error;
2422
2423	*fpp = NULL;
2424	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2425		return (EBADF);
2426	if (needrightsp != NULL)
2427		needrights = *needrightsp;
2428	else
2429		cap_rights_init(&needrights);
2430	if (maxprotp != NULL)
2431		cap_rights_set(&needrights, CAP_MMAP);
2432	error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
2433	if (error != 0)
2434		return (error);
2435	if (fp->f_ops == &badfileops) {
2436		fdrop(fp, td);
2437		return (EBADF);
2438	}
2439
2440#ifdef CAPABILITIES
2441	/*
2442	 * If requested, convert capability rights to access flags.
2443	 */
2444	if (maxprotp != NULL)
2445		*maxprotp = cap_rights_to_vmprot(&haverights);
2446#else /* !CAPABILITIES */
2447	if (maxprotp != NULL)
2448		*maxprotp = VM_PROT_ALL;
2449#endif /* CAPABILITIES */
2450
2451	/*
2452	 * FREAD and FWRITE failure return EBADF as per POSIX.
2453	 */
2454	error = 0;
2455	switch (flags) {
2456	case FREAD:
2457	case FWRITE:
2458		if ((fp->f_flag & flags) == 0)
2459			error = EBADF;
2460		break;
2461	case FEXEC:
2462	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
2463		    ((fp->f_flag & FWRITE) != 0))
2464			error = EBADF;
2465		break;
2466	case 0:
2467		break;
2468	default:
2469		KASSERT(0, ("wrong flags"));
2470	}
2471
2472	if (error != 0) {
2473		fdrop(fp, td);
2474		return (error);
2475	}
2476
2477	*fpp = fp;
2478	return (0);
2479}
2480
2481int
2482fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2483{
2484
2485	return(_fget(td, fd, fpp, 0, rightsp, NULL));
2486}
2487
2488int
2489fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
2490    struct file **fpp)
2491{
2492
2493	return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
2494}
2495
2496int
2497fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2498{
2499
2500	return(_fget(td, fd, fpp, FREAD, rightsp, NULL));
2501}
2502
2503int
2504fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
2505{
2506
2507	return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
2508}
2509
2510/*
2511 * Like fget() but loads the underlying vnode, or returns an error if the
2512 * descriptor does not represent a vnode.  Note that pipes use vnodes but
2513 * never have VM objects.  The returned vnode will be vref()'d.
2514 *
2515 * XXX: what about the unused flags ?
2516 */
2517static __inline int
2518_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
2519    struct vnode **vpp)
2520{
2521	struct file *fp;
2522	int error;
2523
2524	*vpp = NULL;
2525	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
2526	if (error != 0)
2527		return (error);
2528	if (fp->f_vnode == NULL) {
2529		error = EINVAL;
2530	} else {
2531		*vpp = fp->f_vnode;
2532		vref(*vpp);
2533	}
2534	fdrop(fp, td);
2535
2536	return (error);
2537}
2538
2539int
2540fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2541{
2542
2543	return (_fgetvp(td, fd, 0, rightsp, vpp));
2544}
2545
2546int
2547fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
2548    struct filecaps *havecaps, struct vnode **vpp)
2549{
2550	struct filedesc *fdp;
2551	struct file *fp;
2552#ifdef CAPABILITIES
2553	int error;
2554#endif
2555
2556	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
2557		return (EBADF);
2558
2559	fp = fget_locked(fdp, fd);
2560	if (fp == NULL || fp->f_ops == &badfileops)
2561		return (EBADF);
2562
2563#ifdef CAPABILITIES
2564	if (needrightsp != NULL) {
2565		error = cap_check(cap_rights(fdp, fd), needrightsp);
2566		if (error != 0)
2567			return (error);
2568	}
2569#endif
2570
2571	if (fp->f_vnode == NULL)
2572		return (EINVAL);
2573
2574	*vpp = fp->f_vnode;
2575	vref(*vpp);
2576	filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
2577
2578	return (0);
2579}
2580
2581int
2582fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2583{
2584
2585	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
2586}
2587
2588int
2589fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
2590{
2591
2592	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
2593}
2594
2595#ifdef notyet
2596int
2597fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
2598    struct vnode **vpp)
2599{
2600
2601	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
2602}
2603#endif
2604
2605/*
2606 * Like fget() but loads the underlying socket, or returns an error if the
2607 * descriptor does not represent a socket.
2608 *
2609 * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
2610 * in the future.
2611 *
2612 * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
2613 * on their file descriptor reference to prevent the socket from being free'd
2614 * during use.
2615 */
2616int
2617fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
2618    u_int *fflagp)
2619{
2620	struct file *fp;
2621	int error;
2622
2623	*spp = NULL;
2624	if (fflagp != NULL)
2625		*fflagp = 0;
2626	if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
2627		return (error);
2628	if (fp->f_type != DTYPE_SOCKET) {
2629		error = ENOTSOCK;
2630	} else {
2631		*spp = fp->f_data;
2632		if (fflagp)
2633			*fflagp = fp->f_flag;
2634		SOCK_LOCK(*spp);
2635		soref(*spp);
2636		SOCK_UNLOCK(*spp);
2637	}
2638	fdrop(fp, td);
2639
2640	return (error);
2641}
2642
/*
 * Drop the reference count on the socket and XXX release the SX lock in the
 * future.  The last reference closes the socket.
 *
 * Note: fputsock() is deprecated, see comment for fgetsock().
 */
void
fputsock(struct socket *so)
{

	/*
	 * Both locks are taken here and not released in this function.
	 * NOTE(review): sorele() presumably drops them -- confirm against
	 * its definition.
	 */
	ACCEPT_LOCK();
	SOCK_LOCK(so);
	/* Release the reference with the socket's vnet as current vnet. */
	CURVNET_SET(so->so_vnet);
	sorele(so);
	CURVNET_RESTORE();
}
2659
2660/*
2661 * Handle the last reference to a file being closed.
2662 */
2663int
2664_fdrop(struct file *fp, struct thread *td)
2665{
2666	int error;
2667
2668	error = 0;
2669	if (fp->f_count != 0)
2670		panic("fdrop: count %d", fp->f_count);
2671	if (fp->f_ops != &badfileops)
2672		error = fo_close(fp, td);
2673	atomic_subtract_int(&openfiles, 1);
2674	crfree(fp->f_cred);
2675	free(fp->f_advice, M_FADVISE);
2676	uma_zfree(file_zone, fp);
2677
2678	return (error);
2679}
2680
2681/*
2682 * Apply an advisory lock on a file descriptor.
2683 *
2684 * Just attempt to get a record lock of the requested type on the entire file
2685 * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
2686 */
2687#ifndef _SYS_SYSPROTO_H_
2688struct flock_args {
2689	int	fd;
2690	int	how;
2691};
2692#endif
2693/* ARGSUSED */
2694int
2695sys_flock(struct thread *td, struct flock_args *uap)
2696{
2697	struct file *fp;
2698	struct vnode *vp;
2699	struct flock lf;
2700	cap_rights_t rights;
2701	int error;
2702
2703	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
2704	if (error != 0)
2705		return (error);
2706	if (fp->f_type != DTYPE_VNODE) {
2707		fdrop(fp, td);
2708		return (EOPNOTSUPP);
2709	}
2710
2711	vp = fp->f_vnode;
2712	lf.l_whence = SEEK_SET;
2713	lf.l_start = 0;
2714	lf.l_len = 0;
2715	if (uap->how & LOCK_UN) {
2716		lf.l_type = F_UNLCK;
2717		atomic_clear_int(&fp->f_flag, FHASLOCK);
2718		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
2719		goto done2;
2720	}
2721	if (uap->how & LOCK_EX)
2722		lf.l_type = F_WRLCK;
2723	else if (uap->how & LOCK_SH)
2724		lf.l_type = F_RDLCK;
2725	else {
2726		error = EBADF;
2727		goto done2;
2728	}
2729	atomic_set_int(&fp->f_flag, FHASLOCK);
2730	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
2731	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
2732done2:
2733	fdrop(fp, td);
2734	return (error);
2735}
2736/*
2737 * Duplicate the specified descriptor to a free descriptor.
2738 */
2739int
2740dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
2741    int openerror, int *indxp)
2742{
2743	struct file *fp;
2744	int error, indx;
2745
2746	KASSERT(openerror == ENODEV || openerror == ENXIO,
2747	    ("unexpected error %d in %s", openerror, __func__));
2748
2749	/*
2750	 * If the to-be-dup'd fd number is greater than the allowed number
2751	 * of file descriptors, or the fd to be dup'd has already been
2752	 * closed, then reject.
2753	 */
2754	FILEDESC_XLOCK(fdp);
2755	if ((fp = fget_locked(fdp, dfd)) == NULL) {
2756		FILEDESC_XUNLOCK(fdp);
2757		return (EBADF);
2758	}
2759
2760	error = fdalloc(td, 0, &indx);
2761	if (error != 0) {
2762		FILEDESC_XUNLOCK(fdp);
2763		return (error);
2764	}
2765
2766	/*
2767	 * There are two cases of interest here.
2768	 *
2769	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
2770	 *
2771	 * For ENXIO steal away the file structure from (dfd) and store it in
2772	 * (indx).  (dfd) is effectively closed by this operation.
2773	 */
2774	switch (openerror) {
2775	case ENODEV:
2776		/*
2777		 * Check that the mode the file is being opened for is a
2778		 * subset of the mode of the existing descriptor.
2779		 */
2780		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
2781			fdunused(fdp, indx);
2782			FILEDESC_XUNLOCK(fdp);
2783			return (EACCES);
2784		}
2785		fhold(fp);
2786		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2787		filecaps_copy(&fdp->fd_ofiles[dfd].fde_caps,
2788		    &fdp->fd_ofiles[indx].fde_caps);
2789		break;
2790	case ENXIO:
2791		/*
2792		 * Steal away the file pointer from dfd and stuff it into indx.
2793		 */
2794		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
2795		bzero(&fdp->fd_ofiles[dfd], sizeof(fdp->fd_ofiles[dfd]));
2796		fdunused(fdp, dfd);
2797		break;
2798	}
2799	FILEDESC_XUNLOCK(fdp);
2800	*indxp = indx;
2801	return (0);
2802}
2803
2804/*
2805 * Scan all active processes and prisons to see if any of them have a current
2806 * or root directory of `olddp'. If so, replace them with the new mount point.
2807 */
2808void
2809mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
2810{
2811	struct filedesc *fdp;
2812	struct prison *pr;
2813	struct proc *p;
2814	int nrele;
2815
2816	if (vrefcnt(olddp) == 1)
2817		return;
2818	nrele = 0;
2819	sx_slock(&allproc_lock);
2820	FOREACH_PROC_IN_SYSTEM(p) {
2821		fdp = fdhold(p);
2822		if (fdp == NULL)
2823			continue;
2824		FILEDESC_XLOCK(fdp);
2825		if (fdp->fd_cdir == olddp) {
2826			vref(newdp);
2827			fdp->fd_cdir = newdp;
2828			nrele++;
2829		}
2830		if (fdp->fd_rdir == olddp) {
2831			vref(newdp);
2832			fdp->fd_rdir = newdp;
2833			nrele++;
2834		}
2835		if (fdp->fd_jdir == olddp) {
2836			vref(newdp);
2837			fdp->fd_jdir = newdp;
2838			nrele++;
2839		}
2840		FILEDESC_XUNLOCK(fdp);
2841		fddrop(fdp);
2842	}
2843	sx_sunlock(&allproc_lock);
2844	if (rootvnode == olddp) {
2845		vref(newdp);
2846		rootvnode = newdp;
2847		nrele++;
2848	}
2849	mtx_lock(&prison0.pr_mtx);
2850	if (prison0.pr_root == olddp) {
2851		vref(newdp);
2852		prison0.pr_root = newdp;
2853		nrele++;
2854	}
2855	mtx_unlock(&prison0.pr_mtx);
2856	sx_slock(&allprison_lock);
2857	TAILQ_FOREACH(pr, &allprison, pr_list) {
2858		mtx_lock(&pr->pr_mtx);
2859		if (pr->pr_root == olddp) {
2860			vref(newdp);
2861			pr->pr_root = newdp;
2862			nrele++;
2863		}
2864		mtx_unlock(&pr->pr_mtx);
2865	}
2866	sx_sunlock(&allprison_lock);
2867	while (nrele--)
2868		vrele(olddp);
2869}
2870
2871struct filedesc_to_leader *
2872filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
2873{
2874	struct filedesc_to_leader *fdtol;
2875
2876	fdtol = malloc(sizeof(struct filedesc_to_leader),
2877	       M_FILEDESC_TO_LEADER,
2878	       M_WAITOK);
2879	fdtol->fdl_refcount = 1;
2880	fdtol->fdl_holdcount = 0;
2881	fdtol->fdl_wakeup = 0;
2882	fdtol->fdl_leader = leader;
2883	if (old != NULL) {
2884		FILEDESC_XLOCK(fdp);
2885		fdtol->fdl_next = old->fdl_next;
2886		fdtol->fdl_prev = old;
2887		old->fdl_next = fdtol;
2888		fdtol->fdl_next->fdl_prev = fdtol;
2889		FILEDESC_XUNLOCK(fdp);
2890	} else {
2891		fdtol->fdl_next = fdtol;
2892		fdtol->fdl_prev = fdtol;
2893	}
2894	return (fdtol);
2895}
2896
2897/*
2898 * Get file structures globally.
2899 */
2900static int
2901sysctl_kern_file(SYSCTL_HANDLER_ARGS)
2902{
2903	struct xfile xf;
2904	struct filedesc *fdp;
2905	struct file *fp;
2906	struct proc *p;
2907	int error, n;
2908
2909	error = sysctl_wire_old_buffer(req, 0);
2910	if (error != 0)
2911		return (error);
2912	if (req->oldptr == NULL) {
2913		n = 0;
2914		sx_slock(&allproc_lock);
2915		FOREACH_PROC_IN_SYSTEM(p) {
2916			if (p->p_state == PRS_NEW)
2917				continue;
2918			fdp = fdhold(p);
2919			if (fdp == NULL)
2920				continue;
2921			/* overestimates sparse tables. */
2922			if (fdp->fd_lastfile > 0)
2923				n += fdp->fd_lastfile;
2924			fddrop(fdp);
2925		}
2926		sx_sunlock(&allproc_lock);
2927		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
2928	}
2929	error = 0;
2930	bzero(&xf, sizeof(xf));
2931	xf.xf_size = sizeof(xf);
2932	sx_slock(&allproc_lock);
2933	FOREACH_PROC_IN_SYSTEM(p) {
2934		PROC_LOCK(p);
2935		if (p->p_state == PRS_NEW) {
2936			PROC_UNLOCK(p);
2937			continue;
2938		}
2939		if (p_cansee(req->td, p) != 0) {
2940			PROC_UNLOCK(p);
2941			continue;
2942		}
2943		xf.xf_pid = p->p_pid;
2944		xf.xf_uid = p->p_ucred->cr_uid;
2945		PROC_UNLOCK(p);
2946		fdp = fdhold(p);
2947		if (fdp == NULL)
2948			continue;
2949		FILEDESC_SLOCK(fdp);
2950		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
2951			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
2952				continue;
2953			xf.xf_fd = n;
2954			xf.xf_file = fp;
2955			xf.xf_data = fp->f_data;
2956			xf.xf_vnode = fp->f_vnode;
2957			xf.xf_type = fp->f_type;
2958			xf.xf_count = fp->f_count;
2959			xf.xf_msgcount = 0;
2960			xf.xf_offset = foffset_get(fp);
2961			xf.xf_flag = fp->f_flag;
2962			error = SYSCTL_OUT(req, &xf, sizeof(xf));
2963			if (error)
2964				break;
2965		}
2966		FILEDESC_SUNLOCK(fdp);
2967		fddrop(fdp);
2968		if (error)
2969			break;
2970	}
2971	sx_sunlock(&allproc_lock);
2972	return (error);
2973}
2974
2975SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
2976    0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
2977
/*
 * When KINFO_OFILE_SIZE is defined, check at compile time that
 * struct kinfo_ofile has exactly that size.
 */
#ifdef KINFO_OFILE_SIZE
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif
2981
2982#ifdef COMPAT_FREEBSD7
2983static int
2984export_vnode_for_osysctl(struct vnode *vp, int type,
2985    struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
2986{
2987	int error;
2988	char *fullpath, *freepath;
2989
2990	bzero(kif, sizeof(*kif));
2991	kif->kf_structsize = sizeof(*kif);
2992
2993	vref(vp);
2994	kif->kf_fd = type;
2995	kif->kf_type = KF_TYPE_VNODE;
2996	/* This function only handles directories. */
2997	if (vp->v_type != VDIR) {
2998		vrele(vp);
2999		return (ENOTDIR);
3000	}
3001	kif->kf_vnode_type = KF_VTYPE_VDIR;
3002
3003	/*
3004	 * This is not a true file descriptor, so we set a bogus refcount
3005	 * and offset to indicate these fields should be ignored.
3006	 */
3007	kif->kf_ref_count = -1;
3008	kif->kf_offset = -1;
3009
3010	freepath = NULL;
3011	fullpath = "-";
3012	FILEDESC_SUNLOCK(fdp);
3013	vn_fullpath(curthread, vp, &fullpath, &freepath);
3014	vrele(vp);
3015	strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
3016	if (freepath != NULL)
3017		free(freepath, M_TEMP);
3018	error = SYSCTL_OUT(req, kif, sizeof(*kif));
3019	FILEDESC_SLOCK(fdp);
3020	return (error);
3021}
3022
3023/*
3024 * Get per-process file descriptors for use by procstat(1), et al.
3025 */
3026static int
3027sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
3028{
3029	char *fullpath, *freepath;
3030	struct kinfo_ofile *kif;
3031	struct filedesc *fdp;
3032	int error, i, *name;
3033	struct shmfd *shmfd;
3034	struct socket *so;
3035	struct vnode *vp;
3036	struct ksem *ks;
3037	struct file *fp;
3038	struct proc *p;
3039	struct tty *tp;
3040
3041	name = (int *)arg1;
3042	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
3043	if (error != 0)
3044		return (error);
3045	fdp = fdhold(p);
3046	PROC_UNLOCK(p);
3047	if (fdp == NULL)
3048		return (ENOENT);
3049	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
3050	FILEDESC_SLOCK(fdp);
3051	if (fdp->fd_cdir != NULL)
3052		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
3053				fdp, req);
3054	if (fdp->fd_rdir != NULL)
3055		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
3056				fdp, req);
3057	if (fdp->fd_jdir != NULL)
3058		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
3059				fdp, req);
3060	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
3061		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3062			continue;
3063		bzero(kif, sizeof(*kif));
3064		kif->kf_structsize = sizeof(*kif);
3065		ks = NULL;
3066		vp = NULL;
3067		so = NULL;
3068		tp = NULL;
3069		shmfd = NULL;
3070		kif->kf_fd = i;
3071
3072		switch (fp->f_type) {
3073		case DTYPE_VNODE:
3074			kif->kf_type = KF_TYPE_VNODE;
3075			vp = fp->f_vnode;
3076			break;
3077
3078		case DTYPE_SOCKET:
3079			kif->kf_type = KF_TYPE_SOCKET;
3080			so = fp->f_data;
3081			break;
3082
3083		case DTYPE_PIPE:
3084			kif->kf_type = KF_TYPE_PIPE;
3085			break;
3086
3087		case DTYPE_FIFO:
3088			kif->kf_type = KF_TYPE_FIFO;
3089			vp = fp->f_vnode;
3090			break;
3091
3092		case DTYPE_KQUEUE:
3093			kif->kf_type = KF_TYPE_KQUEUE;
3094			break;
3095
3096		case DTYPE_CRYPTO:
3097			kif->kf_type = KF_TYPE_CRYPTO;
3098			break;
3099
3100		case DTYPE_MQUEUE:
3101			kif->kf_type = KF_TYPE_MQUEUE;
3102			break;
3103
3104		case DTYPE_SHM:
3105			kif->kf_type = KF_TYPE_SHM;
3106			shmfd = fp->f_data;
3107			break;
3108
3109		case DTYPE_SEM:
3110			kif->kf_type = KF_TYPE_SEM;
3111			ks = fp->f_data;
3112			break;
3113
3114		case DTYPE_PTS:
3115			kif->kf_type = KF_TYPE_PTS;
3116			tp = fp->f_data;
3117			break;
3118
3119#ifdef PROCDESC
3120		case DTYPE_PROCDESC:
3121			kif->kf_type = KF_TYPE_PROCDESC;
3122			break;
3123#endif
3124
3125		default:
3126			kif->kf_type = KF_TYPE_UNKNOWN;
3127			break;
3128		}
3129		kif->kf_ref_count = fp->f_count;
3130		if (fp->f_flag & FREAD)
3131			kif->kf_flags |= KF_FLAG_READ;
3132		if (fp->f_flag & FWRITE)
3133			kif->kf_flags |= KF_FLAG_WRITE;
3134		if (fp->f_flag & FAPPEND)
3135			kif->kf_flags |= KF_FLAG_APPEND;
3136		if (fp->f_flag & FASYNC)
3137			kif->kf_flags |= KF_FLAG_ASYNC;
3138		if (fp->f_flag & FFSYNC)
3139			kif->kf_flags |= KF_FLAG_FSYNC;
3140		if (fp->f_flag & FNONBLOCK)
3141			kif->kf_flags |= KF_FLAG_NONBLOCK;
3142		if (fp->f_flag & O_DIRECT)
3143			kif->kf_flags |= KF_FLAG_DIRECT;
3144		if (fp->f_flag & FHASLOCK)
3145			kif->kf_flags |= KF_FLAG_HASLOCK;
3146		kif->kf_offset = foffset_get(fp);
3147		if (vp != NULL) {
3148			vref(vp);
3149			switch (vp->v_type) {
3150			case VNON:
3151				kif->kf_vnode_type = KF_VTYPE_VNON;
3152				break;
3153			case VREG:
3154				kif->kf_vnode_type = KF_VTYPE_VREG;
3155				break;
3156			case VDIR:
3157				kif->kf_vnode_type = KF_VTYPE_VDIR;
3158				break;
3159			case VBLK:
3160				kif->kf_vnode_type = KF_VTYPE_VBLK;
3161				break;
3162			case VCHR:
3163				kif->kf_vnode_type = KF_VTYPE_VCHR;
3164				break;
3165			case VLNK:
3166				kif->kf_vnode_type = KF_VTYPE_VLNK;
3167				break;
3168			case VSOCK:
3169				kif->kf_vnode_type = KF_VTYPE_VSOCK;
3170				break;
3171			case VFIFO:
3172				kif->kf_vnode_type = KF_VTYPE_VFIFO;
3173				break;
3174			case VBAD:
3175				kif->kf_vnode_type = KF_VTYPE_VBAD;
3176				break;
3177			default:
3178				kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
3179				break;
3180			}
3181			/*
3182			 * It is OK to drop the filedesc lock here as we will
3183			 * re-validate and re-evaluate its properties when
3184			 * the loop continues.
3185			 */
3186			freepath = NULL;
3187			fullpath = "-";
3188			FILEDESC_SUNLOCK(fdp);
3189			vn_fullpath(curthread, vp, &fullpath, &freepath);
3190			vrele(vp);
3191			strlcpy(kif->kf_path, fullpath,
3192			    sizeof(kif->kf_path));
3193			if (freepath != NULL)
3194				free(freepath, M_TEMP);
3195			FILEDESC_SLOCK(fdp);
3196		}
3197		if (so != NULL) {
3198			struct sockaddr *sa;
3199
3200			if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
3201			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3202				bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3203				free(sa, M_SONAME);
3204			}
3205			if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
3206			    == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3207				bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3208				free(sa, M_SONAME);
3209			}
3210			kif->kf_sock_domain =
3211			    so->so_proto->pr_domain->dom_family;
3212			kif->kf_sock_type = so->so_type;
3213			kif->kf_sock_protocol = so->so_proto->pr_protocol;
3214		}
3215		if (tp != NULL) {
3216			strlcpy(kif->kf_path, tty_devname(tp),
3217			    sizeof(kif->kf_path));
3218		}
3219		if (shmfd != NULL)
3220			shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
3221		if (ks != NULL && ksem_info != NULL)
3222			ksem_info(ks, kif->kf_path, sizeof(kif->kf_path), NULL);
3223		error = SYSCTL_OUT(req, kif, sizeof(*kif));
3224		if (error)
3225			break;
3226	}
3227	FILEDESC_SUNLOCK(fdp);
3228	fddrop(fdp);
3229	free(kif, M_TEMP);
3230	return (0);
3231}
3232
3233static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
3234    CTLFLAG_RD||CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
3235    "Process ofiledesc entries");
3236#endif	/* COMPAT_FREEBSD7 */
3237
/*
 * When KINFO_FILE_SIZE is defined, check at compile time that
 * struct kinfo_file has exactly that size.
 */
#ifdef KINFO_FILE_SIZE
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
3241
/*
 * State shared by the kinfo_file export routines while records are
 * appended to an sbuf.
 */
struct export_fd_buf {
	/* Filedesc whose shared lock the caller holds, or NULL if none. */
	struct filedesc		*fdp;
	/* Destination buffer for the packed records. */
	struct sbuf 		*sb;
	/* Bytes still allowed; -1 means no limit, 0 ends the export. */
	ssize_t			remainder;
	/* Scratch record, rebuilt for every exported entry. */
	struct kinfo_file	kif;
};
3248
3249static int
3250export_fd_to_sb(void *data, int type, int fd, int fflags, int refcnt,
3251    int64_t offset, cap_rights_t *rightsp, struct export_fd_buf *efbuf)
3252{
3253	struct {
3254		int	fflag;
3255		int	kf_fflag;
3256	} fflags_table[] = {
3257		{ FAPPEND, KF_FLAG_APPEND },
3258		{ FASYNC, KF_FLAG_ASYNC },
3259		{ FFSYNC, KF_FLAG_FSYNC },
3260		{ FHASLOCK, KF_FLAG_HASLOCK },
3261		{ FNONBLOCK, KF_FLAG_NONBLOCK },
3262		{ FREAD, KF_FLAG_READ },
3263		{ FWRITE, KF_FLAG_WRITE },
3264		{ O_CREAT, KF_FLAG_CREAT },
3265		{ O_DIRECT, KF_FLAG_DIRECT },
3266		{ O_EXCL, KF_FLAG_EXCL },
3267		{ O_EXEC, KF_FLAG_EXEC },
3268		{ O_EXLOCK, KF_FLAG_EXLOCK },
3269		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
3270		{ O_SHLOCK, KF_FLAG_SHLOCK },
3271		{ O_TRUNC, KF_FLAG_TRUNC }
3272	};
3273#define	NFFLAGS	(sizeof(fflags_table) / sizeof(*fflags_table))
3274	struct kinfo_file *kif;
3275	struct vnode *vp;
3276	int error, locked;
3277	unsigned int i;
3278
3279	if (efbuf->remainder == 0)
3280		return (0);
3281	kif = &efbuf->kif;
3282	bzero(kif, sizeof(*kif));
3283	locked = efbuf->fdp != NULL;
3284	switch (type) {
3285	case KF_TYPE_FIFO:
3286	case KF_TYPE_VNODE:
3287		if (locked) {
3288			FILEDESC_SUNLOCK(efbuf->fdp);
3289			locked = 0;
3290		}
3291		vp = (struct vnode *)data;
3292		error = fill_vnode_info(vp, kif);
3293		vrele(vp);
3294		break;
3295	case KF_TYPE_SOCKET:
3296		error = fill_socket_info((struct socket *)data, kif);
3297		break;
3298	case KF_TYPE_PIPE:
3299		error = fill_pipe_info((struct pipe *)data, kif);
3300		break;
3301	case KF_TYPE_PTS:
3302		error = fill_pts_info((struct tty *)data, kif);
3303		break;
3304	case KF_TYPE_PROCDESC:
3305		error = fill_procdesc_info((struct procdesc *)data, kif);
3306		break;
3307	case KF_TYPE_SEM:
3308		error = fill_sem_info((struct file *)data, kif);
3309		break;
3310	case KF_TYPE_SHM:
3311		error = fill_shm_info((struct file *)data, kif);
3312		break;
3313	default:
3314		error = 0;
3315	}
3316	if (error == 0)
3317		kif->kf_status |= KF_ATTR_VALID;
3318
3319	/*
3320	 * Translate file access flags.
3321	 */
3322	for (i = 0; i < NFFLAGS; i++)
3323		if (fflags & fflags_table[i].fflag)
3324			kif->kf_flags |=  fflags_table[i].kf_fflag;
3325	if (rightsp != NULL)
3326		kif->kf_cap_rights = *rightsp;
3327	else
3328		cap_rights_init(&kif->kf_cap_rights);
3329	kif->kf_fd = fd;
3330	kif->kf_type = type;
3331	kif->kf_ref_count = refcnt;
3332	kif->kf_offset = offset;
3333	/* Pack record size down */
3334	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
3335	    strlen(kif->kf_path) + 1;
3336	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
3337	if (efbuf->remainder != -1) {
3338		if (efbuf->remainder < kif->kf_structsize) {
3339			/* Terminate export. */
3340			efbuf->remainder = 0;
3341			if (efbuf->fdp != NULL && !locked)
3342				FILEDESC_SLOCK(efbuf->fdp);
3343			return (0);
3344		}
3345		efbuf->remainder -= kif->kf_structsize;
3346	}
3347	if (locked)
3348		FILEDESC_SUNLOCK(efbuf->fdp);
3349	error = sbuf_bcat(efbuf->sb, kif, kif->kf_structsize);
3350	if (efbuf->fdp != NULL)
3351		FILEDESC_SLOCK(efbuf->fdp);
3352	return (error);
3353}
3354
3355/*
3356 * Store a process file descriptor information to sbuf.
3357 *
3358 * Takes a locked proc as argument, and returns with the proc unlocked.
3359 */
3360int
3361kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
3362{
3363	struct file *fp;
3364	struct filedesc *fdp;
3365	struct export_fd_buf *efbuf;
3366	struct vnode *cttyvp, *textvp, *tracevp;
3367	int64_t offset;
3368	void *data;
3369	int error, i;
3370	int type, refcnt, fflags;
3371	cap_rights_t rights;
3372
3373	PROC_LOCK_ASSERT(p, MA_OWNED);
3374
3375	/* ktrace vnode */
3376	tracevp = p->p_tracevp;
3377	if (tracevp != NULL)
3378		vref(tracevp);
3379	/* text vnode */
3380	textvp = p->p_textvp;
3381	if (textvp != NULL)
3382		vref(textvp);
3383	/* Controlling tty. */
3384	cttyvp = NULL;
3385	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
3386		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
3387		if (cttyvp != NULL)
3388			vref(cttyvp);
3389	}
3390	fdp = fdhold(p);
3391	PROC_UNLOCK(p);
3392	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
3393	efbuf->fdp = NULL;
3394	efbuf->sb = sb;
3395	efbuf->remainder = maxlen;
3396	if (tracevp != NULL)
3397		export_fd_to_sb(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
3398		    FREAD | FWRITE, -1, -1, NULL, efbuf);
3399	if (textvp != NULL)
3400		export_fd_to_sb(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
3401		    FREAD, -1, -1, NULL, efbuf);
3402	if (cttyvp != NULL)
3403		export_fd_to_sb(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
3404		    FREAD | FWRITE, -1, -1, NULL, efbuf);
3405	error = 0;
3406	if (fdp == NULL)
3407		goto fail;
3408	efbuf->fdp = fdp;
3409	FILEDESC_SLOCK(fdp);
3410	/* working directory */
3411	if (fdp->fd_cdir != NULL) {
3412		vref(fdp->fd_cdir);
3413		data = fdp->fd_cdir;
3414		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
3415		    FREAD, -1, -1, NULL, efbuf);
3416	}
3417	/* root directory */
3418	if (fdp->fd_rdir != NULL) {
3419		vref(fdp->fd_rdir);
3420		data = fdp->fd_rdir;
3421		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
3422		    FREAD, -1, -1, NULL, efbuf);
3423	}
3424	/* jail directory */
3425	if (fdp->fd_jdir != NULL) {
3426		vref(fdp->fd_jdir);
3427		data = fdp->fd_jdir;
3428		export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
3429		    FREAD, -1, -1, NULL, efbuf);
3430	}
3431	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
3432		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
3433			continue;
3434		data = NULL;
3435#ifdef CAPABILITIES
3436		rights = *cap_rights(fdp, i);
3437#else /* !CAPABILITIES */
3438		cap_rights_init(&rights);
3439#endif
3440		switch (fp->f_type) {
3441		case DTYPE_VNODE:
3442			type = KF_TYPE_VNODE;
3443			vref(fp->f_vnode);
3444			data = fp->f_vnode;
3445			break;
3446
3447		case DTYPE_SOCKET:
3448			type = KF_TYPE_SOCKET;
3449			data = fp->f_data;
3450			break;
3451
3452		case DTYPE_PIPE:
3453			type = KF_TYPE_PIPE;
3454			data = fp->f_data;
3455			break;
3456
3457		case DTYPE_FIFO:
3458			type = KF_TYPE_FIFO;
3459			vref(fp->f_vnode);
3460			data = fp->f_vnode;
3461			break;
3462
3463		case DTYPE_KQUEUE:
3464			type = KF_TYPE_KQUEUE;
3465			break;
3466
3467		case DTYPE_CRYPTO:
3468			type = KF_TYPE_CRYPTO;
3469			break;
3470
3471		case DTYPE_MQUEUE:
3472			type = KF_TYPE_MQUEUE;
3473			break;
3474
3475		case DTYPE_SHM:
3476			type = KF_TYPE_SHM;
3477			data = fp;
3478			break;
3479
3480		case DTYPE_SEM:
3481			type = KF_TYPE_SEM;
3482			data = fp;
3483			break;
3484
3485		case DTYPE_PTS:
3486			type = KF_TYPE_PTS;
3487			data = fp->f_data;
3488			break;
3489
3490#ifdef PROCDESC
3491		case DTYPE_PROCDESC:
3492			type = KF_TYPE_PROCDESC;
3493			data = fp->f_data;
3494			break;
3495#endif
3496
3497		default:
3498			type = KF_TYPE_UNKNOWN;
3499			break;
3500		}
3501		refcnt = fp->f_count;
3502		fflags = fp->f_flag;
3503		offset = foffset_get(fp);
3504
3505		/*
3506		 * Create sysctl entry.
3507		 * It is OK to drop the filedesc lock here as we will
3508		 * re-validate and re-evaluate its properties when
3509		 * the loop continues.
3510		 */
3511		error = export_fd_to_sb(data, type, i, fflags, refcnt,
3512		    offset, &rights, efbuf);
3513		if (error != 0)
3514			break;
3515	}
3516	FILEDESC_SUNLOCK(fdp);
3517	fddrop(fdp);
3518fail:
3519	free(efbuf, M_TEMP);
3520	return (error);
3521}
3522
3523#define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
3524
3525/*
3526 * Get per-process file descriptors for use by procstat(1), et al.
3527 */
3528static int
3529sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
3530{
3531	struct sbuf sb;
3532	struct proc *p;
3533	ssize_t maxlen;
3534	int error, error2, *name;
3535
3536	name = (int *)arg1;
3537
3538	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
3539	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
3540	if (error != 0) {
3541		sbuf_delete(&sb);
3542		return (error);
3543	}
3544	maxlen = req->oldptr != NULL ? req->oldlen : -1;
3545	error = kern_proc_filedesc_out(p, &sb, maxlen);
3546	error2 = sbuf_finish(&sb);
3547	sbuf_delete(&sb);
3548	return (error != 0 ? error : error2);
3549}
3550
3551int
3552vntype_to_kinfo(int vtype)
3553{
3554	struct {
3555		int	vtype;
3556		int	kf_vtype;
3557	} vtypes_table[] = {
3558		{ VBAD, KF_VTYPE_VBAD },
3559		{ VBLK, KF_VTYPE_VBLK },
3560		{ VCHR, KF_VTYPE_VCHR },
3561		{ VDIR, KF_VTYPE_VDIR },
3562		{ VFIFO, KF_VTYPE_VFIFO },
3563		{ VLNK, KF_VTYPE_VLNK },
3564		{ VNON, KF_VTYPE_VNON },
3565		{ VREG, KF_VTYPE_VREG },
3566		{ VSOCK, KF_VTYPE_VSOCK }
3567	};
3568#define	NVTYPES	(sizeof(vtypes_table) / sizeof(*vtypes_table))
3569	unsigned int i;
3570
3571	/*
3572	 * Perform vtype translation.
3573	 */
3574	for (i = 0; i < NVTYPES; i++)
3575		if (vtypes_table[i].vtype == vtype)
3576			break;
3577	if (i < NVTYPES)
3578		return (vtypes_table[i].kf_vtype);
3579
3580	return (KF_VTYPE_UNKNOWN);
3581}
3582
3583static int
3584fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
3585{
3586	struct vattr va;
3587	char *fullpath, *freepath;
3588	int error;
3589
3590	if (vp == NULL)
3591		return (1);
3592	kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
3593	freepath = NULL;
3594	fullpath = "-";
3595	error = vn_fullpath(curthread, vp, &fullpath, &freepath);
3596	if (error == 0) {
3597		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
3598	}
3599	if (freepath != NULL)
3600		free(freepath, M_TEMP);
3601
3602	/*
3603	 * Retrieve vnode attributes.
3604	 */
3605	va.va_fsid = VNOVAL;
3606	va.va_rdev = NODEV;
3607	vn_lock(vp, LK_SHARED | LK_RETRY);
3608	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
3609	VOP_UNLOCK(vp, 0);
3610	if (error != 0)
3611		return (error);
3612	if (va.va_fsid != VNOVAL)
3613		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
3614	else
3615		kif->kf_un.kf_file.kf_file_fsid =
3616		    vp->v_mount->mnt_stat.f_fsid.val[0];
3617	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
3618	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
3619	kif->kf_un.kf_file.kf_file_size = va.va_size;
3620	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
3621	return (0);
3622}
3623
3624static int
3625fill_socket_info(struct socket *so, struct kinfo_file *kif)
3626{
3627	struct sockaddr *sa;
3628	struct inpcb *inpcb;
3629	struct unpcb *unpcb;
3630	int error;
3631
3632	if (so == NULL)
3633		return (1);
3634	kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
3635	kif->kf_sock_type = so->so_type;
3636	kif->kf_sock_protocol = so->so_proto->pr_protocol;
3637	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
3638	switch(kif->kf_sock_domain) {
3639	case AF_INET:
3640	case AF_INET6:
3641		if (kif->kf_sock_protocol == IPPROTO_TCP) {
3642			if (so->so_pcb != NULL) {
3643				inpcb = (struct inpcb *)(so->so_pcb);
3644				kif->kf_un.kf_sock.kf_sock_inpcb =
3645				    (uintptr_t)inpcb->inp_ppcb;
3646			}
3647		}
3648		break;
3649	case AF_UNIX:
3650		if (so->so_pcb != NULL) {
3651			unpcb = (struct unpcb *)(so->so_pcb);
3652			if (unpcb->unp_conn) {
3653				kif->kf_un.kf_sock.kf_sock_unpconn =
3654				    (uintptr_t)unpcb->unp_conn;
3655				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
3656				    so->so_rcv.sb_state;
3657				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
3658				    so->so_snd.sb_state;
3659			}
3660		}
3661		break;
3662	}
3663	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
3664	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
3665		bcopy(sa, &kif->kf_sa_local, sa->sa_len);
3666		free(sa, M_SONAME);
3667	}
3668	error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
3669	if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
3670		bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
3671		free(sa, M_SONAME);
3672	}
3673	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
3674	    sizeof(kif->kf_path));
3675	return (0);
3676}
3677
3678static int
3679fill_pts_info(struct tty *tp, struct kinfo_file *kif)
3680{
3681
3682	if (tp == NULL)
3683		return (1);
3684	kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
3685	strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
3686	return (0);
3687}
3688
3689static int
3690fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
3691{
3692
3693	if (pi == NULL)
3694		return (1);
3695	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
3696	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
3697	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
3698	return (0);
3699}
3700
3701static int
3702fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
3703{
3704
3705	if (pdp == NULL)
3706		return (1);
3707	kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
3708	return (0);
3709}
3710
3711static int
3712fill_sem_info(struct file *fp, struct kinfo_file *kif)
3713{
3714	struct thread *td;
3715	struct stat sb;
3716
3717	td = curthread;
3718	if (fp->f_data == NULL)
3719		return (1);
3720	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3721		return (1);
3722	if (ksem_info == NULL)
3723		return (1);
3724	ksem_info(fp->f_data, kif->kf_path, sizeof(kif->kf_path),
3725	    &kif->kf_un.kf_sem.kf_sem_value);
3726	kif->kf_un.kf_sem.kf_sem_mode = sb.st_mode;
3727	return (0);
3728}
3729
3730static int
3731fill_shm_info(struct file *fp, struct kinfo_file *kif)
3732{
3733	struct thread *td;
3734	struct stat sb;
3735
3736	td = curthread;
3737	if (fp->f_data == NULL)
3738		return (1);
3739	if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
3740		return (1);
3741	shm_path(fp->f_data, kif->kf_path, sizeof(kif->kf_path));
3742	kif->kf_un.kf_file.kf_file_mode = sb.st_mode;
3743	kif->kf_un.kf_file.kf_file_size = sb.st_size;
3744	return (0);
3745}
3746
/*
 * Sysctl node "filedesc" (KERN_PROC_FILEDESC) under the _kern_proc parent.
 * It is read-only and MP-safe, and requests are served by
 * sysctl_kern_proc_filedesc().
 */
static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
    CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
    "Process filedesc entries");
3750
3751#ifdef DDB
3752/*
3753 * For the purposes of debugging, generate a human-readable string for the
3754 * file type.
3755 */
3756static const char *
3757file_type_to_name(short type)
3758{
3759
3760	switch (type) {
3761	case 0:
3762		return ("zero");
3763	case DTYPE_VNODE:
3764		return ("vnod");
3765	case DTYPE_SOCKET:
3766		return ("sock");
3767	case DTYPE_PIPE:
3768		return ("pipe");
3769	case DTYPE_FIFO:
3770		return ("fifo");
3771	case DTYPE_KQUEUE:
3772		return ("kque");
3773	case DTYPE_CRYPTO:
3774		return ("crpt");
3775	case DTYPE_MQUEUE:
3776		return ("mque");
3777	case DTYPE_SHM:
3778		return ("shm");
3779	case DTYPE_SEM:
3780		return ("ksem");
3781	default:
3782		return ("unkn");
3783	}
3784}
3785
3786/*
3787 * For the purposes of debugging, identify a process (if any, perhaps one of
3788 * many) that references the passed file in its file descriptor array. Return
3789 * NULL if none.
3790 */
3791static struct proc *
3792file_to_first_proc(struct file *fp)
3793{
3794	struct filedesc *fdp;
3795	struct proc *p;
3796	int n;
3797
3798	FOREACH_PROC_IN_SYSTEM(p) {
3799		if (p->p_state == PRS_NEW)
3800			continue;
3801		fdp = p->p_fd;
3802		if (fdp == NULL)
3803			continue;
3804		for (n = 0; n <= fdp->fd_lastfile; n++) {
3805			if (fp == fdp->fd_ofiles[n].fde_file)
3806				return (p);
3807		}
3808	}
3809	return (NULL);
3810}
3811
3812static void
3813db_print_file(struct file *fp, int header)
3814{
3815	struct proc *p;
3816
3817	if (header)
3818		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
3819		    "File", "Type", "Data", "Flag", "GCFl", "Count",
3820		    "MCount", "Vnode", "FPID", "FCmd");
3821	p = file_to_first_proc(fp);
3822	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
3823	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
3824	    0, fp->f_count, 0, fp->f_vnode,
3825	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
3826}
3827
3828DB_SHOW_COMMAND(file, db_show_file)
3829{
3830	struct file *fp;
3831
3832	if (!have_addr) {
3833		db_printf("usage: show file <addr>\n");
3834		return;
3835	}
3836	fp = (struct file *)addr;
3837	db_print_file(fp, 1);
3838}
3839
3840DB_SHOW_COMMAND(files, db_show_files)
3841{
3842	struct filedesc *fdp;
3843	struct file *fp;
3844	struct proc *p;
3845	int header;
3846	int n;
3847
3848	header = 1;
3849	FOREACH_PROC_IN_SYSTEM(p) {
3850		if (p->p_state == PRS_NEW)
3851			continue;
3852		if ((fdp = p->p_fd) == NULL)
3853			continue;
3854		for (n = 0; n <= fdp->fd_lastfile; ++n) {
3855			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
3856				continue;
3857			db_print_file(fp, header);
3858			header = 0;
3859		}
3860	}
3861}
3862#endif
3863
/* Read-write integer sysctl under _kern exposing maxfilesperproc. */
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
    &maxfilesperproc, 0, "Maximum files allowed open per process");
3866
/* Read-write integer sysctl under _kern exposing maxfiles. */
SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
    &maxfiles, 0, "Maximum number of files");
3869
/*
 * Read-only integer sysctl under _kern exposing openfiles, with an
 * automatically assigned OID.
 * NOTE(review): __DEVOLATILE presumably strips a volatile qualifier from
 * openfiles so its address can be passed as int * -- confirm against the
 * declaration.
 */
SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
    __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
3872
/*
 * One-time initialization, registered with SYSINIT at SI_SUB_LOCK /
 * SI_ORDER_FIRST: create the "Files" UMA zone, sized for struct file and
 * pointer-aligned, and initialize the sigio_lock and fdesc_mtx mutexes.
 * The argument is unused.
 */
/* ARGSUSED*/
static void
filelistinit(void *dummy)
{

	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
3884
3885/*-------------------------------------------------------------------*/
3886
/*
 * Read/write method of badfileops (used for both fo_read and fo_write):
 * ignores its arguments and always fails with EBADF.
 */
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{

	return (EBADF);
}
3894
/*
 * Truncate method of badfileops: ignores its arguments and always fails
 * with EINVAL (unlike most badfileops methods, which return EBADF).
 */
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}
3902
/*
 * Ioctl method of badfileops: ignores its arguments and always fails with
 * EBADF.
 */
static int
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}
3910
/*
 * Poll method of badfileops: ignores its arguments and always returns 0.
 * NOTE(review): presumably the return value is an event mask, so 0 means
 * "no events ready" rather than success -- confirm against fo_poll callers.
 */
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{

	return (0);
}
3918
/*
 * Kqfilter method of badfileops: ignores its arguments and always fails
 * with EBADF.
 */
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{

	return (EBADF);
}
3925
/*
 * Stat method of badfileops: ignores its arguments, leaves *sb untouched
 * and always fails with EBADF.
 */
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}
3933
/*
 * Close method of badfileops: ignores its arguments and always fails with
 * EBADF.
 */
static int
badfo_close(struct file *fp, struct thread *td)
{

	return (EBADF);
}
3940
/*
 * Chmod method of badfileops: ignores its arguments and always fails with
 * EBADF.
 */
static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}
3948
/*
 * Chown method of badfileops: ignores its arguments and always fails with
 * EBADF.
 */
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EBADF);
}
3956
/*
 * Sendfile method of badfileops: ignores its arguments, leaves *sent
 * untouched and always fails with EBADF.
 */
static int
badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct thread *td)
{

	return (EBADF);
}
3965
/*
 * File operations vector in which every method is a stub that does no
 * work.  All of them return EBADF except fo_truncate, which returns EINVAL,
 * and fo_poll, which returns 0.
 * NOTE(review): presumably installed on struct file objects that have no
 * usable backing object -- confirm against the users of badfileops.
 */
struct fileops badfileops = {
	.fo_read = badfo_readwrite,
	.fo_write = badfo_readwrite,
	.fo_truncate = badfo_truncate,
	.fo_ioctl = badfo_ioctl,
	.fo_poll = badfo_poll,
	.fo_kqfilter = badfo_kqfilter,
	.fo_stat = badfo_stat,
	.fo_close = badfo_close,
	.fo_chmod = badfo_chmod,
	.fo_chown = badfo_chown,
	.fo_sendfile = badfo_sendfile,
};
3979
/*
 * Generic chmod method that ignores its arguments and always fails with
 * EINVAL.  It is not static, so it can be referenced from other files.
 * NOTE(review): presumably meant for file types on which chmod is not
 * meaningful -- confirm against its users.
 */
int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}
3987
/*
 * Generic chown method that ignores its arguments and always fails with
 * EINVAL.  It is not static, so it can be referenced from other files.
 * NOTE(review): presumably meant for file types on which chown is not
 * meaningful -- confirm against its users.
 */
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{

	return (EINVAL);
}
3995
/*
 * Generic sendfile method that ignores its arguments, leaves *sent
 * untouched and always fails with EINVAL.  It is not static, so it can be
 * referenced from other files.
 * NOTE(review): presumably meant for file types that cannot be a sendfile
 * source -- confirm against its users.
 */
int
invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
    struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
    int kflags, struct thread *td)
{

	return (EINVAL);
}
4004
4005/*-------------------------------------------------------------------*/
4006
4007/*
4008 * File Descriptor pseudo-device driver (/dev/fd/).
4009 *
 * Opening minor device N dup()s the file (if any) connected to file
 * descriptor N belonging to the calling process.  Note that this driver
 * consists of only the ``open()'' routine, because all subsequent
 * references to this file go directly to the other driver.
4014 *
4015 * XXX: we could give this one a cloning event handler if necessary.
4016 */
4017
/*
 * Open routine for the /dev/fd/N devices.  It never opens anything itself:
 * it records the device's unit number in td->td_dupfd and always returns
 * ENODEV.  The 'mode' and 'type' arguments are unused.
 */
/* ARGSUSED */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{

	/*
	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
	 * file descriptor being sought for duplication. The error
	 * return ensures that the vnode for this device will be released
	 * by vn_open. Open will detect this special error and take the
	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
	 * will simply report the error.
	 */
	td->td_dupfd = dev2unit(dev);
	return (ENODEV);
}
4034
/*
 * Character device switch for the file descriptor devices, named "FD".
 * Only the open entry point is provided (fdopen).
 */
static struct cdevsw fildesc_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	fdopen,
	.d_name =	"FD",
};
4040
4041static void
4042fildesc_drvinit(void *unused)
4043{
4044	struct cdev *dev;
4045
4046	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
4047	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
4048	make_dev_alias(dev, "stdin");
4049	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
4050	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
4051	make_dev_alias(dev, "stdout");
4052	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
4053	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
4054	make_dev_alias(dev, "stderr");
4055}
4056
4057SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
4058