1139804Simp/*-
21541Srgrimes * Copyright (c) 1982, 1986, 1989, 1993
31541Srgrimes *	The Regents of the University of California.  All rights reserved.
41541Srgrimes * (c) UNIX System Laboratories, Inc.
51541Srgrimes * All or some portions of this file are derived from material licensed
61541Srgrimes * to the University of California by American Telephone and Telegraph
71541Srgrimes * Co. or Unix System Laboratories, Inc. and are reproduced herein with
81541Srgrimes * the permission of UNIX System Laboratories, Inc.
91541Srgrimes *
101541Srgrimes * Redistribution and use in source and binary forms, with or without
111541Srgrimes * modification, are permitted provided that the following conditions
121541Srgrimes * are met:
131541Srgrimes * 1. Redistributions of source code must retain the above copyright
141541Srgrimes *    notice, this list of conditions and the following disclaimer.
151541Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
161541Srgrimes *    notice, this list of conditions and the following disclaimer in the
171541Srgrimes *    documentation and/or other materials provided with the distribution.
181541Srgrimes * 4. Neither the name of the University nor the names of its contributors
191541Srgrimes *    may be used to endorse or promote products derived from this software
201541Srgrimes *    without specific prior written permission.
211541Srgrimes *
221541Srgrimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
231541Srgrimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
241541Srgrimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
251541Srgrimes * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
261541Srgrimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
271541Srgrimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
281541Srgrimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
291541Srgrimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
301541Srgrimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
311541Srgrimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
321541Srgrimes * SUCH DAMAGE.
331541Srgrimes *
341541Srgrimes *	@(#)sys_generic.c	8.5 (Berkeley) 1/21/94
351541Srgrimes */
361541Srgrimes
37116182Sobrien#include <sys/cdefs.h>
38116182Sobrien__FBSDID("$FreeBSD: stable/10/sys/kern/sys_generic.c 360332 2020-04-26 08:35:32Z hselasky $");
39116182Sobrien
40224778Srwatson#include "opt_capsicum.h"
41147676Speter#include "opt_compat.h"
4213203Swollman#include "opt_ktrace.h"
4313203Swollman
441541Srgrimes#include <sys/param.h>
451541Srgrimes#include <sys/systm.h>
4612221Sbde#include <sys/sysproto.h>
47280258Srwatson#include <sys/capsicum.h>
481541Srgrimes#include <sys/filedesc.h>
4924206Sbde#include <sys/filio.h>
5024131Sbde#include <sys/fcntl.h>
511541Srgrimes#include <sys/file.h>
52247602Spjd#include <sys/lock.h>
531541Srgrimes#include <sys/proc.h>
543308Sphk#include <sys/signalvar.h>
551541Srgrimes#include <sys/socketvar.h>
561541Srgrimes#include <sys/uio.h>
571541Srgrimes#include <sys/kernel.h>
58175140Sjhb#include <sys/ktr.h>
59114216Skan#include <sys/limits.h>
601541Srgrimes#include <sys/malloc.h>
6129351Speter#include <sys/poll.h>
6272146Speter#include <sys/resourcevar.h>
6370834Swollman#include <sys/selinfo.h>
64126326Sjhb#include <sys/sleepqueue.h>
65102779Siedowse#include <sys/syscallsubr.h>
6655478Speter#include <sys/sysctl.h>
6729351Speter#include <sys/sysent.h>
68124736Sache#include <sys/vnode.h>
6968883Sdillon#include <sys/bio.h>
7068883Sdillon#include <sys/buf.h>
7176564Stanimura#include <sys/condvar.h>
721541Srgrimes#ifdef KTRACE
731541Srgrimes#include <sys/ktrace.h>
741541Srgrimes#endif
751541Srgrimes
76175140Sjhb#include <security/audit/audit.h>
77174647Sjeff
/*
 * Size and alignment of the on-stack buffer that sys_ioctl() uses to
 * carry small IOCTL data structures between userspace and the kernel
 * instead of allocating memory for them.  Some IOCTLs with small data
 * structures are issued very frequently, and serving them from the
 * stack gives a significant speedup for those requests.
 *
 * The size should be at least 64 bytes and should be a power of two.
 * The buffer is currently hard-aligned to an 8-byte boundary on the
 * stack, which should be sufficient for all supported platforms.
 */
#define	SYS_IOCTL_SMALL_SIZE	128	/* bytes */
#define	SYS_IOCTL_SMALL_ALIGN	8	/* bytes */
91275212Shselasky
92231949Skibint iosize_max_clamp = 1;
93232494SkibSYSCTL_INT(_debug, OID_AUTO, iosize_max_clamp, CTLFLAG_RW,
94232494Skib    &iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX");
95257122Skibint devfs_iosize_max_clamp = 1;
96257122SkibSYSCTL_INT(_debug, OID_AUTO, devfs_iosize_max_clamp, CTLFLAG_RW,
97257122Skib    &devfs_iosize_max_clamp, 0, "Clamp max i/o size to INT_MAX for devices");
98257122Skib
99232494Skib/*
100232494Skib * Assert that the return value of read(2) and write(2) syscalls fits
101232494Skib * into a register.  If not, an architecture will need to provide the
102232494Skib * usermode wrappers to reconstruct the result.
103232494Skib */
104232494SkibCTASSERT(sizeof(register_t) >= sizeof(size_t));
105231949Skib
10630354Sphkstatic MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
10730354Sphkstatic MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
10830354SphkMALLOC_DEFINE(M_IOV, "iov", "large iov's");
10930309Sphk
110211941Skibstatic int	pollout(struct thread *, struct pollfd *, struct pollfd *,
111211941Skib		    u_int);
11291972Salfredstatic int	pollscan(struct thread *, struct pollfd *, u_int);
113174647Sjeffstatic int	pollrescan(struct thread *);
11491972Salfredstatic int	selscan(struct thread *, fd_mask **, fd_mask **, int);
115174647Sjeffstatic int	selrescan(struct thread *, fd_mask **, fd_mask **);
116174647Sjeffstatic void	selfdalloc(struct thread *, void *);
117174647Sjeffstatic void	selfdfree(struct seltd *, struct selfd *);
118147813Sjhbstatic int	dofileread(struct thread *, int, struct file *, struct uio *,
119147813Sjhb		    off_t, int);
120147813Sjhbstatic int	dofilewrite(struct thread *, int, struct file *, struct uio *,
121147813Sjhb		    off_t, int);
122122352Stanimurastatic void	doselwakeup(struct selinfo *, int);
123174647Sjeffstatic void	seltdinit(struct thread *);
124247801Sdavidestatic int	seltdwait(struct thread *, sbintime_t, sbintime_t);
125174647Sjeffstatic void	seltdclear(struct thread *);
1263485Sphk
127174647Sjeff/*
128174647Sjeff * One seltd per-thread allocated on demand as needed.
129174647Sjeff *
130174647Sjeff *	t - protected by st_mtx
131174647Sjeff * 	k - Only accessed by curthread or read-only
132174647Sjeff */
133174647Sjeffstruct seltd {
134174647Sjeff	STAILQ_HEAD(, selfd)	st_selq;	/* (k) List of selfds. */
135174647Sjeff	struct selfd		*st_free1;	/* (k) free fd for read set. */
136174647Sjeff	struct selfd		*st_free2;	/* (k) free fd for write set. */
137174647Sjeff	struct mtx		st_mtx;		/* Protects struct seltd */
138174647Sjeff	struct cv		st_wait;	/* (t) Wait channel. */
139174647Sjeff	int			st_flags;	/* (t) SELTD_ flags. */
140174647Sjeff};
141174647Sjeff
142174647Sjeff#define	SELTD_PENDING	0x0001			/* We have pending events. */
143174647Sjeff#define	SELTD_RESCAN	0x0002			/* Doing a rescan. */
144174647Sjeff
145174647Sjeff/*
146174647Sjeff * One selfd allocated per-thread per-file-descriptor.
147174647Sjeff *	f - protected by sf_mtx
148174647Sjeff */
149174647Sjeffstruct selfd {
150174647Sjeff	STAILQ_ENTRY(selfd)	sf_link;	/* (k) fds owned by this td. */
151174647Sjeff	TAILQ_ENTRY(selfd)	sf_threads;	/* (f) fds on this selinfo. */
152174647Sjeff	struct selinfo		*sf_si;		/* (f) selinfo when linked. */
153174647Sjeff	struct mtx		*sf_mtx;	/* Pointer to selinfo mtx. */
154174647Sjeff	struct seltd		*sf_td;		/* (k) owning seltd. */
155174647Sjeff	void			*sf_cookie;	/* (k) fd or pollfd. */
156174647Sjeff};
157174647Sjeff
158174647Sjeffstatic uma_zone_t selfd_zone;
159195259Sjeffstatic struct mtx_pool *mtxpool_select;
160174647Sjeff
16112221Sbde#ifndef _SYS_SYSPROTO_H_
1621541Srgrimesstruct read_args {
1631541Srgrimes	int	fd;
16438864Sbde	void	*buf;
16538864Sbde	size_t	nbyte;
1661541Srgrimes};
16712221Sbde#endif
1681549Srgrimesint
169225617Skmacysys_read(td, uap)
17083366Sjulian	struct thread *td;
17186341Sdillon	struct read_args *uap;
1721541Srgrimes{
173147813Sjhb	struct uio auio;
174147813Sjhb	struct iovec aiov;
17568883Sdillon	int error;
1761541Srgrimes
177231949Skib	if (uap->nbyte > IOSIZE_MAX)
178147813Sjhb		return (EINVAL);
179147813Sjhb	aiov.iov_base = uap->buf;
180147813Sjhb	aiov.iov_len = uap->nbyte;
181147813Sjhb	auio.uio_iov = &aiov;
182147813Sjhb	auio.uio_iovcnt = 1;
183147813Sjhb	auio.uio_resid = uap->nbyte;
184147813Sjhb	auio.uio_segflg = UIO_USERSPACE;
185147813Sjhb	error = kern_readv(td, uap->fd, &auio);
18668883Sdillon	return(error);
1871541Srgrimes}
1881541Srgrimes
1891541Srgrimes/*
190147813Sjhb * Positioned read system call
19145065Salc */
19245065Salc#ifndef _SYS_SYSPROTO_H_
19345065Salcstruct pread_args {
19445065Salc	int	fd;
19545065Salc	void	*buf;
19645065Salc	size_t	nbyte;
19745311Sdt	int	pad;
19845311Sdt	off_t	offset;
19945065Salc};
20045065Salc#endif
20145065Salcint
202225617Skmacysys_pread(td, uap)
20383366Sjulian	struct thread *td;
20486341Sdillon	struct pread_args *uap;
20545065Salc{
20645065Salc	struct uio auio;
20745065Salc	struct iovec aiov;
208147813Sjhb	int error;
20945065Salc
210231949Skib	if (uap->nbyte > IOSIZE_MAX)
211147813Sjhb		return (EINVAL);
212147813Sjhb	aiov.iov_base = uap->buf;
213147813Sjhb	aiov.iov_len = uap->nbyte;
21445065Salc	auio.uio_iov = &aiov;
21545065Salc	auio.uio_iovcnt = 1;
216147813Sjhb	auio.uio_resid = uap->nbyte;
21745065Salc	auio.uio_segflg = UIO_USERSPACE;
218147813Sjhb	error = kern_preadv(td, uap->fd, &auio, uap->offset);
219147813Sjhb	return(error);
22045065Salc}
22145065Salc
222171212Speterint
223171212Speterfreebsd6_pread(td, uap)
224171212Speter	struct thread *td;
225171212Speter	struct freebsd6_pread_args *uap;
226171212Speter{
227171212Speter	struct pread_args oargs;
228171212Speter
229171212Speter	oargs.fd = uap->fd;
230171212Speter	oargs.buf = uap->buf;
231171212Speter	oargs.nbyte = uap->nbyte;
232171212Speter	oargs.offset = uap->offset;
233225617Skmacy	return (sys_pread(td, &oargs));
234171212Speter}
235171212Speter
23645065Salc/*
2371541Srgrimes * Scatter read system call.
2381541Srgrimes */
23912221Sbde#ifndef _SYS_SYSPROTO_H_
2401541Srgrimesstruct readv_args {
24112208Sbde	int	fd;
2421541Srgrimes	struct	iovec *iovp;
2431541Srgrimes	u_int	iovcnt;
2441541Srgrimes};
24512221Sbde#endif
2461549Srgrimesint
247225617Skmacysys_readv(struct thread *td, struct readv_args *uap)
2481541Srgrimes{
249144445Sjhb	struct uio *auio;
250144445Sjhb	int error;
251144445Sjhb
252144445Sjhb	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
253144445Sjhb	if (error)
254144445Sjhb		return (error);
255144445Sjhb	error = kern_readv(td, uap->fd, auio);
256144445Sjhb	free(auio, M_IOV);
257144445Sjhb	return (error);
258144445Sjhb}
259144445Sjhb
260144445Sjhbint
261144445Sjhbkern_readv(struct thread *td, int fd, struct uio *auio)
262144445Sjhb{
26386341Sdillon	struct file *fp;
264255219Spjd	cap_rights_t rights;
26596243Salc	int error;
266147813Sjhb
267255219Spjd	error = fget_read(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
268147813Sjhb	if (error)
269147813Sjhb		return (error);
270147813Sjhb	error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
271147813Sjhb	fdrop(fp, td);
272147813Sjhb	return (error);
273147813Sjhb}
274147813Sjhb
275147813Sjhb/*
276147813Sjhb * Scatter positioned read system call.
277147813Sjhb */
278147813Sjhb#ifndef _SYS_SYSPROTO_H_
279147813Sjhbstruct preadv_args {
280147813Sjhb	int	fd;
281147813Sjhb	struct	iovec *iovp;
282147813Sjhb	u_int	iovcnt;
283147813Sjhb	off_t	offset;
284147813Sjhb};
2851541Srgrimes#endif
286147813Sjhbint
287225617Skmacysys_preadv(struct thread *td, struct preadv_args *uap)
288147813Sjhb{
289147813Sjhb	struct uio *auio;
290147813Sjhb	int error;
2911541Srgrimes
292147813Sjhb	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
293147813Sjhb	if (error)
294147813Sjhb		return (error);
295147813Sjhb	error = kern_preadv(td, uap->fd, auio, uap->offset);
296147813Sjhb	free(auio, M_IOV);
297147813Sjhb	return (error);
298147813Sjhb}
299147813Sjhb
300147813Sjhbint
301147813Sjhbkern_preadv(td, fd, auio, offset)
302147813Sjhb	struct thread *td;
303147813Sjhb	int fd;
304147813Sjhb	struct uio *auio;
305147813Sjhb	off_t offset;
306147813Sjhb{
307147813Sjhb	struct file *fp;
308255219Spjd	cap_rights_t rights;
309147813Sjhb	int error;
310147813Sjhb
311255219Spjd	error = fget_read(td, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
312131897Sphk	if (error)
31396243Salc		return (error);
314147813Sjhb	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
315147813Sjhb		error = ESPIPE;
316315481Smmokhi	else if (offset < 0 &&
317315481Smmokhi	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
318147813Sjhb		error = EINVAL;
319147813Sjhb	else
320147813Sjhb		error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
321147813Sjhb	fdrop(fp, td);
322147813Sjhb	return (error);
323147813Sjhb}
324147813Sjhb
325147813Sjhb/*
326147813Sjhb * Common code for readv and preadv that reads data in
327147813Sjhb * from a file using the passed in uio, offset, and flags.
328147813Sjhb */
329147813Sjhbstatic int
330147813Sjhbdofileread(td, fd, fp, auio, offset, flags)
331147813Sjhb	struct thread *td;
332147813Sjhb	int fd;
333147813Sjhb	struct file *fp;
334147813Sjhb	struct uio *auio;
335147813Sjhb	off_t offset;
336147813Sjhb	int flags;
337147813Sjhb{
338147813Sjhb	ssize_t cnt;
339147813Sjhb	int error;
340147813Sjhb#ifdef KTRACE
341147813Sjhb	struct uio *ktruio = NULL;
342147813Sjhb#endif
343147813Sjhb
344140800Sphk	/* Finish zero length reads right here */
345140800Sphk	if (auio->uio_resid == 0) {
346140800Sphk		td->td_retval[0] = 0;
347140800Sphk		return(0);
348140800Sphk	}
349131897Sphk	auio->uio_rw = UIO_READ;
350147813Sjhb	auio->uio_offset = offset;
351131897Sphk	auio->uio_td = td;
3521541Srgrimes#ifdef KTRACE
353131897Sphk	if (KTRPOINT(td, KTR_GENIO))
354131897Sphk		ktruio = cloneuio(auio);
3551541Srgrimes#endif
356131897Sphk	cnt = auio->uio_resid;
357147813Sjhb	if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
358131897Sphk		if (auio->uio_resid != cnt && (error == ERESTART ||
3591541Srgrimes		    error == EINTR || error == EWOULDBLOCK))
3601541Srgrimes			error = 0;
36168883Sdillon	}
362131897Sphk	cnt -= auio->uio_resid;
3631541Srgrimes#ifdef KTRACE
364131897Sphk	if (ktruio != NULL) {
365131897Sphk		ktruio->uio_resid = cnt;
366144445Sjhb		ktrgenio(fd, UIO_READ, ktruio, error);
3671541Srgrimes	}
3681541Srgrimes#endif
36983366Sjulian	td->td_retval[0] = cnt;
3701541Srgrimes	return (error);
3711541Srgrimes}
3721541Srgrimes
37312221Sbde#ifndef _SYS_SYSPROTO_H_
3741541Srgrimesstruct write_args {
3751541Srgrimes	int	fd;
37638864Sbde	const void *buf;
37738864Sbde	size_t	nbyte;
3781541Srgrimes};
37912221Sbde#endif
3801549Srgrimesint
381225617Skmacysys_write(td, uap)
38283366Sjulian	struct thread *td;
38386341Sdillon	struct write_args *uap;
3841541Srgrimes{
385147813Sjhb	struct uio auio;
386147813Sjhb	struct iovec aiov;
38768883Sdillon	int error;
3881541Srgrimes
389231949Skib	if (uap->nbyte > IOSIZE_MAX)
390147813Sjhb		return (EINVAL);
391147813Sjhb	aiov.iov_base = (void *)(uintptr_t)uap->buf;
392147813Sjhb	aiov.iov_len = uap->nbyte;
393147813Sjhb	auio.uio_iov = &aiov;
394147813Sjhb	auio.uio_iovcnt = 1;
395147813Sjhb	auio.uio_resid = uap->nbyte;
396147813Sjhb	auio.uio_segflg = UIO_USERSPACE;
397147813Sjhb	error = kern_writev(td, uap->fd, &auio);
39868883Sdillon	return(error);
3991541Srgrimes}
4001541Srgrimes
4011541Srgrimes/*
402167211Srwatson * Positioned write system call.
40345065Salc */
40445065Salc#ifndef _SYS_SYSPROTO_H_
40545065Salcstruct pwrite_args {
40645065Salc	int	fd;
40745065Salc	const void *buf;
40845065Salc	size_t	nbyte;
40945311Sdt	int	pad;
41045311Sdt	off_t	offset;
41145065Salc};
41245065Salc#endif
41345065Salcint
414225617Skmacysys_pwrite(td, uap)
41583366Sjulian	struct thread *td;
41686341Sdillon	struct pwrite_args *uap;
41745065Salc{
41845065Salc	struct uio auio;
41945065Salc	struct iovec aiov;
420147813Sjhb	int error;
42145065Salc
422231949Skib	if (uap->nbyte > IOSIZE_MAX)
423147813Sjhb		return (EINVAL);
424147813Sjhb	aiov.iov_base = (void *)(uintptr_t)uap->buf;
425147813Sjhb	aiov.iov_len = uap->nbyte;
42645065Salc	auio.uio_iov = &aiov;
42745065Salc	auio.uio_iovcnt = 1;
428147813Sjhb	auio.uio_resid = uap->nbyte;
42945065Salc	auio.uio_segflg = UIO_USERSPACE;
430147813Sjhb	error = kern_pwritev(td, uap->fd, &auio, uap->offset);
431147813Sjhb	return(error);
43245065Salc}
43345065Salc
434171212Speterint
435171212Speterfreebsd6_pwrite(td, uap)
436171212Speter	struct thread *td;
437171212Speter	struct freebsd6_pwrite_args *uap;
438171212Speter{
439171212Speter	struct pwrite_args oargs;
440171212Speter
441171212Speter	oargs.fd = uap->fd;
442171212Speter	oargs.buf = uap->buf;
443171212Speter	oargs.nbyte = uap->nbyte;
444171212Speter	oargs.offset = uap->offset;
445225617Skmacy	return (sys_pwrite(td, &oargs));
446171212Speter}
447171212Speter
44845065Salc/*
449167211Srwatson * Gather write system call.
4501541Srgrimes */
45112221Sbde#ifndef _SYS_SYSPROTO_H_
4521541Srgrimesstruct writev_args {
4531541Srgrimes	int	fd;
4541541Srgrimes	struct	iovec *iovp;
4551541Srgrimes	u_int	iovcnt;
4561541Srgrimes};
45712221Sbde#endif
4581549Srgrimesint
459225617Skmacysys_writev(struct thread *td, struct writev_args *uap)
4601541Srgrimes{
461144445Sjhb	struct uio *auio;
462144445Sjhb	int error;
463144445Sjhb
464144445Sjhb	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
465144445Sjhb	if (error)
466144445Sjhb		return (error);
467144445Sjhb	error = kern_writev(td, uap->fd, auio);
468144445Sjhb	free(auio, M_IOV);
469144445Sjhb	return (error);
470144445Sjhb}
471144445Sjhb
472144445Sjhbint
473144445Sjhbkern_writev(struct thread *td, int fd, struct uio *auio)
474144445Sjhb{
47586341Sdillon	struct file *fp;
476255219Spjd	cap_rights_t rights;
477131897Sphk	int error;
478147813Sjhb
479255219Spjd	error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
480147813Sjhb	if (error)
481154073Sjhb		return (error);
482147813Sjhb	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
483147813Sjhb	fdrop(fp, td);
484147813Sjhb	return (error);
485147813Sjhb}
486147813Sjhb
487147813Sjhb/*
488167211Srwatson * Gather positioned write system call.
489147813Sjhb */
490147813Sjhb#ifndef _SYS_SYSPROTO_H_
491147813Sjhbstruct pwritev_args {
492147813Sjhb	int	fd;
493147813Sjhb	struct	iovec *iovp;
494147813Sjhb	u_int	iovcnt;
495147813Sjhb	off_t	offset;
496147813Sjhb};
497147813Sjhb#endif
498147813Sjhbint
499225617Skmacysys_pwritev(struct thread *td, struct pwritev_args *uap)
500147813Sjhb{
501147813Sjhb	struct uio *auio;
502147813Sjhb	int error;
503147813Sjhb
504147813Sjhb	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
505147813Sjhb	if (error)
506147813Sjhb		return (error);
507147813Sjhb	error = kern_pwritev(td, uap->fd, auio, uap->offset);
508147813Sjhb	free(auio, M_IOV);
509147813Sjhb	return (error);
510147813Sjhb}
511147813Sjhb
512147813Sjhbint
513147813Sjhbkern_pwritev(td, fd, auio, offset)
514147813Sjhb	struct thread *td;
515147813Sjhb	struct uio *auio;
516147813Sjhb	int fd;
517147813Sjhb	off_t offset;
518147813Sjhb{
519147813Sjhb	struct file *fp;
520255219Spjd	cap_rights_t rights;
521147813Sjhb	int error;
522147813Sjhb
523255219Spjd	error = fget_write(td, fd, cap_rights_init(&rights, CAP_PWRITE), &fp);
524147813Sjhb	if (error)
525154073Sjhb		return (error);
526147813Sjhb	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
527147813Sjhb		error = ESPIPE;
528315481Smmokhi	else if (offset < 0 &&
529315481Smmokhi	    (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR))
530147813Sjhb		error = EINVAL;
531147813Sjhb	else
532147813Sjhb		error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
533147813Sjhb	fdrop(fp, td);
534147813Sjhb	return (error);
535147813Sjhb}
536147813Sjhb
537147813Sjhb/*
538147813Sjhb * Common code for writev and pwritev that writes data to
539147813Sjhb * a file using the passed in uio, offset, and flags.
540147813Sjhb */
541147813Sjhbstatic int
542147813Sjhbdofilewrite(td, fd, fp, auio, offset, flags)
543147813Sjhb	struct thread *td;
544147813Sjhb	int fd;
545147813Sjhb	struct file *fp;
546147813Sjhb	struct uio *auio;
547147813Sjhb	off_t offset;
548147813Sjhb	int flags;
549147813Sjhb{
550147813Sjhb	ssize_t cnt;
551147813Sjhb	int error;
5521541Srgrimes#ifdef KTRACE
553131897Sphk	struct uio *ktruio = NULL;
5541541Srgrimes#endif
5551541Srgrimes
556131897Sphk	auio->uio_rw = UIO_WRITE;
557131897Sphk	auio->uio_td = td;
558147813Sjhb	auio->uio_offset = offset;
5591541Srgrimes#ifdef KTRACE
560131897Sphk	if (KTRPOINT(td, KTR_GENIO))
561131897Sphk		ktruio = cloneuio(auio);
5621541Srgrimes#endif
563131897Sphk	cnt = auio->uio_resid;
564244643Skib	if (fp->f_type == DTYPE_VNODE &&
565244643Skib	    (fp->f_vnread_flags & FDEVFS_VNODE) == 0)
56669733Sdillon		bwillwrite();
567147813Sjhb	if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
568131897Sphk		if (auio->uio_resid != cnt && (error == ERESTART ||
5691541Srgrimes		    error == EINTR || error == EWOULDBLOCK))
5701541Srgrimes			error = 0;
571147813Sjhb		/* Socket layer is responsible for issuing SIGPIPE. */
572167150Sbms		if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
57383366Sjulian			PROC_LOCK(td->td_proc);
574209595Sjhb			tdsignal(td, SIGPIPE);
57583366Sjulian			PROC_UNLOCK(td->td_proc);
57673929Sjhb		}
5771541Srgrimes	}
578131897Sphk	cnt -= auio->uio_resid;
5791541Srgrimes#ifdef KTRACE
580131897Sphk	if (ktruio != NULL) {
581131897Sphk		ktruio->uio_resid = cnt;
582144445Sjhb		ktrgenio(fd, UIO_WRITE, ktruio, error);
5831541Srgrimes	}
5841541Srgrimes#endif
58583366Sjulian	td->td_retval[0] = cnt;
5861541Srgrimes	return (error);
5871541Srgrimes}
5881541Srgrimes
589175140Sjhb/*
590175140Sjhb * Truncate a file given a file descriptor.
591175140Sjhb *
592175140Sjhb * Can't use fget_write() here, since must return EINVAL and not EBADF if the
593175140Sjhb * descriptor isn't writable.
594175140Sjhb */
595175140Sjhbint
596175140Sjhbkern_ftruncate(td, fd, length)
597175140Sjhb	struct thread *td;
598175140Sjhb	int fd;
599175140Sjhb	off_t length;
600175140Sjhb{
601175140Sjhb	struct file *fp;
602255219Spjd	cap_rights_t rights;
603175140Sjhb	int error;
604175140Sjhb
605195104Srwatson	AUDIT_ARG_FD(fd);
606175140Sjhb	if (length < 0)
607175140Sjhb		return (EINVAL);
608255219Spjd	error = fget(td, fd, cap_rights_init(&rights, CAP_FTRUNCATE), &fp);
609175140Sjhb	if (error)
610175140Sjhb		return (error);
611195104Srwatson	AUDIT_ARG_FILE(td->td_proc, fp);
612175140Sjhb	if (!(fp->f_flag & FWRITE)) {
613175140Sjhb		fdrop(fp, td);
614175140Sjhb		return (EINVAL);
615175140Sjhb	}
616175140Sjhb	error = fo_truncate(fp, length, td->td_ucred, td);
617175140Sjhb	fdrop(fp, td);
618175140Sjhb	return (error);
619175140Sjhb}
620175140Sjhb
#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
	int	fd;
	int	pad;
	off_t	length;
};
#endif
/*
 * ftruncate system call: forwards uap->fd and uap->length to
 * kern_ftruncate() and returns its result.
 */
int
sys_ftruncate(struct thread *td, struct ftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
636175140Sjhb
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
	int	fd;
	long	length;
};
#endif
/*
 * COMPAT_43 variant of ftruncate whose length argument is a long.
 * Forwards uap->fd and uap->length to kern_ftruncate() and returns
 * its result.
 */
int
oftruncate(struct thread *td, struct oftruncate_args *uap)
{

	return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */
653175140Sjhb
654175140Sjhb#ifndef _SYS_SYSPROTO_H_
6551541Srgrimesstruct ioctl_args {
6561541Srgrimes	int	fd;
65738517Sdfr	u_long	com;
6581541Srgrimes	caddr_t	data;
6591541Srgrimes};
66012221Sbde#endif
6611541Srgrimes/* ARGSUSED */
6621549Srgrimesint
663225617Skmacysys_ioctl(struct thread *td, struct ioctl_args *uap)
6641541Srgrimes{
665275212Shselasky	u_char smalldata[SYS_IOCTL_SMALL_SIZE] __aligned(SYS_IOCTL_SMALL_ALIGN);
666360332Shselasky	uint32_t com;
667162711Sru	int arg, error;
668137687Sphk	u_int size;
669162711Sru	caddr_t data;
6701541Srgrimes
671360332Shselasky#ifdef INVARIANTS
672140406Sphk	if (uap->com > 0xffffffff) {
673140406Sphk		printf(
674140406Sphk		    "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
675173600Sjulian		    td->td_proc->p_pid, td->td_name, uap->com);
676140406Sphk	}
677360332Shselasky#endif
678360332Shselasky	com = (uint32_t)uap->com;
6791541Srgrimes
6801541Srgrimes	/*
6811541Srgrimes	 * Interpret high order word to find amount of data to be
6821541Srgrimes	 * copied to/from the user's address space.
6831541Srgrimes	 */
6841541Srgrimes	size = IOCPARM_LEN(com);
685137689Sphk	if ((size > IOCPARM_MAX) ||
686137689Sphk	    ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
687147676Speter#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
688147676Speter	    ((com & IOC_OUT) && size == 0) ||
689147676Speter#else
690147676Speter	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
691147676Speter#endif
692162711Sru	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
69389306Salfred		return (ENOTTY);
69468883Sdillon
695137689Sphk	if (size > 0) {
696183297Sobrien		if (com & IOC_VOID) {
697162711Sru			/* Integer argument. */
698162711Sru			arg = (intptr_t)uap->data;
699162711Sru			data = (void *)&arg;
700162711Sru			size = 0;
701275212Shselasky		} else {
702275212Shselasky			if (size > SYS_IOCTL_SMALL_SIZE)
703275212Shselasky				data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
704275212Shselasky			else
705275212Shselasky				data = smalldata;
706275212Shselasky		}
707162711Sru	} else
708137689Sphk		data = (void *)&uap->data;
709137689Sphk	if (com & IOC_IN) {
710137689Sphk		error = copyin(uap->data, data, (u_int)size);
711275212Shselasky		if (error != 0)
712275212Shselasky			goto out;
713137689Sphk	} else if (com & IOC_OUT) {
7141541Srgrimes		/*
7151541Srgrimes		 * Zero the buffer so the user always
7161541Srgrimes		 * gets back something deterministic.
7171541Srgrimes		 */
7181541Srgrimes		bzero(data, size);
71968883Sdillon	}
7201541Srgrimes
721160192Sjhb	error = kern_ioctl(td, uap->fd, com, data);
722160192Sjhb
723160192Sjhb	if (error == 0 && (com & IOC_OUT))
724160192Sjhb		error = copyout(data, uap->data, (u_int)size);
725160192Sjhb
726275212Shselaskyout:
727275212Shselasky	if (size > SYS_IOCTL_SMALL_SIZE)
728162711Sru		free(data, M_IOCTLOPS);
729160192Sjhb	return (error);
730160192Sjhb}
731160192Sjhb
732160192Sjhbint
733160192Sjhbkern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
734160192Sjhb{
735160192Sjhb	struct file *fp;
736160192Sjhb	struct filedesc *fdp;
737255219Spjd#ifndef CAPABILITIES
738255219Spjd	cap_rights_t rights;
739255219Spjd#endif
740247602Spjd	int error, tmp, locked;
741160192Sjhb
742195281Srwatson	AUDIT_ARG_FD(fd);
743195281Srwatson	AUDIT_ARG_CMD(com);
744247602Spjd
745160192Sjhb	fdp = td->td_proc->p_fd;
746247602Spjd
747160192Sjhb	switch (com) {
748160192Sjhb	case FIONCLEX:
749247602Spjd	case FIOCLEX:
750168355Srwatson		FILEDESC_XLOCK(fdp);
751247602Spjd		locked = LA_XLOCKED;
752247602Spjd		break;
753247602Spjd	default:
754247602Spjd#ifdef CAPABILITIES
755247602Spjd		FILEDESC_SLOCK(fdp);
756247602Spjd		locked = LA_SLOCKED;
757247602Spjd#else
758247602Spjd		locked = LA_UNLOCKED;
759247602Spjd#endif
760247602Spjd		break;
761247602Spjd	}
762247602Spjd
763247602Spjd#ifdef CAPABILITIES
764247602Spjd	if ((fp = fget_locked(fdp, fd)) == NULL) {
765247602Spjd		error = EBADF;
766160192Sjhb		goto out;
767247602Spjd	}
768247602Spjd	if ((error = cap_ioctl_check(fdp, fd, com)) != 0) {
769247602Spjd		fp = NULL;	/* fhold() was not called yet */
770247602Spjd		goto out;
771247602Spjd	}
772247602Spjd	fhold(fp);
773247602Spjd	if (locked == LA_SLOCKED) {
774247602Spjd		FILEDESC_SUNLOCK(fdp);
775247602Spjd		locked = LA_UNLOCKED;
776247602Spjd	}
777247602Spjd#else
778255219Spjd	error = fget(td, fd, cap_rights_init(&rights, CAP_IOCTL), &fp);
779255219Spjd	if (error != 0) {
780247602Spjd		fp = NULL;
781247602Spjd		goto out;
782247602Spjd	}
783247602Spjd#endif
784247602Spjd	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
785247602Spjd		error = EBADF;
786247602Spjd		goto out;
787247602Spjd	}
788247602Spjd
789247602Spjd	switch (com) {
790247602Spjd	case FIONCLEX:
791247602Spjd		fdp->fd_ofiles[fd].fde_flags &= ~UF_EXCLOSE;
792247602Spjd		goto out;
793160192Sjhb	case FIOCLEX:
794247602Spjd		fdp->fd_ofiles[fd].fde_flags |= UF_EXCLOSE;
795160192Sjhb		goto out;
796160192Sjhb	case FIONBIO:
7973098Sphk		if ((tmp = *(int *)data))
798174988Sjeff			atomic_set_int(&fp->f_flag, FNONBLOCK);
7991541Srgrimes		else
800174988Sjeff			atomic_clear_int(&fp->f_flag, FNONBLOCK);
801137773Sphk		data = (void *)&tmp;
802160192Sjhb		break;
803160192Sjhb	case FIOASYNC:
8043098Sphk		if ((tmp = *(int *)data))
805174988Sjeff			atomic_set_int(&fp->f_flag, FASYNC);
8061541Srgrimes		else
807174988Sjeff			atomic_clear_int(&fp->f_flag, FASYNC);
808137773Sphk		data = (void *)&tmp;
809160192Sjhb		break;
810137773Sphk	}
8111541Srgrimes
812137773Sphk	error = fo_ioctl(fp, com, data, td->td_ucred, td);
813160192Sjhbout:
814247602Spjd	switch (locked) {
815247602Spjd	case LA_XLOCKED:
816247602Spjd		FILEDESC_XUNLOCK(fdp);
817247602Spjd		break;
818247602Spjd#ifdef CAPABILITIES
819247602Spjd	case LA_SLOCKED:
820247602Spjd		FILEDESC_SUNLOCK(fdp);
821247602Spjd		break;
822247602Spjd#endif
823247602Spjd	default:
824247602Spjd		FILEDESC_UNLOCK_ASSERT(fdp);
825247602Spjd		break;
826247602Spjd	}
827247602Spjd	if (fp != NULL)
828247602Spjd		fdrop(fp, td);
8291541Srgrimes	return (error);
8301541Srgrimes}
8311541Srgrimes
832189450Skibint
833189450Skibpoll_no_poll(int events)
834189450Skib{
835189450Skib	/*
836189450Skib	 * Return true for read/write.  If the user asked for something
837189450Skib	 * special, return POLLNVAL, so that clients have a way of
838189450Skib	 * determining reliably whether or not the extended
839189450Skib	 * functionality is present without hard-coding knowledge
840189450Skib	 * of specific filesystem implementations.
841189450Skib	 */
842189450Skib	if (events & ~POLLSTANDARD)
843189450Skib		return (POLLNVAL);
844189450Skib
845189450Skib	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
846189450Skib}
847189450Skib
848198508Skibint
849225617Skmacysys_pselect(struct thread *td, struct pselect_args *uap)
850198508Skib{
851198508Skib	struct timespec ts;
852198508Skib	struct timeval tv, *tvp;
853198508Skib	sigset_t set, *uset;
854198508Skib	int error;
855198508Skib
856198508Skib	if (uap->ts != NULL) {
857198508Skib		error = copyin(uap->ts, &ts, sizeof(ts));
858198508Skib		if (error != 0)
859198508Skib		    return (error);
860198508Skib		TIMESPEC_TO_TIMEVAL(&tv, &ts);
861198508Skib		tvp = &tv;
862198508Skib	} else
863198508Skib		tvp = NULL;
864198508Skib	if (uap->sm != NULL) {
865198508Skib		error = copyin(uap->sm, &set, sizeof(set));
866198508Skib		if (error != 0)
867198508Skib			return (error);
868198508Skib		uset = &set;
869198508Skib	} else
870198508Skib		uset = NULL;
871198508Skib	return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
872198508Skib	    uset, NFDBITS));
873198508Skib}
874198508Skib
875198508Skibint
876198508Skibkern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
877198508Skib    struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
878198508Skib{
879198508Skib	int error;
880198508Skib
881198508Skib	if (uset != NULL) {
882198508Skib		error = kern_sigprocmask(td, SIG_SETMASK, uset,
883198508Skib		    &td->td_oldsigmask, 0);
884198508Skib		if (error != 0)
885198508Skib			return (error);
886198508Skib		td->td_pflags |= TDP_OLDMASK;
887198508Skib		/*
888198508Skib		 * Make sure that ast() is called on return to
889198508Skib		 * usermode and TDP_OLDMASK is cleared, restoring old
890198508Skib		 * sigmask.
891198508Skib		 */
892198508Skib		thread_lock(td);
893198508Skib		td->td_flags |= TDF_ASTPENDING;
894198508Skib		thread_unlock(td);
895198508Skib	}
896198508Skib	error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
897198508Skib	return (error);
898198508Skib}
899198508Skib
90012221Sbde#ifndef _SYS_SYSPROTO_H_
9011541Srgrimesstruct select_args {
90217702Ssmpatel	int	nd;
9031541Srgrimes	fd_set	*in, *ou, *ex;
9041541Srgrimes	struct	timeval *tv;
9051541Srgrimes};
90612221Sbde#endif
9071549Srgrimesint
908225617Skmacysys_select(struct thread *td, struct select_args *uap)
9091541Srgrimes{
910102779Siedowse	struct timeval tv, *tvp;
911102779Siedowse	int error;
912102779Siedowse
913102779Siedowse	if (uap->tv != NULL) {
914102779Siedowse		error = copyin(uap->tv, &tv, sizeof(tv));
915102779Siedowse		if (error)
916102779Siedowse			return (error);
917102779Siedowse		tvp = &tv;
918102779Siedowse	} else
919102779Siedowse		tvp = NULL;
920102779Siedowse
921197049Skib	return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
922197049Skib	    NFDBITS));
923102779Siedowse}
924102779Siedowse
925227485Skib/*
926227485Skib * In the unlikely case when user specified n greater then the last
927227485Skib * open file descriptor, check that no bits are set after the last
928227485Skib * valid fd.  We must return EBADF if any is set.
929227485Skib *
930227485Skib * There are applications that rely on the behaviour.
931227485Skib *
932227485Skib * nd is fd_lastfile + 1.
933227485Skib */
934227485Skibstatic int
935227485Skibselect_check_badfd(fd_set *fd_in, int nd, int ndu, int abi_nfdbits)
936227485Skib{
937227485Skib	char *addr, *oaddr;
938227485Skib	int b, i, res;
939227485Skib	uint8_t bits;
940227485Skib
941227485Skib	if (nd >= ndu || fd_in == NULL)
942227485Skib		return (0);
943227485Skib
944227485Skib	oaddr = NULL;
945227485Skib	bits = 0; /* silence gcc */
946227485Skib	for (i = nd; i < ndu; i++) {
947227485Skib		b = i / NBBY;
948227485Skib#if BYTE_ORDER == LITTLE_ENDIAN
949227485Skib		addr = (char *)fd_in + b;
950227485Skib#else
951227485Skib		addr = (char *)fd_in;
952227485Skib		if (abi_nfdbits == NFDBITS) {
953227485Skib			addr += rounddown(b, sizeof(fd_mask)) +
954227485Skib			    sizeof(fd_mask) - 1 - b % sizeof(fd_mask);
955227485Skib		} else {
956227485Skib			addr += rounddown(b, sizeof(uint32_t)) +
957227485Skib			    sizeof(uint32_t) - 1 - b % sizeof(uint32_t);
958227485Skib		}
959227485Skib#endif
960227485Skib		if (addr != oaddr) {
961227485Skib			res = fubyte(addr);
962227485Skib			if (res == -1)
963227485Skib				return (EFAULT);
964227485Skib			oaddr = addr;
965227485Skib			bits = res;
966227485Skib		}
967227485Skib		if ((bits & (1 << (i % NBBY))) != 0)
968227485Skib			return (EBADF);
969227485Skib	}
970227485Skib	return (0);
971227485Skib}
972227485Skib
973102779Siedowseint
974102779Siedowsekern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
975197049Skib    fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
976102779Siedowse{
97789306Salfred	struct filedesc *fdp;
97822945Sbde	/*
97922945Sbde	 * The magic 2048 here is chosen to be just enough for FD_SETSIZE
98022945Sbde	 * infds with the new FD_SETSIZE of 1024, and more than enough for
98122945Sbde	 * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
98222945Sbde	 * of 256.
98322945Sbde	 */
98422945Sbde	fd_mask s_selbits[howmany(2048, NFDBITS)];
98589969Salfred	fd_mask *ibits[3], *obits[3], *selbits, *sbp;
986247801Sdavide	struct timeval rtv;
987247801Sdavide	sbintime_t asbt, precision, rsbt;
988197049Skib	u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
989247801Sdavide	int error, lf, ndu;
9901541Srgrimes
991102779Siedowse	if (nd < 0)
99217713Ssmpatel		return (EINVAL);
99389306Salfred	fdp = td->td_proc->p_fd;
994227485Skib	ndu = nd;
995227485Skib	lf = fdp->fd_lastfile;
996227485Skib	if (nd > lf + 1)
997227485Skib		nd = lf + 1;
99817702Ssmpatel
999227485Skib	error = select_check_badfd(fd_in, nd, ndu, abi_nfdbits);
1000227485Skib	if (error != 0)
1001227485Skib		return (error);
1002227485Skib	error = select_check_badfd(fd_ou, nd, ndu, abi_nfdbits);
1003227485Skib	if (error != 0)
1004227485Skib		return (error);
1005227485Skib	error = select_check_badfd(fd_ex, nd, ndu, abi_nfdbits);
1006227485Skib	if (error != 0)
1007227485Skib		return (error);
1008227485Skib
100922945Sbde	/*
101022945Sbde	 * Allocate just enough bits for the non-null fd_sets.  Use the
101122945Sbde	 * preallocated auto buffer if possible.
101222945Sbde	 */
1013102779Siedowse	nfdbits = roundup(nd, NFDBITS);
101422945Sbde	ncpbytes = nfdbits / NBBY;
1015197049Skib	ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
101622945Sbde	nbufbytes = 0;
1017102779Siedowse	if (fd_in != NULL)
101822945Sbde		nbufbytes += 2 * ncpbytes;
1019102779Siedowse	if (fd_ou != NULL)
102022945Sbde		nbufbytes += 2 * ncpbytes;
1021102779Siedowse	if (fd_ex != NULL)
102222945Sbde		nbufbytes += 2 * ncpbytes;
102322945Sbde	if (nbufbytes <= sizeof s_selbits)
102422945Sbde		selbits = &s_selbits[0];
102522945Sbde	else
1026111119Simp		selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
102717702Ssmpatel
102817702Ssmpatel	/*
102922945Sbde	 * Assign pointers into the bit buffers and fetch the input bits.
103022945Sbde	 * Put the output buffers together so that they can be bzeroed
103122945Sbde	 * together.
103217702Ssmpatel	 */
103322945Sbde	sbp = selbits;
10341541Srgrimes#define	getbits(name, x) \
103522945Sbde	do {								\
1036205014Snwhitehorn		if (name == NULL) {					\
103722945Sbde			ibits[x] = NULL;				\
1038205014Snwhitehorn			obits[x] = NULL;				\
1039205014Snwhitehorn		} else {						\
104022945Sbde			ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;	\
104122945Sbde			obits[x] = sbp;					\
104222945Sbde			sbp += ncpbytes / sizeof *sbp;			\
1043197049Skib			error = copyin(name, ibits[x], ncpubytes);	\
104476564Stanimura			if (error != 0)					\
1045174647Sjeff				goto done;				\
1046197049Skib			bzero((char *)ibits[x] + ncpubytes,		\
1047197049Skib			    ncpbytes - ncpubytes);			\
104822945Sbde		}							\
104922945Sbde	} while (0)
1050102779Siedowse	getbits(fd_in, 0);
1051102779Siedowse	getbits(fd_ou, 1);
1052102779Siedowse	getbits(fd_ex, 2);
10531541Srgrimes#undef	getbits
1054205014Snwhitehorn
1055205014Snwhitehorn#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
1056205014Snwhitehorn	/*
1057205014Snwhitehorn	 * XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
1058205014Snwhitehorn	 * we are running under 32-bit emulation. This should be more
1059205014Snwhitehorn	 * generic.
1060205014Snwhitehorn	 */
1061205014Snwhitehorn#define swizzle_fdset(bits)						\
1062205014Snwhitehorn	if (abi_nfdbits != NFDBITS && bits != NULL) {			\
1063205014Snwhitehorn		int i;							\
1064205014Snwhitehorn		for (i = 0; i < ncpbytes / sizeof *sbp; i++)		\
1065205014Snwhitehorn			bits[i] = (bits[i] >> 32) | (bits[i] << 32);	\
1066205014Snwhitehorn	}
1067205014Snwhitehorn#else
1068205014Snwhitehorn#define swizzle_fdset(bits)
1069205014Snwhitehorn#endif
1070205014Snwhitehorn
1071205014Snwhitehorn	/* Make sure the bit order makes it through an ABI transition */
1072205014Snwhitehorn	swizzle_fdset(ibits[0]);
1073205014Snwhitehorn	swizzle_fdset(ibits[1]);
1074205014Snwhitehorn	swizzle_fdset(ibits[2]);
1075205014Snwhitehorn
107622945Sbde	if (nbufbytes != 0)
107722945Sbde		bzero(selbits, nbufbytes / 2);
10781541Srgrimes
1079247801Sdavide	precision = 0;
1080102779Siedowse	if (tvp != NULL) {
1081247801Sdavide		rtv = *tvp;
1082247801Sdavide		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
1083247801Sdavide		    rtv.tv_usec >= 1000000) {
10841541Srgrimes			error = EINVAL;
1085174647Sjeff			goto done;
10861541Srgrimes		}
1087248092Smav		if (!timevalisset(&rtv))
1088247898Smav			asbt = 0;
1089248092Smav		else if (rtv.tv_sec <= INT32_MAX) {
1090247898Smav			rsbt = tvtosbt(rtv);
1091247898Smav			precision = rsbt;
1092247898Smav			precision >>= tc_precexp;
1093247898Smav			if (TIMESEL(&asbt, rsbt))
1094247898Smav				asbt += tc_tick_sbt;
1095304894Skib			if (asbt <= SBT_MAX - rsbt)
1096248092Smav				asbt += rsbt;
1097248092Smav			else
1098247898Smav				asbt = -1;
1099247898Smav		} else
1100247898Smav			asbt = -1;
1101247801Sdavide	} else
1102247801Sdavide		asbt = -1;
1103174647Sjeff	seltdinit(td);
1104174647Sjeff	/* Iterate until the timeout expires or descriptors become ready. */
1105174647Sjeff	for (;;) {
1106174647Sjeff		error = selscan(td, ibits, obits, nd);
1107174647Sjeff		if (error || td->td_retval[0] != 0)
1108174647Sjeff			break;
1109247801Sdavide		error = seltdwait(td, asbt, precision);
1110174647Sjeff		if (error)
1111174647Sjeff			break;
1112174647Sjeff		error = selrescan(td, ibits, obits);
1113174647Sjeff		if (error || td->td_retval[0] != 0)
1114174647Sjeff			break;
111535029Sphk	}
1116174647Sjeff	seltdclear(td);
111792252Salfred
11181541Srgrimesdone:
11191541Srgrimes	/* select is not restarted after signals... */
11201541Srgrimes	if (error == ERESTART)
11211541Srgrimes		error = EINTR;
11221541Srgrimes	if (error == EWOULDBLOCK)
11231541Srgrimes		error = 0;
1124205014Snwhitehorn
1125205014Snwhitehorn	/* swizzle bit order back, if necessary */
1126205014Snwhitehorn	swizzle_fdset(obits[0]);
1127205014Snwhitehorn	swizzle_fdset(obits[1]);
1128205014Snwhitehorn	swizzle_fdset(obits[2]);
1129205014Snwhitehorn#undef swizzle_fdset
1130205014Snwhitehorn
11311541Srgrimes#define	putbits(name, x) \
1132197049Skib	if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
11331541Srgrimes		error = error2;
11341541Srgrimes	if (error == 0) {
11351541Srgrimes		int error2;
11361541Srgrimes
1137102779Siedowse		putbits(fd_in, 0);
1138102779Siedowse		putbits(fd_ou, 1);
1139102779Siedowse		putbits(fd_ex, 2);
11401541Srgrimes#undef putbits
11411541Srgrimes	}
114222945Sbde	if (selbits != &s_selbits[0])
114322945Sbde		free(selbits, M_SELECT);
114482752Sdillon
11451541Srgrimes	return (error);
11461541Srgrimes}
1147187677Sjeff/*
1148187677Sjeff * Convert a select bit set to poll flags.
1149187682Sjeff *
1150187677Sjeff * The backend always returns POLLHUP/POLLERR if appropriate and we
1151187677Sjeff * return this as a set bit in any set.
1152187677Sjeff */
1153187677Sjeffstatic int select_flags[3] = {
1154187677Sjeff    POLLRDNORM | POLLHUP | POLLERR,
1155187677Sjeff    POLLWRNORM | POLLHUP | POLLERR,
1156208374Skib    POLLRDBAND | POLLERR
1157187677Sjeff};
11581541Srgrimes
1159174647Sjeff/*
1160187677Sjeff * Compute the fo_poll flags required for a fd given by the index and
1161187677Sjeff * bit position in the fd_mask array.
1162187677Sjeff */
1163187677Sjeffstatic __inline int
1164187996Ssepotvinselflags(fd_mask **ibits, int idx, fd_mask bit)
1165187677Sjeff{
1166187677Sjeff	int flags;
1167187677Sjeff	int msk;
1168187677Sjeff
1169187677Sjeff	flags = 0;
1170187677Sjeff	for (msk = 0; msk < 3; msk++) {
1171187677Sjeff		if (ibits[msk] == NULL)
1172187677Sjeff			continue;
1173187996Ssepotvin		if ((ibits[msk][idx] & bit) == 0)
1174187677Sjeff			continue;
1175187677Sjeff		flags |= select_flags[msk];
1176187677Sjeff	}
1177187677Sjeff	return (flags);
1178187677Sjeff}
1179187677Sjeff
1180187677Sjeff/*
1181187677Sjeff * Set the appropriate output bits given a mask of fired events and the
1182187677Sjeff * input bits originally requested.
1183187677Sjeff */
1184187677Sjeffstatic __inline int
1185187677Sjeffselsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
1186187677Sjeff{
1187187677Sjeff	int msk;
1188187677Sjeff	int n;
1189187677Sjeff
1190187677Sjeff	n = 0;
1191187677Sjeff	for (msk = 0; msk < 3; msk++) {
1192187677Sjeff		if ((events & select_flags[msk]) == 0)
1193187677Sjeff			continue;
1194187677Sjeff		if (ibits[msk] == NULL)
1195187677Sjeff			continue;
1196187677Sjeff		if ((ibits[msk][idx] & bit) == 0)
1197187677Sjeff			continue;
1198187677Sjeff		/*
1199187677Sjeff		 * XXX Check for a duplicate set.  This can occur because a
1200187677Sjeff		 * socket calls selrecord() twice for each poll() call
1201187677Sjeff		 * resulting in two selfds per real fd.  selrescan() will
1202187677Sjeff		 * call selsetbits twice as a result.
1203187677Sjeff		 */
1204187677Sjeff		if ((obits[msk][idx] & bit) != 0)
1205187677Sjeff			continue;
1206187677Sjeff		obits[msk][idx] |= bit;
1207187677Sjeff		n++;
1208187677Sjeff	}
1209187677Sjeff
1210187677Sjeff	return (n);
1211187677Sjeff}
1212187677Sjeff
1213224778Srwatsonstatic __inline int
1214224778Srwatsongetselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
1215224778Srwatson{
1216255219Spjd	cap_rights_t rights;
1217224778Srwatson
1218258324Spjd	cap_rights_init(&rights, CAP_EVENT);
1219258324Spjd
1220258324Spjd	return (fget_unlocked(fdp, fd, &rights, 0, fpp, NULL));
1221224778Srwatson}
1222224778Srwatson
1223187677Sjeff/*
1224174647Sjeff * Traverse the list of fds attached to this thread's seltd and check for
1225174647Sjeff * completion.
1226174647Sjeff */
122712819Sphkstatic int
1228174647Sjeffselrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
1229174647Sjeff{
1230187677Sjeff	struct filedesc *fdp;
1231187677Sjeff	struct selinfo *si;
1232174647Sjeff	struct seltd *stp;
1233174647Sjeff	struct selfd *sfp;
1234174647Sjeff	struct selfd *sfn;
1235174647Sjeff	struct file *fp;
1236187693Sjeff	fd_mask bit;
1237187693Sjeff	int fd, ev, n, idx;
1238224778Srwatson	int error;
1239174647Sjeff
1240187677Sjeff	fdp = td->td_proc->p_fd;
1241174647Sjeff	stp = td->td_sel;
1242187677Sjeff	n = 0;
1243174647Sjeff	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
1244174647Sjeff		fd = (int)(uintptr_t)sfp->sf_cookie;
1245174647Sjeff		si = sfp->sf_si;
1246174647Sjeff		selfdfree(stp, sfp);
1247174647Sjeff		/* If the selinfo wasn't cleared the event didn't fire. */
1248174647Sjeff		if (si != NULL)
1249174647Sjeff			continue;
1250224778Srwatson		error = getselfd_cap(fdp, fd, &fp);
1251224778Srwatson		if (error)
1252224778Srwatson			return (error);
1253187677Sjeff		idx = fd / NFDBITS;
1254187693Sjeff		bit = (fd_mask)1 << (fd % NFDBITS);
1255187677Sjeff		ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
1256192080Sjeff		fdrop(fp, td);
1257187677Sjeff		if (ev != 0)
1258187677Sjeff			n += selsetbits(ibits, obits, idx, bit, ev);
1259174647Sjeff	}
1260174647Sjeff	stp->st_flags = 0;
1261174647Sjeff	td->td_retval[0] = n;
1262174647Sjeff	return (0);
1263174647Sjeff}
1264174647Sjeff
1265174647Sjeff/*
1266174647Sjeff * Perform the initial filedescriptor scan and register ourselves with
1267174647Sjeff * each selinfo.
1268174647Sjeff */
1269174647Sjeffstatic int
127083366Sjulianselscan(td, ibits, obits, nfd)
127183366Sjulian	struct thread *td;
127217702Ssmpatel	fd_mask **ibits, **obits;
127330994Sphk	int nfd;
12741541Srgrimes{
1275187677Sjeff	struct filedesc *fdp;
12761541Srgrimes	struct file *fp;
1277187693Sjeff	fd_mask bit;
1278187677Sjeff	int ev, flags, end, fd;
1279187693Sjeff	int n, idx;
1280224778Srwatson	int error;
12811541Srgrimes
1282187677Sjeff	fdp = td->td_proc->p_fd;
1283187677Sjeff	n = 0;
1284187693Sjeff	for (idx = 0, fd = 0; fd < nfd; idx++) {
1285187677Sjeff		end = imin(fd + NFDBITS, nfd);
1286187677Sjeff		for (bit = 1; fd < end; bit <<= 1, fd++) {
1287187677Sjeff			/* Compute the list of events we're interested in. */
1288187677Sjeff			flags = selflags(ibits, idx, bit);
1289187677Sjeff			if (flags == 0)
1290187677Sjeff				continue;
1291224778Srwatson			error = getselfd_cap(fdp, fd, &fp);
1292224778Srwatson			if (error)
1293224778Srwatson				return (error);
1294187677Sjeff			selfdalloc(td, (void *)(uintptr_t)fd);
1295187677Sjeff			ev = fo_poll(fp, flags, td->td_ucred, td);
1296192080Sjeff			fdrop(fp, td);
1297187677Sjeff			if (ev != 0)
1298187677Sjeff				n += selsetbits(ibits, obits, idx, bit, ev);
12991541Srgrimes		}
13001541Srgrimes	}
1301187677Sjeff
130283366Sjulian	td->td_retval[0] = n;
13031541Srgrimes	return (0);
13041541Srgrimes}
13051541Srgrimes
130629351Speterint
1307275986Sdchaginsys_poll(struct thread *td, struct poll_args *uap)
130829351Speter{
1309275986Sdchagin	struct timespec ts, *tsp;
1310275986Sdchagin
1311275986Sdchagin	if (uap->timeout != INFTIM) {
1312275986Sdchagin		if (uap->timeout < 0)
1313275986Sdchagin			return (EINVAL);
1314275986Sdchagin		ts.tv_sec = uap->timeout / 1000;
1315275986Sdchagin		ts.tv_nsec = (uap->timeout % 1000) * 1000000;
1316275986Sdchagin		tsp = &ts;
1317275986Sdchagin	} else
1318275986Sdchagin		tsp = NULL;
1319275986Sdchagin
1320275986Sdchagin	return (kern_poll(td, uap->fds, uap->nfds, tsp, NULL));
1321275986Sdchagin}
1322275986Sdchagin
1323275986Sdchaginint
1324275986Sdchaginkern_poll(struct thread *td, struct pollfd *fds, u_int nfds,
1325275986Sdchagin    struct timespec *tsp, sigset_t *uset)
1326275986Sdchagin{
1327134404Sandre	struct pollfd *bits;
1328134404Sandre	struct pollfd smallbits[32];
1329275986Sdchagin	sbintime_t sbt, precision, tmp;
1330275986Sdchagin	time_t over;
1331275986Sdchagin	struct timespec ts;
1332247801Sdavide	int error;
133329351Speter	size_t ni;
133429351Speter
1335275986Sdchagin	precision = 0;
1336275986Sdchagin	if (tsp != NULL) {
1337275986Sdchagin		if (tsp->tv_sec < 0)
1338275986Sdchagin			return (EINVAL);
1339275986Sdchagin		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
1340275986Sdchagin			return (EINVAL);
1341275986Sdchagin		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
1342275986Sdchagin			sbt = 0;
1343275986Sdchagin		else {
1344275986Sdchagin			ts = *tsp;
1345275986Sdchagin			if (ts.tv_sec > INT32_MAX / 2) {
1346275986Sdchagin				over = ts.tv_sec - INT32_MAX / 2;
1347275986Sdchagin				ts.tv_sec -= over;
1348275986Sdchagin			} else
1349275986Sdchagin				over = 0;
1350275986Sdchagin			tmp = tstosbt(ts);
1351275986Sdchagin			precision = tmp;
1352275986Sdchagin			precision >>= tc_precexp;
1353275986Sdchagin			if (TIMESEL(&sbt, tmp))
1354275986Sdchagin				sbt += tc_tick_sbt;
1355275986Sdchagin			sbt += tmp;
1356275986Sdchagin		}
1357275986Sdchagin	} else
1358275986Sdchagin		sbt = -1;
1359275986Sdchagin
1360177368Sjeff	if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
1361174647Sjeff		return (EINVAL);
136272146Speter	ni = nfds * sizeof(struct pollfd);
136329351Speter	if (ni > sizeof(smallbits))
1364111119Simp		bits = malloc(ni, M_TEMP, M_WAITOK);
136529351Speter	else
136629351Speter		bits = smallbits;
1367275986Sdchagin	error = copyin(fds, bits, ni);
136829351Speter	if (error)
1369174647Sjeff		goto done;
1370275986Sdchagin
1371275986Sdchagin	if (uset != NULL) {
1372275986Sdchagin		error = kern_sigprocmask(td, SIG_SETMASK, uset,
1373275986Sdchagin		    &td->td_oldsigmask, 0);
1374275986Sdchagin		if (error)
1375174647Sjeff			goto done;
1376275986Sdchagin		td->td_pflags |= TDP_OLDMASK;
1377275986Sdchagin		/*
1378275986Sdchagin		 * Make sure that ast() is called on return to
1379275986Sdchagin		 * usermode and TDP_OLDMASK is cleared, restoring old
1380275986Sdchagin		 * sigmask.
1381275986Sdchagin		 */
1382275986Sdchagin		thread_lock(td);
1383275986Sdchagin		td->td_flags |= TDF_ASTPENDING;
1384275986Sdchagin		thread_unlock(td);
1385275986Sdchagin	}
1386275986Sdchagin
1387174647Sjeff	seltdinit(td);
1388174647Sjeff	/* Iterate until the timeout expires or descriptors become ready. */
1389174647Sjeff	for (;;) {
1390174647Sjeff		error = pollscan(td, bits, nfds);
1391174647Sjeff		if (error || td->td_retval[0] != 0)
1392174647Sjeff			break;
1393275986Sdchagin		error = seltdwait(td, sbt, precision);
1394174647Sjeff		if (error)
1395174647Sjeff			break;
1396174647Sjeff		error = pollrescan(td);
1397174647Sjeff		if (error || td->td_retval[0] != 0)
1398174647Sjeff			break;
139929351Speter	}
1400174647Sjeff	seltdclear(td);
140192252Salfred
140229351Speterdone:
140329351Speter	/* poll is not restarted after signals... */
140429351Speter	if (error == ERESTART)
140529351Speter		error = EINTR;
140629351Speter	if (error == EWOULDBLOCK)
140729351Speter		error = 0;
140829351Speter	if (error == 0) {
1409275986Sdchagin		error = pollout(td, bits, fds, nfds);
141029351Speter		if (error)
141129351Speter			goto out;
141229351Speter	}
141329351Speterout:
141429351Speter	if (ni > sizeof(smallbits))
141529351Speter		free(bits, M_TEMP);
141629351Speter	return (error);
141729351Speter}
141829351Speter
1419275986Sdchaginint
1420275986Sdchaginsys_ppoll(struct thread *td, struct ppoll_args *uap)
1421275986Sdchagin{
1422275986Sdchagin	struct timespec ts, *tsp;
1423275986Sdchagin	sigset_t set, *ssp;
1424275986Sdchagin	int error;
1425275986Sdchagin
1426275986Sdchagin	if (uap->ts != NULL) {
1427275986Sdchagin		error = copyin(uap->ts, &ts, sizeof(ts));
1428275986Sdchagin		if (error)
1429275986Sdchagin			return (error);
1430275986Sdchagin		tsp = &ts;
1431275986Sdchagin	} else
1432275986Sdchagin		tsp = NULL;
1433275986Sdchagin	if (uap->set != NULL) {
1434275986Sdchagin		error = copyin(uap->set, &set, sizeof(set));
1435275986Sdchagin		if (error)
1436275986Sdchagin			return (error);
1437275986Sdchagin		ssp = &set;
1438275986Sdchagin	} else
1439275986Sdchagin		ssp = NULL;
1440275986Sdchagin	/*
1441275986Sdchagin	 * fds is still a pointer to user space. kern_poll() will
1442275986Sdchagin	 * take care of copyin that array to the kernel space.
1443275986Sdchagin	 */
1444275986Sdchagin
1445275986Sdchagin	return (kern_poll(td, uap->fds, uap->nfds, tsp, ssp));
1446275986Sdchagin}
1447275986Sdchagin
144829351Speterstatic int
1449174647Sjeffpollrescan(struct thread *td)
1450174647Sjeff{
1451174647Sjeff	struct seltd *stp;
1452174647Sjeff	struct selfd *sfp;
1453174647Sjeff	struct selfd *sfn;
1454174647Sjeff	struct selinfo *si;
1455174647Sjeff	struct filedesc *fdp;
1456174647Sjeff	struct file *fp;
1457174647Sjeff	struct pollfd *fd;
1458255230Ssbruno#ifdef CAPABILITIES
1459255219Spjd	cap_rights_t rights;
1460255230Ssbruno#endif
1461174647Sjeff	int n;
1462174647Sjeff
1463174647Sjeff	n = 0;
1464174647Sjeff	fdp = td->td_proc->p_fd;
1465174647Sjeff	stp = td->td_sel;
1466174647Sjeff	FILEDESC_SLOCK(fdp);
1467174647Sjeff	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
1468174647Sjeff		fd = (struct pollfd *)sfp->sf_cookie;
1469174647Sjeff		si = sfp->sf_si;
1470174647Sjeff		selfdfree(stp, sfp);
1471174647Sjeff		/* If the selinfo wasn't cleared the event didn't fire. */
1472174647Sjeff		if (si != NULL)
1473174647Sjeff			continue;
1474247602Spjd		fp = fdp->fd_ofiles[fd->fd].fde_file;
1475224910Sjonathan#ifdef CAPABILITIES
1476247602Spjd		if (fp == NULL ||
1477255219Spjd		    cap_check(cap_rights(fdp, fd->fd),
1478258324Spjd		    cap_rights_init(&rights, CAP_EVENT)) != 0)
1479224910Sjonathan#else
1480247602Spjd		if (fp == NULL)
1481224910Sjonathan#endif
1482247602Spjd		{
1483174647Sjeff			fd->revents = POLLNVAL;
1484174647Sjeff			n++;
1485174647Sjeff			continue;
1486174647Sjeff		}
1487224910Sjonathan
1488174647Sjeff		/*
1489174647Sjeff		 * Note: backend also returns POLLHUP and
1490174647Sjeff		 * POLLERR if appropriate.
1491174647Sjeff		 */
1492174647Sjeff		fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
1493174647Sjeff		if (fd->revents != 0)
1494174647Sjeff			n++;
1495174647Sjeff	}
1496174647Sjeff	FILEDESC_SUNLOCK(fdp);
1497174647Sjeff	stp->st_flags = 0;
1498174647Sjeff	td->td_retval[0] = n;
1499174647Sjeff	return (0);
1500174647Sjeff}
1501174647Sjeff
1502174647Sjeff
1503174647Sjeffstatic int
1504211941Skibpollout(td, fds, ufds, nfd)
1505211941Skib	struct thread *td;
1506189708Srwatson	struct pollfd *fds;
1507189708Srwatson	struct pollfd *ufds;
1508189708Srwatson	u_int nfd;
1509189708Srwatson{
1510189708Srwatson	int error = 0;
1511189708Srwatson	u_int i = 0;
1512211941Skib	u_int n = 0;
1513189708Srwatson
1514189708Srwatson	for (i = 0; i < nfd; i++) {
1515189708Srwatson		error = copyout(&fds->revents, &ufds->revents,
1516189708Srwatson		    sizeof(ufds->revents));
1517189708Srwatson		if (error)
1518189708Srwatson			return (error);
1519211941Skib		if (fds->revents != 0)
1520211941Skib			n++;
1521189708Srwatson		fds++;
1522189708Srwatson		ufds++;
1523189708Srwatson	}
1524211941Skib	td->td_retval[0] = n;
1525189708Srwatson	return (0);
1526189708Srwatson}
1527189708Srwatson
1528189708Srwatsonstatic int
152983366Sjulianpollscan(td, fds, nfd)
153083366Sjulian	struct thread *td;
153129351Speter	struct pollfd *fds;
153273159Sjlemon	u_int nfd;
153329351Speter{
1534174647Sjeff	struct filedesc *fdp = td->td_proc->p_fd;
153529351Speter	struct file *fp;
1536255230Ssbruno#ifdef CAPABILITIES
1537255219Spjd	cap_rights_t rights;
1538255230Ssbruno#endif
1539247602Spjd	int i, n = 0;
154029351Speter
1541168355Srwatson	FILEDESC_SLOCK(fdp);
154229351Speter	for (i = 0; i < nfd; i++, fds++) {
1543268338Smjg		if (fds->fd > fdp->fd_lastfile) {
154429351Speter			fds->revents = POLLNVAL;
154529351Speter			n++;
154641632Sjkh		} else if (fds->fd < 0) {
154741632Sjkh			fds->revents = 0;
154829351Speter		} else {
1549247602Spjd			fp = fdp->fd_ofiles[fds->fd].fde_file;
1550224910Sjonathan#ifdef CAPABILITIES
1551247602Spjd			if (fp == NULL ||
1552247602Spjd			    cap_check(cap_rights(fdp, fds->fd),
1553258324Spjd			    cap_rights_init(&rights, CAP_EVENT)) != 0)
1554224910Sjonathan#else
1555247602Spjd			if (fp == NULL)
1556224910Sjonathan#endif
1557247602Spjd			{
155829351Speter				fds->revents = POLLNVAL;
155929351Speter				n++;
156029351Speter			} else {
156131364Sbde				/*
156231364Sbde				 * Note: backend also returns POLLHUP and
156331364Sbde				 * POLLERR if appropriate.
156431364Sbde				 */
1565174647Sjeff				selfdalloc(td, fds);
156651418Sgreen				fds->revents = fo_poll(fp, fds->events,
1567101983Srwatson				    td->td_ucred, td);
1568196460Skib				/*
1569196460Skib				 * POSIX requires POLLOUT to be never
1570196460Skib				 * set simultaneously with POLLHUP.
1571196460Skib				 */
1572196460Skib				if ((fds->revents & POLLHUP) != 0)
1573196460Skib					fds->revents &= ~POLLOUT;
1574196460Skib
157529351Speter				if (fds->revents != 0)
157629351Speter					n++;
157729351Speter			}
157829351Speter		}
157929351Speter	}
1580168355Srwatson	FILEDESC_SUNLOCK(fdp);
158183366Sjulian	td->td_retval[0] = n;
158229351Speter	return (0);
158329351Speter}
158429351Speter
158529351Speter/*
158629351Speter * OpenBSD poll system call.
1587167211Srwatson *
158829351Speter * XXX this isn't quite a true representation..  OpenBSD uses select ops.
158929351Speter */
159029351Speter#ifndef _SYS_SYSPROTO_H_
159129351Speterstruct openbsd_poll_args {
159229351Speter	struct pollfd *fds;
159329351Speter	u_int	nfds;
159429351Speter	int	timeout;
159529351Speter};
159629351Speter#endif
159729351Speterint
1598225617Skmacysys_openbsd_poll(td, uap)
159983366Sjulian	register struct thread *td;
160029351Speter	register struct openbsd_poll_args *uap;
160129351Speter{
1602225617Skmacy	return (sys_poll(td, (struct poll_args *)uap));
160329351Speter}
160429351Speter
160592252Salfred/*
1606252356Sdavide * XXX This was created specifically to support netncp and netsmb.  This
1607252356Sdavide * allows the caller to specify a socket to wait for events on.  It returns
1608252356Sdavide * 0 if any events matched and an error otherwise.  There is no way to
1609252356Sdavide * determine which events fired.
1610252356Sdavide */
1611252356Sdavideint
1612252356Sdavideselsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
1613252356Sdavide{
1614252356Sdavide	struct timeval rtv;
1615252356Sdavide	sbintime_t asbt, precision, rsbt;
1616252356Sdavide	int error;
1617252356Sdavide
1618252367Speter	precision = 0;	/* stupid gcc! */
1619252356Sdavide	if (tvp != NULL) {
1620252356Sdavide		rtv = *tvp;
1621252356Sdavide		if (rtv.tv_sec < 0 || rtv.tv_usec < 0 ||
1622252356Sdavide		    rtv.tv_usec >= 1000000)
1623252356Sdavide			return (EINVAL);
1624252356Sdavide		if (!timevalisset(&rtv))
1625252356Sdavide			asbt = 0;
1626252356Sdavide		else if (rtv.tv_sec <= INT32_MAX) {
1627252356Sdavide			rsbt = tvtosbt(rtv);
1628252356Sdavide			precision = rsbt;
1629252356Sdavide			precision >>= tc_precexp;
1630252356Sdavide			if (TIMESEL(&asbt, rsbt))
1631252356Sdavide				asbt += tc_tick_sbt;
1632304894Skib			if (asbt <= SBT_MAX - rsbt)
1633252356Sdavide				asbt += rsbt;
1634252356Sdavide			else
1635252356Sdavide				asbt = -1;
1636252356Sdavide		} else
1637252356Sdavide			asbt = -1;
1638252356Sdavide	} else
1639252356Sdavide		asbt = -1;
1640252356Sdavide	seltdinit(td);
1641252356Sdavide	/*
1642252356Sdavide	 * Iterate until the timeout expires or the socket becomes ready.
1643252356Sdavide	 */
1644252356Sdavide	for (;;) {
1645252356Sdavide		selfdalloc(td, NULL);
1646252356Sdavide		error = sopoll(so, events, NULL, td);
1647252356Sdavide		/* error here is actually the ready events. */
1648252356Sdavide		if (error)
1649252356Sdavide			return (0);
1650252356Sdavide		error = seltdwait(td, asbt, precision);
1651252356Sdavide		if (error)
1652252356Sdavide			break;
1653252356Sdavide	}
1654252356Sdavide	seltdclear(td);
1655252356Sdavide	/* XXX Duplicates ncp/smb behavior. */
1656252356Sdavide	if (error == ERESTART)
1657252356Sdavide		error = 0;
1658252356Sdavide	return (error);
1659252356Sdavide}
1660252356Sdavide
1661252356Sdavide/*
1662174647Sjeff * Preallocate two selfds associated with 'cookie'.  Some fo_poll routines
1663174647Sjeff * have two select sets, one for read and another for write.
1664174647Sjeff */
1665174647Sjeffstatic void
1666174647Sjeffselfdalloc(struct thread *td, void *cookie)
1667174647Sjeff{
1668174647Sjeff	struct seltd *stp;
1669174647Sjeff
1670174647Sjeff	stp = td->td_sel;
1671174647Sjeff	if (stp->st_free1 == NULL)
1672174647Sjeff		stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
1673174647Sjeff	stp->st_free1->sf_td = stp;
1674174647Sjeff	stp->st_free1->sf_cookie = cookie;
1675174647Sjeff	if (stp->st_free2 == NULL)
1676174647Sjeff		stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
1677174647Sjeff	stp->st_free2->sf_td = stp;
1678174647Sjeff	stp->st_free2->sf_cookie = cookie;
1679174647Sjeff}
1680174647Sjeff
1681174647Sjeffstatic void
1682174647Sjeffselfdfree(struct seltd *stp, struct selfd *sfp)
1683174647Sjeff{
1684174647Sjeff	STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
1685174647Sjeff	mtx_lock(sfp->sf_mtx);
1686174647Sjeff	if (sfp->sf_si)
1687174647Sjeff		TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
1688174647Sjeff	mtx_unlock(sfp->sf_mtx);
1689174647Sjeff	uma_zfree(selfd_zone, sfp);
1690174647Sjeff}
1691174647Sjeff
/*
 * Drain the waiters tied to all the selfds belonging to the specified
 * selinfo.
 *
 * doselwakeup() already provides this, so simply call it.  The caller
 * remains responsible for avoiding races between threads in
 * select()/poll() and the file descriptor being detached; those races
 * are the same as for selwakeup().
 */
void
seldrain(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1708225177Sattilio
1709174647Sjeff/*
17101541Srgrimes * Record a select request.
17111541Srgrimes */
17121541Srgrimesvoid
17131541Srgrimesselrecord(selector, sip)
171483366Sjulian	struct thread *selector;
17151541Srgrimes	struct selinfo *sip;
17161541Srgrimes{
1717174647Sjeff	struct selfd *sfp;
1718174647Sjeff	struct seltd *stp;
1719174647Sjeff	struct mtx *mtxp;
17201541Srgrimes
1721174647Sjeff	stp = selector->td_sel;
172292252Salfred	/*
1723174647Sjeff	 * Don't record when doing a rescan.
172492252Salfred	 */
1725174647Sjeff	if (stp->st_flags & SELTD_RESCAN)
1726174647Sjeff		return;
1727174647Sjeff	/*
1728174647Sjeff	 * Grab one of the preallocated descriptors.
1729174647Sjeff	 */
1730174647Sjeff	sfp = NULL;
1731174647Sjeff	if ((sfp = stp->st_free1) != NULL)
1732174647Sjeff		stp->st_free1 = NULL;
1733174647Sjeff	else if ((sfp = stp->st_free2) != NULL)
1734174647Sjeff		stp->st_free2 = NULL;
1735174647Sjeff	else
1736174647Sjeff		panic("selrecord: No free selfd on selq");
1737195259Sjeff	mtxp = sip->si_mtx;
1738195259Sjeff	if (mtxp == NULL)
1739195259Sjeff		mtxp = mtx_pool_find(mtxpool_select, sip);
1740174647Sjeff	/*
1741174647Sjeff	 * Initialize the sfp and queue it in the thread.
1742174647Sjeff	 */
1743174647Sjeff	sfp->sf_si = sip;
1744174647Sjeff	sfp->sf_mtx = mtxp;
1745174647Sjeff	STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
1746174647Sjeff	/*
1747174647Sjeff	 * Now that we've locked the sip, check for initialization.
1748174647Sjeff	 */
1749174647Sjeff	mtx_lock(mtxp);
1750174647Sjeff	if (sip->si_mtx == NULL) {
1751174647Sjeff		sip->si_mtx = mtxp;
1752174647Sjeff		TAILQ_INIT(&sip->si_tdlist);
175383366Sjulian	}
1754174647Sjeff	/*
1755174647Sjeff	 * Add this thread to the list of selfds listening on this selinfo.
1756174647Sjeff	 */
1757174647Sjeff	TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
1758174647Sjeff	mtx_unlock(sip->si_mtx);
17591541Srgrimes}
17601541Srgrimes
/*
 * Wake up the threads recorded on a selinfo.
 *
 * Thin wrapper around doselwakeup() that passes -1 as the priority
 * argument.
 *
 * sip: selinfo on which a selectable event occurred.
 */
void
selwakeup(struct selinfo *sip)
{

	doselwakeup(sip, -1);
}
1768122352Stanimura
/*
 * Wake up the threads recorded on a selinfo, and set their priority.
 *
 * sip: selinfo on which a selectable event occurred.
 * pri: priority forwarded unchanged to doselwakeup().
 */
void
selwakeuppri(struct selinfo *sip, int pri)
{

	doselwakeup(sip, pri);
}
1777122352Stanimura
17781541Srgrimes/*
17791541Srgrimes * Do a wakeup when a selectable event occurs.
17801541Srgrimes */
1781122352Stanimurastatic void
1782122352Stanimuradoselwakeup(sip, pri)
178392252Salfred	struct selinfo *sip;
1784122352Stanimura	int pri;
17851541Srgrimes{
1786174647Sjeff	struct selfd *sfp;
1787174647Sjeff	struct selfd *sfn;
1788174647Sjeff	struct seltd *stp;
17891541Srgrimes
1790174647Sjeff	/* If it's not initialized there can't be any waiters. */
1791174647Sjeff	if (sip->si_mtx == NULL)
179292252Salfred		return;
1793174647Sjeff	/*
1794174647Sjeff	 * Locking the selinfo locks all selfds associated with it.
1795174647Sjeff	 */
1796174647Sjeff	mtx_lock(sip->si_mtx);
1797174647Sjeff	TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
1798174647Sjeff		/*
1799174647Sjeff		 * Once we remove this sfp from the list and clear the
1800174647Sjeff		 * sf_si seltdclear will know to ignore this si.
1801174647Sjeff		 */
1802174647Sjeff		TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
1803174647Sjeff		sfp->sf_si = NULL;
1804174647Sjeff		stp = sfp->sf_td;
1805174647Sjeff		mtx_lock(&stp->st_mtx);
1806174647Sjeff		stp->st_flags |= SELTD_PENDING;
1807174647Sjeff		cv_broadcastpri(&stp->st_wait, pri);
1808174647Sjeff		mtx_unlock(&stp->st_mtx);
18091541Srgrimes	}
1810174647Sjeff	mtx_unlock(sip->si_mtx);
18111541Srgrimes}
181276564Stanimura
1813174647Sjeffstatic void
1814174647Sjeffseltdinit(struct thread *td)
1815174647Sjeff{
1816174647Sjeff	struct seltd *stp;
181776564Stanimura
1818174647Sjeff	if ((stp = td->td_sel) != NULL)
1819174647Sjeff		goto out;
1820174647Sjeff	td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
1821174647Sjeff	mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
1822174647Sjeff	cv_init(&stp->st_wait, "select");
1823174647Sjeffout:
1824174647Sjeff	stp->st_flags = 0;
1825174647Sjeff	STAILQ_INIT(&stp->st_selq);
1826174647Sjeff}
1827174647Sjeff
1828174647Sjeffstatic int
1829247801Sdavideseltdwait(struct thread *td, sbintime_t sbt, sbintime_t precision)
1830174647Sjeff{
1831174647Sjeff	struct seltd *stp;
1832174647Sjeff	int error;
1833174647Sjeff
1834174647Sjeff	stp = td->td_sel;
1835174647Sjeff	/*
1836174647Sjeff	 * An event of interest may occur while we do not hold the seltd
1837174647Sjeff	 * locked so check the pending flag before we sleep.
1838174647Sjeff	 */
1839174647Sjeff	mtx_lock(&stp->st_mtx);
1840174647Sjeff	/*
1841174647Sjeff	 * Any further calls to selrecord will be a rescan.
1842174647Sjeff	 */
1843174647Sjeff	stp->st_flags |= SELTD_RESCAN;
1844174647Sjeff	if (stp->st_flags & SELTD_PENDING) {
1845174647Sjeff		mtx_unlock(&stp->st_mtx);
1846174647Sjeff		return (0);
1847174647Sjeff	}
1848247801Sdavide	if (sbt == 0)
1849247801Sdavide		error = EWOULDBLOCK;
1850247801Sdavide	else if (sbt != -1)
1851247801Sdavide		error = cv_timedwait_sig_sbt(&stp->st_wait, &stp->st_mtx,
1852247801Sdavide		    sbt, precision, C_ABSOLUTE);
1853174647Sjeff	else
1854174647Sjeff		error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
1855174647Sjeff	mtx_unlock(&stp->st_mtx);
1856174647Sjeff
1857174647Sjeff	return (error);
1858174647Sjeff}
1859174647Sjeff
1860174647Sjeffvoid
1861174647Sjeffseltdfini(struct thread *td)
1862174647Sjeff{
1863174647Sjeff	struct seltd *stp;
1864174647Sjeff
1865174647Sjeff	stp = td->td_sel;
1866174647Sjeff	if (stp == NULL)
1867174647Sjeff		return;
1868174647Sjeff	if (stp->st_free1)
1869174647Sjeff		uma_zfree(selfd_zone, stp->st_free1);
1870174647Sjeff	if (stp->st_free2)
1871174647Sjeff		uma_zfree(selfd_zone, stp->st_free2);
1872174647Sjeff	td->td_sel = NULL;
1873174647Sjeff	free(stp, M_SELECT);
1874174647Sjeff}
1875174647Sjeff
1876174647Sjeff/*
1877174647Sjeff * Remove the references to the thread from all of the objects we were
1878174647Sjeff * polling.
1879174647Sjeff */
188076564Stanimurastatic void
1881174647Sjeffseltdclear(struct thread *td)
188276564Stanimura{
1883174647Sjeff	struct seltd *stp;
1884174647Sjeff	struct selfd *sfp;
1885174647Sjeff	struct selfd *sfn;
1886174647Sjeff
1887174647Sjeff	stp = td->td_sel;
1888174647Sjeff	STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
1889174647Sjeff		selfdfree(stp, sfp);
1890174647Sjeff	stp->st_flags = 0;
189176564Stanimura}
1892174647Sjeff
1893174647Sjeffstatic void selectinit(void *);
1894174647SjeffSYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
1895174647Sjeffstatic void
1896174647Sjeffselectinit(void *dummy __unused)
1897174647Sjeff{
1898195259Sjeff
1899174647Sjeff	selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
1900174647Sjeff	    NULL, NULL, UMA_ALIGN_PTR, 0);
1901195259Sjeff	mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
1902174647Sjeff}
1903