linux_event.c revision 293546
1/*-
2 * Copyright (c) 2007 Roman Divacky
3 * Copyright (c) 2014 Dmitry Chagin
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: stable/10/sys/compat/linux/linux_event.c 293546 2016-01-09 16:44:17Z dchagin $");
30
31#include "opt_compat.h"
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/imgact.h>
36#include <sys/kernel.h>
37#include <sys/limits.h>
38#include <sys/lock.h>
39#include <sys/mutex.h>
40#include <sys/capability.h>
41#include <sys/types.h>
42#include <sys/file.h>
43#include <sys/filedesc.h>
44#include <sys/errno.h>
45#include <sys/event.h>
46#include <sys/proc.h>
47#include <sys/sx.h>
48#include <sys/syscallsubr.h>
49#include <sys/timespec.h>
50
51#ifdef COMPAT_LINUX32
52#include <machine/../linux32/linux.h>
53#include <machine/../linux32/linux32_proto.h>
54#else
55#include <machine/../linux/linux.h>
56#include <machine/../linux/linux_proto.h>
57#endif
58
59#include <compat/linux/linux_emul.h>
60#include <compat/linux/linux_event.h>
61#include <compat/linux/linux_file.h>
62#include <compat/linux/linux_util.h>
63
/*
 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
 * has a 32 bit opaque pointer as the 'udata' field. So we can't pass epoll
 * supplied data verbatim. Therefore we allocate a 64-bit memory block to
 * pass user supplied data for every file descriptor.
 */
71
/* Type of the 64-bit user data carried in the 'data' field of an epoll event. */
typedef uint64_t	epoll_udata_t;

/*
 * Vector of epoll user data hanging off the process emuldata (pem->epoll)
 * and indexed by file descriptor number. It is sized with EPOLL_SIZE()
 * and grown on demand by epoll_fd_install().
 */
struct epoll_emuldata {
	uint32_t	fdc;		/* epoll udata max index */
	epoll_udata_t	udata[1];	/* epoll user data vector */
};
78
/* Index handed to epoll_fd_install() when a new epoll instance is created. */
#define	EPOLL_DEF_SZ		16
/*
 * Allocation size in bytes of a struct epoll_emuldata whose udata vector
 * can be indexed up to and including 'fdn'; the structure itself already
 * contains one udata slot.
 */
#define	EPOLL_SIZE(fdn)			\
	(sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))
82
/*
 * Event record exchanged with the Linux application: copied in by
 * linux_epoll_ctl() and copied out by epoll_kev_copyout(). The structure
 * is declared packed on amd64 only.
 */
struct epoll_event {
	uint32_t	events;		/* LINUX_EPOLL* event mask */
	epoll_udata_t	data;		/* opaque user data */
}
#if defined(__amd64__)
__attribute__((packed))
#endif
;
91
/* Upper bound on the maxevents argument accepted by linux_epoll_wait(). */
#define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
93
/* Forward declarations of the file-local helpers defined below. */
static void	epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
static int	epoll_to_kevent(struct thread *td, struct file *epfp,
		    int fd, struct epoll_event *l_event, int *kev_flags,
		    struct kevent *kevent, int *nkevents);
static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
static int	epoll_delete_event(struct thread *td, struct file *epfp,
		    int fd, int filter);
static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
		    int fd);
105
/* Cookie passed to epoll_kev_copyin() through struct kevent_copyops. */
struct epoll_copyin_args {
	struct kevent	*changelist;	/* next kernel-resident change to hand out */
};
109
/* Cookie passed to epoll_kev_copyout() through struct kevent_copyops. */
struct epoll_copyout_args {
	struct epoll_event	*leventlist;	/* next free slot in the user buffer */
	struct proc		*p;		/* process whose udata vector is read */
	uint32_t		count;		/* events copied out so far */
	int			error;		/* first copyout() error, 0 if none */
};
116
117
118static void
119epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
120{
121	struct linux_pemuldata *pem;
122	struct epoll_emuldata *emd;
123	struct proc *p;
124
125	p = td->td_proc;
126
127	pem = pem_find(p);
128	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
129
130	LINUX_PEM_XLOCK(pem);
131	if (pem->epoll == NULL) {
132		emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
133		emd->fdc = fd;
134		pem->epoll = emd;
135	} else {
136		emd = pem->epoll;
137		if (fd > emd->fdc) {
138			emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
139			emd->fdc = fd;
140			pem->epoll = emd;
141		}
142	}
143	emd->udata[fd] = udata;
144	LINUX_PEM_XUNLOCK(pem);
145}
146
147static int
148epoll_create_common(struct thread *td, int flags)
149{
150	int error;
151
152	error = kern_kqueue(td, flags);
153	if (error)
154		return (error);
155
156	epoll_fd_install(td, EPOLL_DEF_SZ, 0);
157
158	return (0);
159}
160
161int
162linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
163{
164
165	/*
166	 * args->size is unused. Linux just tests it
167	 * and then forgets it as well.
168	 */
169	if (args->size <= 0)
170		return (EINVAL);
171
172	return (epoll_create_common(td, 0));
173}
174
175int
176linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
177{
178	int flags;
179
180	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
181		return (EINVAL);
182
183	flags = 0;
184	if ((args->flags & LINUX_O_CLOEXEC) != 0)
185		flags |= O_CLOEXEC;
186
187	return (epoll_create_common(td, flags));
188}
189
190/* Structure converting function from epoll to kevent. */
191static int
192epoll_to_kevent(struct thread *td, struct file *epfp,
193    int fd, struct epoll_event *l_event, int *kev_flags,
194    struct kevent *kevent, int *nkevents)
195{
196	uint32_t levents = l_event->events;
197	struct linux_pemuldata *pem;
198	struct proc *p;
199
200	/* flags related to how event is registered */
201	if ((levents & LINUX_EPOLLONESHOT) != 0)
202		*kev_flags |= EV_ONESHOT;
203	if ((levents & LINUX_EPOLLET) != 0)
204		*kev_flags |= EV_CLEAR;
205
206	/* flags related to what event is registered */
207	if ((levents & LINUX_EPOLL_EVRD) != 0) {
208		EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0);
209		++(*nkevents);
210	}
211	if ((levents & LINUX_EPOLL_EVWR) != 0) {
212		EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0);
213		++(*nkevents);
214	}
215
216	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
217		p = td->td_proc;
218
219		pem = pem_find(p);
220		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
221		KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));
222
223		LINUX_PEM_XLOCK(pem);
224		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
225			pem->flags |= LINUX_XUNSUP_EPOLL;
226			LINUX_PEM_XUNLOCK(pem);
227			linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n",
228			    levents);
229		} else
230			LINUX_PEM_XUNLOCK(pem);
231		return (EINVAL);
232	}
233
234	return (0);
235}
236
237/*
238 * Structure converting function from kevent to epoll. In a case
239 * this is called on error in registration we store the error in
240 * event->data and pick it up later in linux_epoll_ctl().
241 */
242static void
243kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
244{
245
246	if ((kevent->flags & EV_ERROR) != 0)
247		return;
248
249	switch (kevent->filter) {
250	case EVFILT_READ:
251		l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI;
252	break;
253	case EVFILT_WRITE:
254		l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM;
255	break;
256	}
257}
258
259/*
260 * Copyout callback used by kevent. This converts kevent
261 * events to epoll events and copies them back to the
262 * userspace. This is also called on error on registering
263 * of the filter.
264 */
265static int
266epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
267{
268	struct epoll_copyout_args *args;
269	struct linux_pemuldata *pem;
270	struct epoll_emuldata *emd;
271	struct epoll_event *eep;
272	int error, fd, i;
273
274	args = (struct epoll_copyout_args*) arg;
275	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
276
277	pem = pem_find(args->p);
278	KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
279	LINUX_PEM_SLOCK(pem);
280	emd = pem->epoll;
281	KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));
282
283	for (i = 0; i < count; i++) {
284		kevent_to_epoll(&kevp[i], &eep[i]);
285
286		fd = kevp[i].ident;
287		KASSERT(fd <= emd->fdc, ("epoll user data vector"
288						    " is too small.\n"));
289		eep[i].data = emd->udata[fd];
290	}
291	LINUX_PEM_SUNLOCK(pem);
292
293	error = copyout(eep, args->leventlist, count * sizeof(*eep));
294	if (error == 0) {
295		args->leventlist += count;
296		args->count += count;
297	} else if (args->error == 0)
298		args->error = error;
299
300	free(eep, M_EPOLL);
301	return (error);
302}
303
304/*
305 * Copyin callback used by kevent. This copies already
306 * converted filters from kernel memory to the kevent
307 * internal kernel memory. Hence the memcpy instead of
308 * copyin.
309 */
310static int
311epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
312{
313	struct epoll_copyin_args *args;
314
315	args = (struct epoll_copyin_args*) arg;
316
317	memcpy(kevp, args->changelist, count * sizeof(*kevp));
318	args->changelist += count;
319
320	return (0);
321}
322
323/*
324 * Load epoll filter, convert it to kevent filter
325 * and load it into kevent subsystem.
326 */
327int
328linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
329{
330	struct file *epfp, *fp;
331	struct epoll_copyin_args ciargs;
332	struct kevent kev[2];
333	struct kevent_copyops k_ops = { &ciargs,
334					NULL,
335					epoll_kev_copyin};
336	struct epoll_event le;
337	cap_rights_t rights;
338	int kev_flags;
339	int nchanges = 0;
340	int error;
341
342	if (args->op != LINUX_EPOLL_CTL_DEL) {
343		error = copyin(args->event, &le, sizeof(le));
344		if (error != 0)
345			return (error);
346	}
347
348	error = fget(td, args->epfd,
349	    cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp);
350	if (error != 0)
351		return (error);
352	if (epfp->f_type != DTYPE_KQUEUE)
353		goto leave1;
354
355	 /* Protect user data vector from incorrectly supplied fd. */
356	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
357	if (error != 0)
358		goto leave1;
359
360	/* Linux disallows spying on himself */
361	if (epfp == fp) {
362		error = EINVAL;
363		goto leave0;
364	}
365
366	ciargs.changelist = kev;
367
368	switch (args->op) {
369	case LINUX_EPOLL_CTL_MOD:
370		/*
371		 * We don't memorize which events were set for this FD
372		 * on this level, so just delete all we could have set:
373		 * EVFILT_READ and EVFILT_WRITE, ignoring any errors
374		 */
375		error = epoll_delete_all_events(td, epfp, args->fd);
376		if (error)
377			goto leave0;
378		/* FALLTHROUGH */
379
380	case LINUX_EPOLL_CTL_ADD:
381			kev_flags = EV_ADD | EV_ENABLE;
382		break;
383
384	case LINUX_EPOLL_CTL_DEL:
385		/* CTL_DEL means unregister this fd with this epoll */
386		error = epoll_delete_all_events(td, epfp, args->fd);
387		goto leave0;
388
389	default:
390		error = EINVAL;
391		goto leave0;
392	}
393
394	error = epoll_to_kevent(td, epfp, args->fd, &le, &kev_flags,
395	    kev, &nchanges);
396	if (error)
397		goto leave0;
398
399	epoll_fd_install(td, args->fd, le.data);
400
401	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
402
403leave0:
404	fdrop(fp, td);
405
406leave1:
407	fdrop(epfp, td);
408	return (error);
409}
410
411/*
412 * Wait for a filter to be triggered on the epoll file descriptor.
413 */
414int
415linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
416{
417	struct file *epfp;
418	struct timespec ts, *tsp;
419	cap_rights_t rights;
420	struct epoll_copyout_args coargs;
421	struct kevent_copyops k_ops = { &coargs,
422					epoll_kev_copyout,
423					NULL};
424	int error;
425
426	if (args->maxevents <= 0 || args->maxevents > LINUX_MAX_EVENTS)
427		return (EINVAL);
428
429	error = fget(td, args->epfd,
430	    cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp);
431	if (error != 0)
432		return (error);
433
434	coargs.leventlist = args->events;
435	coargs.p = td->td_proc;
436	coargs.count = 0;
437	coargs.error = 0;
438
439	if (args->timeout != -1) {
440		if (args->timeout < 0) {
441			error = EINVAL;
442			goto leave;
443		}
444		/* Convert from milliseconds to timespec. */
445		ts.tv_sec = args->timeout / 1000;
446		ts.tv_nsec = (args->timeout % 1000) * 1000000;
447		tsp = &ts;
448	} else {
449		tsp = NULL;
450	}
451
452	error = kern_kevent_fp(td, epfp, 0, args->maxevents, &k_ops, tsp);
453	if (error == 0 && coargs.error != 0)
454		error = coargs.error;
455
456	/*
457	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
458	 * Maybe we should translate that but I don't think it matters at all.
459	 */
460	if (error == 0)
461		td->td_retval[0] = coargs.count;
462leave:
463	fdrop(epfp, td);
464	return (error);
465}
466
467static int
468epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter)
469{
470	struct epoll_copyin_args ciargs;
471	struct kevent kev;
472	struct kevent_copyops k_ops = { &ciargs,
473					NULL,
474					epoll_kev_copyin};
475	int error;
476
477	ciargs.changelist = &kev;
478	EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
479
480	error = kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL);
481
482	/*
483	 * here we ignore ENONT, because we don't keep track of events here
484	 */
485	if (error == ENOENT)
486		error = 0;
487	return (error);
488}
489
490static int
491epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
492{
493	int error1, error2;
494
495	error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ);
496	error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE);
497
498	/* report any errors we got */
499	return (error1 == 0 ? error2 : error1);
500}
501