linux_event.c revision 321026
1299781Sjmcneill/*- 2299781Sjmcneill * Copyright (c) 2007 Roman Divacky 3299781Sjmcneill * Copyright (c) 2014 Dmitry Chagin 4299781Sjmcneill * All rights reserved. 5299781Sjmcneill * 6299781Sjmcneill * Redistribution and use in source and binary forms, with or without 7299781Sjmcneill * modification, are permitted provided that the following conditions 8299781Sjmcneill * are met: 9299781Sjmcneill * 1. Redistributions of source code must retain the above copyright 10299781Sjmcneill * notice, this list of conditions and the following disclaimer. 11299781Sjmcneill * 2. Redistributions in binary form must reproduce the above copyright 12299781Sjmcneill * notice, this list of conditions and the following disclaimer in the 13299781Sjmcneill * documentation and/or other materials provided with the distribution. 14299781Sjmcneill * 15299781Sjmcneill * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16299781Sjmcneill * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17299781Sjmcneill * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18299781Sjmcneill * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19299781Sjmcneill * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20299781Sjmcneill * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21299781Sjmcneill * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22299781Sjmcneill * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23299781Sjmcneill * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24299781Sjmcneill * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25299781Sjmcneill * SUCH DAMAGE. 26299781Sjmcneill */ 27299781Sjmcneill 28299781Sjmcneill#include <sys/cdefs.h> 29299781Sjmcneill__FBSDID("$FreeBSD: stable/10/sys/compat/linux/linux_event.c 321026 2017-07-15 18:25:59Z dchagin $"); 30299781Sjmcneill 31299781Sjmcneill#include "opt_compat.h" 32299781Sjmcneill 33299781Sjmcneill#include <sys/param.h> 34299781Sjmcneill#include <sys/systm.h> 35299781Sjmcneill#include <sys/imgact.h> 36299781Sjmcneill#include <sys/kernel.h> 37299781Sjmcneill#include <sys/limits.h> 38299781Sjmcneill#include <sys/lock.h> 39299781Sjmcneill#include <sys/mutex.h> 40299781Sjmcneill#include <sys/capsicum.h> 41299781Sjmcneill#include <sys/types.h> 42299781Sjmcneill#include <sys/file.h> 43299781Sjmcneill#include <sys/filedesc.h> 44299781Sjmcneill#include <sys/filio.h> 45299781Sjmcneill#include <sys/errno.h> 46299781Sjmcneill#include <sys/event.h> 47299781Sjmcneill#include <sys/poll.h> 48299781Sjmcneill#include <sys/proc.h> 49299781Sjmcneill#include <sys/selinfo.h> 50299781Sjmcneill#include <sys/sx.h> 51299781Sjmcneill#include <sys/syscallsubr.h> 52299781Sjmcneill#include <sys/timespec.h> 53299781Sjmcneill 54299781Sjmcneill#ifdef COMPAT_LINUX32 55299781Sjmcneill#include <machine/../linux32/linux.h> 56299781Sjmcneill#include <machine/../linux32/linux32_proto.h> 57299781Sjmcneill#else 58299781Sjmcneill#include <machine/../linux/linux.h> 59299781Sjmcneill#include <machine/../linux/linux_proto.h> 60299781Sjmcneill#endif 61299781Sjmcneill 62299781Sjmcneill#include <compat/linux/linux_emul.h> 63299781Sjmcneill#include <compat/linux/linux_event.h> 64299781Sjmcneill#include <compat/linux/linux_file.h> 65299781Sjmcneill#include <compat/linux/linux_util.h> 66299781Sjmcneill 67299781Sjmcneill/* 68299781Sjmcneill * epoll defines 'struct epoll_event' with the field 'data' as 64 bits 69299781Sjmcneill * on all architectures. But on 32 bit architectures BSD 'struct kevent' only 70299781Sjmcneill * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied 71299781Sjmcneill * data verbatuim. Therefore we allocate 64-bit memory block to pass 72299781Sjmcneill * user supplied data for every file descriptor. 73299781Sjmcneill */ 74299781Sjmcneill 75299781Sjmcneilltypedef uint64_t epoll_udata_t; 76299781Sjmcneill 77299781Sjmcneillstruct epoll_emuldata { 78299781Sjmcneill uint32_t fdc; /* epoll udata max index */ 79299781Sjmcneill epoll_udata_t udata[1]; /* epoll user data vector */ 80299781Sjmcneill}; 81299781Sjmcneill 82299781Sjmcneill#define EPOLL_DEF_SZ 16 83299781Sjmcneill#define EPOLL_SIZE(fdn) \ 84299781Sjmcneill (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) 85299781Sjmcneill 86299781Sjmcneillstruct epoll_event { 87299781Sjmcneill uint32_t events; 88299781Sjmcneill epoll_udata_t data; 89299781Sjmcneill} 90299781Sjmcneill#if defined(__amd64__) 91299781Sjmcneill__attribute__((packed)) 92299781Sjmcneill#endif 93299781Sjmcneill; 94299781Sjmcneill 95299781Sjmcneill#define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 96299781Sjmcneill 97299781Sjmcneillstatic void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); 98299781Sjmcneillstatic int epoll_to_kevent(struct thread *td, struct file *epfp, 99299781Sjmcneill int fd, struct epoll_event *l_event, int *kev_flags, 100299781Sjmcneill struct kevent *kevent, int *nkevents); 101299781Sjmcneillstatic void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); 102299781Sjmcneillstatic int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); 103299781Sjmcneillstatic int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); 104299781Sjmcneillstatic int epoll_delete_event(struct thread *td, struct file *epfp, 105299781Sjmcneill int fd, int filter); 106299781Sjmcneillstatic int epoll_delete_all_events(struct thread *td, struct file *epfp, 107299781Sjmcneill int fd); 108299781Sjmcneill 109299781Sjmcneillstruct epoll_copyin_args { 110299781Sjmcneill struct kevent *changelist; 111299781Sjmcneill}; 112299781Sjmcneill 113299781Sjmcneillstruct epoll_copyout_args { 114299781Sjmcneill struct epoll_event *leventlist; 115299781Sjmcneill struct proc *p; 116299781Sjmcneill uint32_t count; 117299781Sjmcneill int error; 118299781Sjmcneill}; 119299781Sjmcneill 120299781Sjmcneill/* eventfd */ 121299781Sjmcneilltypedef uint64_t eventfd_t; 122299781Sjmcneill 123299781Sjmcneillstatic fo_rdwr_t eventfd_read; 124299781Sjmcneillstatic fo_rdwr_t eventfd_write; 125299781Sjmcneillstatic fo_truncate_t eventfd_truncate; 126299862Sjmcneillstatic fo_ioctl_t eventfd_ioctl; 127299781Sjmcneillstatic fo_poll_t eventfd_poll; 128299781Sjmcneillstatic fo_kqfilter_t eventfd_kqfilter; 129299781Sjmcneillstatic fo_stat_t eventfd_stat; 130299781Sjmcneillstatic fo_close_t eventfd_close; 131299781Sjmcneill 132299781Sjmcneillstatic struct fileops eventfdops = { 133299781Sjmcneill .fo_read = eventfd_read, 134299781Sjmcneill .fo_write = eventfd_write, 135299781Sjmcneill .fo_truncate = eventfd_truncate, 136299781Sjmcneill .fo_ioctl = eventfd_ioctl, 137299781Sjmcneill .fo_poll = eventfd_poll, 138299781Sjmcneill .fo_kqfilter = eventfd_kqfilter, 139299781Sjmcneill .fo_stat = eventfd_stat, 140299781Sjmcneill .fo_close = eventfd_close, 141299862Sjmcneill .fo_chmod = invfo_chmod, 142299862Sjmcneill .fo_chown = invfo_chown, 143299781Sjmcneill .fo_sendfile = invfo_sendfile, 144299781Sjmcneill .fo_flags = DFLAG_PASSABLE 145299781Sjmcneill}; 146299781Sjmcneill 147299781Sjmcneillstatic void filt_eventfddetach(struct knote *kn); 148299781Sjmcneillstatic int filt_eventfdread(struct knote *kn, long hint); 149299781Sjmcneillstatic int filt_eventfdwrite(struct knote *kn, long hint); 150299781Sjmcneill 151299781Sjmcneillstatic struct filterops eventfd_rfiltops = { 152299781Sjmcneill .f_isfd = 1, 153299781Sjmcneill .f_detach = filt_eventfddetach, 154299781Sjmcneill .f_event = filt_eventfdread 155299781Sjmcneill}; 156299781Sjmcneillstatic struct filterops eventfd_wfiltops = { 157299781Sjmcneill .f_isfd = 1, 158299781Sjmcneill .f_detach = filt_eventfddetach, 159299781Sjmcneill .f_event = filt_eventfdwrite 160299781Sjmcneill}; 161299781Sjmcneill 162299781Sjmcneillstruct eventfd { 163299781Sjmcneill eventfd_t efd_count; 164299781Sjmcneill uint32_t efd_flags; 165299781Sjmcneill struct selinfo efd_sel; 166299781Sjmcneill struct mtx efd_lock; 167299781Sjmcneill}; 168299781Sjmcneill 169299781Sjmcneillstatic int eventfd_create(struct thread *td, uint32_t initval, int flags); 170299781Sjmcneill 171299781Sjmcneill 172299781Sjmcneillstatic void 173299781Sjmcneillepoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) 174299781Sjmcneill{ 175299781Sjmcneill struct linux_pemuldata *pem; 176299781Sjmcneill struct epoll_emuldata *emd; 177299781Sjmcneill struct proc *p; 178299781Sjmcneill 179299781Sjmcneill p = td->td_proc; 180299781Sjmcneill 181299781Sjmcneill pem = pem_find(p); 182299781Sjmcneill KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 183299781Sjmcneill 184299781Sjmcneill LINUX_PEM_XLOCK(pem); 185299781Sjmcneill if (pem->epoll == NULL) { 186299781Sjmcneill emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 187299781Sjmcneill emd->fdc = fd; 188299781Sjmcneill pem->epoll = emd; 189299781Sjmcneill } else { 190299781Sjmcneill emd = pem->epoll; 191299781Sjmcneill if (fd > emd->fdc) { 192299781Sjmcneill emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 193299781Sjmcneill emd->fdc = fd; 194299781Sjmcneill pem->epoll = emd; 195299781Sjmcneill } 196299781Sjmcneill } 197299781Sjmcneill emd->udata[fd] = udata; 198299781Sjmcneill LINUX_PEM_XUNLOCK(pem); 199299781Sjmcneill} 200299781Sjmcneill 201299781Sjmcneillstatic int 202299862Sjmcneillepoll_create_common(struct thread *td, int flags) 203299781Sjmcneill{ 204299781Sjmcneill int error; 205299781Sjmcneill 206299781Sjmcneill error = kern_kqueue(td, flags); 207299781Sjmcneill if (error) 208299781Sjmcneill return (error); 209299781Sjmcneill 210299781Sjmcneill epoll_fd_install(td, EPOLL_DEF_SZ, 0); 211299781Sjmcneill 212299862Sjmcneill return (0); 213299862Sjmcneill} 214299862Sjmcneill 215299862Sjmcneillint 216299862Sjmcneilllinux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) 217299862Sjmcneill{ 218299781Sjmcneill 219299862Sjmcneill /* 220299781Sjmcneill * args->size is unused. Linux just tests it 221299781Sjmcneill * and then forgets it as well. 222299781Sjmcneill */ 223299781Sjmcneill if (args->size <= 0) 224299781Sjmcneill return (EINVAL); 225299781Sjmcneill 226299781Sjmcneill return (epoll_create_common(td, 0)); 227299781Sjmcneill} 228299781Sjmcneill 229299781Sjmcneillint 230299781Sjmcneilllinux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) 231299781Sjmcneill{ 232299781Sjmcneill int flags; 233299781Sjmcneill 234299781Sjmcneill if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) 235299781Sjmcneill return (EINVAL); 236299781Sjmcneill 237299781Sjmcneill flags = 0; 238299781Sjmcneill if ((args->flags & LINUX_O_CLOEXEC) != 0) 239299781Sjmcneill flags |= O_CLOEXEC; 240299781Sjmcneill 241299781Sjmcneill return (epoll_create_common(td, flags)); 242299781Sjmcneill} 243299781Sjmcneill 244299781Sjmcneill/* Structure converting function from epoll to kevent. */ 245299781Sjmcneillstatic int 246299781Sjmcneillepoll_to_kevent(struct thread *td, struct file *epfp, 247299781Sjmcneill int fd, struct epoll_event *l_event, int *kev_flags, 248299781Sjmcneill struct kevent *kevent, int *nkevents) 249299781Sjmcneill{ 250299781Sjmcneill uint32_t levents = l_event->events; 251299781Sjmcneill struct linux_pemuldata *pem; 252299781Sjmcneill struct proc *p; 253299781Sjmcneill 254299781Sjmcneill /* flags related to how event is registered */ 255299781Sjmcneill if ((levents & LINUX_EPOLLONESHOT) != 0) 256299781Sjmcneill *kev_flags |= EV_ONESHOT; 257299781Sjmcneill if ((levents & LINUX_EPOLLET) != 0) 258299781Sjmcneill *kev_flags |= EV_CLEAR; 259299781Sjmcneill if ((levents & LINUX_EPOLLERR) != 0) 260299781Sjmcneill *kev_flags |= EV_ERROR; 261299781Sjmcneill if ((levents & LINUX_EPOLLRDHUP) != 0) 262299781Sjmcneill *kev_flags |= EV_EOF; 263299781Sjmcneill 264299781Sjmcneill /* flags related to what event is registered */ 265299781Sjmcneill if ((levents & LINUX_EPOLL_EVRD) != 0) { 266299781Sjmcneill EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0); 267299781Sjmcneill ++(*nkevents); 268299781Sjmcneill } 269299781Sjmcneill if ((levents & LINUX_EPOLL_EVWR) != 0) { 270299781Sjmcneill EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0); 271299781Sjmcneill ++(*nkevents); 272299781Sjmcneill } 273299781Sjmcneill 274299781Sjmcneill if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { 275299781Sjmcneill p = td->td_proc; 276299781Sjmcneill 277299781Sjmcneill pem = pem_find(p); 278299781Sjmcneill KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 279299781Sjmcneill KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n")); 280299781Sjmcneill 281299781Sjmcneill LINUX_PEM_XLOCK(pem); 282299781Sjmcneill if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { 283299781Sjmcneill pem->flags |= LINUX_XUNSUP_EPOLL; 284299781Sjmcneill LINUX_PEM_XUNLOCK(pem); 285299781Sjmcneill linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n", 286299781Sjmcneill levents); 287299781Sjmcneill } else 288299781Sjmcneill LINUX_PEM_XUNLOCK(pem); 289299781Sjmcneill return (EINVAL); 290299781Sjmcneill } 291299781Sjmcneill 292299781Sjmcneill return (0); 293299781Sjmcneill} 294299781Sjmcneill 295299781Sjmcneill/* 296299781Sjmcneill * Structure converting function from kevent to epoll. In a case 297299781Sjmcneill * this is called on error in registration we store the error in 298299781Sjmcneill * event->data and pick it up later in linux_epoll_ctl(). 299299781Sjmcneill */ 300299781Sjmcneillstatic void 301299781Sjmcneillkevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) 302299781Sjmcneill{ 303299781Sjmcneill 304299781Sjmcneill if ((kevent->flags & EV_ERROR) != 0) { 305299781Sjmcneill l_event->events = LINUX_EPOLLERR; 306299781Sjmcneill return; 307299781Sjmcneill } 308299781Sjmcneill 309299781Sjmcneill switch (kevent->filter) { 310299781Sjmcneill case EVFILT_READ: 311299781Sjmcneill l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI; 312299781Sjmcneill if ((kevent->flags & EV_EOF) != 0) 313299781Sjmcneill l_event->events |= LINUX_EPOLLRDHUP; 314299781Sjmcneill break; 315299781Sjmcneill case EVFILT_WRITE: 316299781Sjmcneill l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM; 317299781Sjmcneill break; 318299781Sjmcneill } 319299781Sjmcneill} 320299781Sjmcneill 321299781Sjmcneill/* 322299781Sjmcneill * Copyout callback used by kevent. This converts kevent 323299781Sjmcneill * events to epoll events and copies them back to the 324299781Sjmcneill * userspace. This is also called on error on registering 325299781Sjmcneill * of the filter. 326299781Sjmcneill */ 327299781Sjmcneillstatic int 328299781Sjmcneillepoll_kev_copyout(void *arg, struct kevent *kevp, int count) 329299781Sjmcneill{ 330299781Sjmcneill struct epoll_copyout_args *args; 331299781Sjmcneill struct linux_pemuldata *pem; 332299781Sjmcneill struct epoll_emuldata *emd; 333299781Sjmcneill struct epoll_event *eep; 334299781Sjmcneill int error, fd, i; 335299781Sjmcneill 336299781Sjmcneill args = (struct epoll_copyout_args*) arg; 337299781Sjmcneill eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); 338299781Sjmcneill 339299781Sjmcneill pem = pem_find(args->p); 340299781Sjmcneill KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 341299781Sjmcneill LINUX_PEM_SLOCK(pem); 342299781Sjmcneill emd = pem->epoll; 343299781Sjmcneill KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); 344299781Sjmcneill 345299781Sjmcneill for (i = 0; i < count; i++) { 346299781Sjmcneill kevent_to_epoll(&kevp[i], &eep[i]); 347299781Sjmcneill 348299781Sjmcneill fd = kevp[i].ident; 349299781Sjmcneill KASSERT(fd <= emd->fdc, ("epoll user data vector" 350299781Sjmcneill " is too small.\n")); 351299781Sjmcneill eep[i].data = emd->udata[fd]; 352299781Sjmcneill } 353299781Sjmcneill LINUX_PEM_SUNLOCK(pem); 354299781Sjmcneill 355299781Sjmcneill error = copyout(eep, args->leventlist, count * sizeof(*eep)); 356299781Sjmcneill if (error == 0) { 357299781Sjmcneill args->leventlist += count; 358299781Sjmcneill args->count += count; 359299781Sjmcneill } else if (args->error == 0) 360299781Sjmcneill args->error = error; 361299781Sjmcneill 362299781Sjmcneill free(eep, M_EPOLL); 363299781Sjmcneill return (error); 364299781Sjmcneill} 365299781Sjmcneill 366299781Sjmcneill/* 367299781Sjmcneill * Copyin callback used by kevent. This copies already 368299781Sjmcneill * converted filters from kernel memory to the kevent 369299781Sjmcneill * internal kernel memory. Hence the memcpy instead of 370299781Sjmcneill * copyin. 371299781Sjmcneill */ 372299781Sjmcneillstatic int 373299781Sjmcneillepoll_kev_copyin(void *arg, struct kevent *kevp, int count) 374299781Sjmcneill{ 375299781Sjmcneill struct epoll_copyin_args *args; 376299781Sjmcneill 377299781Sjmcneill args = (struct epoll_copyin_args*) arg; 378299781Sjmcneill 379299781Sjmcneill memcpy(kevp, args->changelist, count * sizeof(*kevp)); 380299781Sjmcneill args->changelist += count; 381299781Sjmcneill 382299781Sjmcneill return (0); 383299781Sjmcneill} 384299781Sjmcneill 385299781Sjmcneill/* 386299781Sjmcneill * Load epoll filter, convert it to kevent filter 387299781Sjmcneill * and load it into kevent subsystem. 388299781Sjmcneill */ 389299781Sjmcneillint 390299781Sjmcneilllinux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) 391299781Sjmcneill{ 392299781Sjmcneill struct file *epfp, *fp; 393299781Sjmcneill struct epoll_copyin_args ciargs; 394299781Sjmcneill struct kevent kev[2]; 395299781Sjmcneill struct kevent_copyops k_ops = { &ciargs, 396299781Sjmcneill NULL, 397299781Sjmcneill epoll_kev_copyin}; 398308324Smmel struct epoll_event le; 399299781Sjmcneill cap_rights_t rights; 400299781Sjmcneill int kev_flags; 401299781Sjmcneill int nchanges = 0; 402299781Sjmcneill int error; 403299781Sjmcneill 404299781Sjmcneill if (args->op != LINUX_EPOLL_CTL_DEL) { 405308324Smmel error = copyin(args->event, &le, sizeof(le)); 406299781Sjmcneill if (error != 0) 407299781Sjmcneill return (error); 408299781Sjmcneill } 409299781Sjmcneill 410299781Sjmcneill error = fget(td, args->epfd, 411299781Sjmcneill cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); 412299781Sjmcneill if (error != 0) 413299862Sjmcneill return (error); 414299781Sjmcneill if (epfp->f_type != DTYPE_KQUEUE) 415299781Sjmcneill goto leave1; 416299781Sjmcneill 417299781Sjmcneill /* Protect user data vector from incorrectly supplied fd. */ 418299781Sjmcneill error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); 419299781Sjmcneill if (error != 0) 420299781Sjmcneill goto leave1; 421299781Sjmcneill 422299781Sjmcneill /* Linux disallows spying on himself */ 423299781Sjmcneill if (epfp == fp) { 424299781Sjmcneill error = EINVAL; 425299781Sjmcneill goto leave0; 426299781Sjmcneill } 427299781Sjmcneill 428299781Sjmcneill ciargs.changelist = kev; 429299781Sjmcneill 430299781Sjmcneill switch (args->op) { 431299862Sjmcneill case LINUX_EPOLL_CTL_MOD: 432299781Sjmcneill /* 433299781Sjmcneill * We don't memorize which events were set for this FD 434299781Sjmcneill * on this level, so just delete all we could have set: 435299781Sjmcneill * EVFILT_READ and EVFILT_WRITE, ignoring any errors 436299781Sjmcneill */ 437299781Sjmcneill error = epoll_delete_all_events(td, epfp, args->fd); 438299781Sjmcneill if (error) 439299781Sjmcneill goto leave0; 440299781Sjmcneill /* FALLTHROUGH */ 441299781Sjmcneill 442299781Sjmcneill case LINUX_EPOLL_CTL_ADD: 443299781Sjmcneill kev_flags = EV_ADD | EV_ENABLE; 444299781Sjmcneill break; 445299859Sjmcneill 446299859Sjmcneill case LINUX_EPOLL_CTL_DEL: 447299859Sjmcneill /* CTL_DEL means unregister this fd with this epoll */ 448299859Sjmcneill error = epoll_delete_all_events(td, epfp, args->fd); 449299859Sjmcneill goto leave0; 450299859Sjmcneill 451299859Sjmcneill default: 452299859Sjmcneill error = EINVAL; 453299859Sjmcneill goto leave0; 454299859Sjmcneill } 455299859Sjmcneill 456299781Sjmcneill error = epoll_to_kevent(td, epfp, args->fd, &le, &kev_flags, 457299781Sjmcneill kev, &nchanges); 458299781Sjmcneill if (error) 459299781Sjmcneill goto leave0; 460299781Sjmcneill 461299781Sjmcneill epoll_fd_install(td, args->fd, le.data); 462299781Sjmcneill 463299781Sjmcneill error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); 464299781Sjmcneill 465299781Sjmcneillleave0: 466299781Sjmcneill fdrop(fp, td); 467299781Sjmcneill 468299781Sjmcneillleave1: 469299781Sjmcneill fdrop(epfp, td); 470299781Sjmcneill return (error); 471299781Sjmcneill} 472299781Sjmcneill 473299781Sjmcneill/* 474299781Sjmcneill * Wait for a filter to be triggered on the epoll file descriptor. 475309763Smanu */ 476309763Smanustatic int 477309763Smanulinux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, 478309763Smanu int maxevents, int timeout, sigset_t *uset) 479299781Sjmcneill{ 480 struct file *epfp; 481 struct timespec ts, *tsp; 482 cap_rights_t rights; 483 struct epoll_copyout_args coargs; 484 struct kevent_copyops k_ops = { &coargs, 485 epoll_kev_copyout, 486 NULL}; 487 int error; 488 489 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) 490 return (EINVAL); 491 492 if (uset != NULL) { 493 error = kern_sigprocmask(td, SIG_SETMASK, uset, 494 &td->td_oldsigmask, 0); 495 if (error != 0) 496 return (error); 497 td->td_pflags |= TDP_OLDMASK; 498 /* 499 * Make sure that ast() is called on return to 500 * usermode and TDP_OLDMASK is cleared, restoring old 501 * sigmask. 502 */ 503 thread_lock(td); 504 td->td_flags |= TDF_ASTPENDING; 505 thread_unlock(td); 506 } 507 508 error = fget(td, epfd, 509 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); 510 if (error != 0) 511 return (error); 512 513 coargs.leventlist = events; 514 coargs.p = td->td_proc; 515 coargs.count = 0; 516 coargs.error = 0; 517 518 if (timeout != -1) { 519 if (timeout < 0) { 520 error = EINVAL; 521 goto leave; 522 } 523 /* Convert from milliseconds to timespec. */ 524 ts.tv_sec = timeout / 1000; 525 ts.tv_nsec = (timeout % 1000) * 1000000; 526 tsp = &ts; 527 } else { 528 tsp = NULL; 529 } 530 531 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); 532 if (error == 0 && coargs.error != 0) 533 error = coargs.error; 534 535 /* 536 * kern_kevent might return ENOMEM which is not expected from epoll_wait. 537 * Maybe we should translate that but I don't think it matters at all. 538 */ 539 if (error == 0) 540 td->td_retval[0] = coargs.count; 541leave: 542 fdrop(epfp, td); 543 return (error); 544} 545 546int 547linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) 548{ 549 550 return (linux_epoll_wait_common(td, args->epfd, args->events, 551 args->maxevents, args->timeout, NULL)); 552} 553 554int 555linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) 556{ 557 sigset_t mask, *pmask; 558 l_sigset_t lmask; 559 int error; 560 561 if (args->mask != NULL) { 562 error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); 563 if (error != 0) 564 return (error); 565 linux_to_bsd_sigset(&lmask, &mask); 566 pmask = &mask; 567 } else 568 pmask = NULL; 569 return (linux_epoll_wait_common(td, args->epfd, args->events, 570 args->maxevents, args->timeout, pmask)); 571} 572 573static int 574epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter) 575{ 576 struct epoll_copyin_args ciargs; 577 struct kevent kev; 578 struct kevent_copyops k_ops = { &ciargs, 579 NULL, 580 epoll_kev_copyin}; 581 int error; 582 583 ciargs.changelist = &kev; 584 EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0); 585 586 error = kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL); 587 588 /* 589 * here we ignore ENONT, because we don't keep track of events here 590 */ 591 if (error == ENOENT) 592 error = 0; 593 return (error); 594} 595 596static int 597epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) 598{ 599 int error1, error2; 600 601 error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ); 602 error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE); 603 604 /* report any errors we got */ 605 return (error1 == 0 ? error2 : error1); 606} 607 608static int 609eventfd_create(struct thread *td, uint32_t initval, int flags) 610{ 611 struct filedesc *fdp; 612 struct eventfd *efd; 613 struct file *fp; 614 int fflags, fd, error; 615 616 fflags = 0; 617 if ((flags & LINUX_O_CLOEXEC) != 0) 618 fflags |= O_CLOEXEC; 619 620 fdp = td->td_proc->p_fd; 621 error = falloc(td, &fp, &fd, fflags); 622 if (error) 623 return (error); 624 625 efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); 626 efd->efd_flags = flags; 627 efd->efd_count = initval; 628 mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); 629 630 knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); 631 632 fflags = FREAD | FWRITE; 633 if ((flags & LINUX_O_NONBLOCK) != 0) 634 fflags |= FNONBLOCK; 635 636 finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); 637 fdrop(fp, td); 638 639 td->td_retval[0] = fd; 640 return (error); 641} 642 643int 644linux_eventfd(struct thread *td, struct linux_eventfd_args *args) 645{ 646 647 return (eventfd_create(td, args->initval, 0)); 648} 649 650int 651linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) 652{ 653 654 if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) 655 return (EINVAL); 656 657 return (eventfd_create(td, args->initval, args->flags)); 658} 659 660static int 661eventfd_close(struct file *fp, struct thread *td) 662{ 663 struct eventfd *efd; 664 665 efd = fp->f_data; 666 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 667 return (EBADF); 668 669 seldrain(&efd->efd_sel); 670 knlist_destroy(&efd->efd_sel.si_note); 671 672 fp->f_ops = &badfileops; 673 mtx_destroy(&efd->efd_lock); 674 free(efd, M_EPOLL); 675 676 return (0); 677} 678 679static int 680eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, 681 int flags, struct thread *td) 682{ 683 struct eventfd *efd; 684 eventfd_t count; 685 int error; 686 687 efd = fp->f_data; 688 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 689 return (EBADF); 690 691 if (uio->uio_resid < sizeof(eventfd_t)) 692 return (EINVAL); 693 694 error = 0; 695 mtx_lock(&efd->efd_lock); 696retry: 697 if (efd->efd_count == 0) { 698 if ((efd->efd_flags & LINUX_O_NONBLOCK) != 0) { 699 mtx_unlock(&efd->efd_lock); 700 return (EAGAIN); 701 } 702 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); 703 if (error == 0) 704 goto retry; 705 } 706 if (error == 0) { 707 if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { 708 count = 1; 709 --efd->efd_count; 710 } else { 711 count = efd->efd_count; 712 efd->efd_count = 0; 713 } 714 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 715 selwakeup(&efd->efd_sel); 716 wakeup(&efd->efd_count); 717 mtx_unlock(&efd->efd_lock); 718 error = uiomove(&count, sizeof(eventfd_t), uio); 719 } else 720 mtx_unlock(&efd->efd_lock); 721 722 return (error); 723} 724 725static int 726eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, 727 int flags, struct thread *td) 728{ 729 struct eventfd *efd; 730 eventfd_t count; 731 int error; 732 733 efd = fp->f_data; 734 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 735 return (EBADF); 736 737 if (uio->uio_resid < sizeof(eventfd_t)) 738 return (EINVAL); 739 740 error = uiomove(&count, sizeof(eventfd_t), uio); 741 if (error) 742 return (error); 743 if (count == UINT64_MAX) 744 return (EINVAL); 745 746 mtx_lock(&efd->efd_lock); 747retry: 748 if (UINT64_MAX - efd->efd_count <= count) { 749 if ((efd->efd_flags & LINUX_O_NONBLOCK) != 0) { 750 mtx_unlock(&efd->efd_lock); 751 /* Do not not return the number of bytes written */ 752 uio->uio_resid += sizeof(eventfd_t); 753 return (EAGAIN); 754 } 755 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, 756 PCATCH, "lefdwr", 0); 757 if (error == 0) 758 goto retry; 759 } 760 if (error == 0) { 761 efd->efd_count += count; 762 KNOTE_LOCKED(&efd->efd_sel.si_note, 0); 763 selwakeup(&efd->efd_sel); 764 wakeup(&efd->efd_count); 765 } 766 mtx_unlock(&efd->efd_lock); 767 768 return (error); 769} 770 771static int 772eventfd_poll(struct file *fp, int events, struct ucred *active_cred, 773 struct thread *td) 774{ 775 struct eventfd *efd; 776 int revents = 0; 777 778 efd = fp->f_data; 779 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 780 return (POLLERR); 781 782 mtx_lock(&efd->efd_lock); 783 if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) 784 revents |= events & (POLLIN|POLLRDNORM); 785 if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) 786 revents |= events & (POLLOUT|POLLWRNORM); 787 if (revents == 0) 788 selrecord(td, &efd->efd_sel); 789 mtx_unlock(&efd->efd_lock); 790 791 return (revents); 792} 793 794/*ARGSUSED*/ 795static int 796eventfd_kqfilter(struct file *fp, struct knote *kn) 797{ 798 struct eventfd *efd; 799 800 efd = fp->f_data; 801 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 802 return (EINVAL); 803 804 mtx_lock(&efd->efd_lock); 805 switch (kn->kn_filter) { 806 case EVFILT_READ: 807 kn->kn_fop = &eventfd_rfiltops; 808 break; 809 case EVFILT_WRITE: 810 kn->kn_fop = &eventfd_wfiltops; 811 break; 812 default: 813 mtx_unlock(&efd->efd_lock); 814 return (EINVAL); 815 } 816 817 kn->kn_hook = efd; 818 knlist_add(&efd->efd_sel.si_note, kn, 1); 819 mtx_unlock(&efd->efd_lock); 820 821 return (0); 822} 823 824static void 825filt_eventfddetach(struct knote *kn) 826{ 827 struct eventfd *efd = kn->kn_hook; 828 829 mtx_lock(&efd->efd_lock); 830 knlist_remove(&efd->efd_sel.si_note, kn, 1); 831 mtx_unlock(&efd->efd_lock); 832} 833 834/*ARGSUSED*/ 835static int 836filt_eventfdread(struct knote *kn, long hint) 837{ 838 struct eventfd *efd = kn->kn_hook; 839 int ret; 840 841 mtx_assert(&efd->efd_lock, MA_OWNED); 842 ret = (efd->efd_count > 0); 843 844 return (ret); 845} 846 847/*ARGSUSED*/ 848static int 849filt_eventfdwrite(struct knote *kn, long hint) 850{ 851 struct eventfd *efd = kn->kn_hook; 852 int ret; 853 854 mtx_assert(&efd->efd_lock, MA_OWNED); 855 ret = (UINT64_MAX - 1 > efd->efd_count); 856 857 return (ret); 858} 859 860/*ARGSUSED*/ 861static int 862eventfd_truncate(struct file *fp, off_t length, struct ucred *active_cred, 863 struct thread *td) 864{ 865 866 return (ENXIO); 867} 868 869/*ARGSUSED*/ 870static int 871eventfd_ioctl(struct file *fp, u_long cmd, void *data, 872 struct ucred *active_cred, struct thread *td) 873{ 874 struct eventfd *efd; 875 876 efd = fp->f_data; 877 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) 878 return (EINVAL); 879 880 switch (cmd) 881 { 882 case FIONBIO: 883 if (*(int *)data) 884 efd->efd_flags |= LINUX_O_NONBLOCK; 885 else 886 efd->efd_flags &= ~LINUX_O_NONBLOCK; 887 case FIOASYNC: 888 return (0); 889 default: 890 return (ENXIO); 891 } 892} 893 894/*ARGSUSED*/ 895static int 896eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, 897 struct thread *td) 898{ 899 900 return (ENXIO); 901} 902