linux_event.c revision 293546
1/*- 2 * Copyright (c) 2007 Roman Divacky 3 * Copyright (c) 2014 Dmitry Chagin 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 
26 */ 27 28#include <sys/cdefs.h> 29__FBSDID("$FreeBSD: stable/10/sys/compat/linux/linux_event.c 293546 2016-01-09 16:44:17Z dchagin $"); 30 31#include "opt_compat.h" 32 33#include <sys/param.h> 34#include <sys/systm.h> 35#include <sys/imgact.h> 36#include <sys/kernel.h> 37#include <sys/limits.h> 38#include <sys/lock.h> 39#include <sys/mutex.h> 40#include <sys/capability.h> 41#include <sys/types.h> 42#include <sys/file.h> 43#include <sys/filedesc.h> 44#include <sys/errno.h> 45#include <sys/event.h> 46#include <sys/proc.h> 47#include <sys/sx.h> 48#include <sys/syscallsubr.h> 49#include <sys/timespec.h> 50 51#ifdef COMPAT_LINUX32 52#include <machine/../linux32/linux.h> 53#include <machine/../linux32/linux32_proto.h> 54#else 55#include <machine/../linux/linux.h> 56#include <machine/../linux/linux_proto.h> 57#endif 58 59#include <compat/linux/linux_emul.h> 60#include <compat/linux/linux_event.h> 61#include <compat/linux/linux_file.h> 62#include <compat/linux/linux_util.h> 63 64/* 65 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits 66 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only 67 * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied 68 * data verbatim. Therefore we allocate 64-bit memory block to pass 69 * user supplied data for every file descriptor. 
70 */ 71 72typedef uint64_t epoll_udata_t; 73 74struct epoll_emuldata { 75 uint32_t fdc; /* epoll udata max index */ 76 epoll_udata_t udata[1]; /* epoll user data vector */ 77}; 78 79#define EPOLL_DEF_SZ 16 80#define EPOLL_SIZE(fdn) \ 81 (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) 82 83struct epoll_event { 84 uint32_t events; 85 epoll_udata_t data; 86} 87#if defined(__amd64__) 88__attribute__((packed)) 89#endif 90; 91 92#define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) 93 94static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); 95static int epoll_to_kevent(struct thread *td, struct file *epfp, 96 int fd, struct epoll_event *l_event, int *kev_flags, 97 struct kevent *kevent, int *nkevents); 98static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); 99static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); 100static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); 101static int epoll_delete_event(struct thread *td, struct file *epfp, 102 int fd, int filter); 103static int epoll_delete_all_events(struct thread *td, struct file *epfp, 104 int fd); 105 106struct epoll_copyin_args { 107 struct kevent *changelist; 108}; 109 110struct epoll_copyout_args { 111 struct epoll_event *leventlist; 112 struct proc *p; 113 uint32_t count; 114 int error; 115}; 116 117 118static void 119epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) 120{ 121 struct linux_pemuldata *pem; 122 struct epoll_emuldata *emd; 123 struct proc *p; 124 125 p = td->td_proc; 126 127 pem = pem_find(p); 128 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 129 130 LINUX_PEM_XLOCK(pem); 131 if (pem->epoll == NULL) { 132 emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 133 emd->fdc = fd; 134 pem->epoll = emd; 135 } else { 136 emd = pem->epoll; 137 if (fd > emd->fdc) { 138 emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); 139 emd->fdc = fd; 140 pem->epoll = emd; 141 } 142 } 
143 emd->udata[fd] = udata; 144 LINUX_PEM_XUNLOCK(pem); 145} 146 147static int 148epoll_create_common(struct thread *td, int flags) 149{ 150 int error; 151 152 error = kern_kqueue(td, flags); 153 if (error) 154 return (error); 155 156 epoll_fd_install(td, EPOLL_DEF_SZ, 0); 157 158 return (0); 159} 160 161int 162linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) 163{ 164 165 /* 166 * args->size is unused. Linux just tests it 167 * and then forgets it as well. 168 */ 169 if (args->size <= 0) 170 return (EINVAL); 171 172 return (epoll_create_common(td, 0)); 173} 174 175int 176linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) 177{ 178 int flags; 179 180 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) 181 return (EINVAL); 182 183 flags = 0; 184 if ((args->flags & LINUX_O_CLOEXEC) != 0) 185 flags |= O_CLOEXEC; 186 187 return (epoll_create_common(td, flags)); 188} 189 190/* Structure converting function from epoll to kevent. */ 191static int 192epoll_to_kevent(struct thread *td, struct file *epfp, 193 int fd, struct epoll_event *l_event, int *kev_flags, 194 struct kevent *kevent, int *nkevents) 195{ 196 uint32_t levents = l_event->events; 197 struct linux_pemuldata *pem; 198 struct proc *p; 199 200 /* flags related to how event is registered */ 201 if ((levents & LINUX_EPOLLONESHOT) != 0) 202 *kev_flags |= EV_ONESHOT; 203 if ((levents & LINUX_EPOLLET) != 0) 204 *kev_flags |= EV_CLEAR; 205 206 /* flags related to what event is registered */ 207 if ((levents & LINUX_EPOLL_EVRD) != 0) { 208 EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0); 209 ++(*nkevents); 210 } 211 if ((levents & LINUX_EPOLL_EVWR) != 0) { 212 EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0); 213 ++(*nkevents); 214 } 215 216 if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { 217 p = td->td_proc; 218 219 pem = pem_find(p); 220 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 221 KASSERT(pem->epoll != NULL, ("epoll proc epolldata not 
found.\n")); 222 223 LINUX_PEM_XLOCK(pem); 224 if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { 225 pem->flags |= LINUX_XUNSUP_EPOLL; 226 LINUX_PEM_XUNLOCK(pem); 227 linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n", 228 levents); 229 } else 230 LINUX_PEM_XUNLOCK(pem); 231 return (EINVAL); 232 } 233 234 return (0); 235} 236 237/* 238 * Structure converting function from kevent to epoll. In a case 239 * this is called on error in registration we store the error in 240 * event->data and pick it up later in linux_epoll_ctl(). 241 */ 242static void 243kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) 244{ 245 246 if ((kevent->flags & EV_ERROR) != 0) 247 return; 248 249 switch (kevent->filter) { 250 case EVFILT_READ: 251 l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI; 252 break; 253 case EVFILT_WRITE: 254 l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM; 255 break; 256 } 257} 258 259/* 260 * Copyout callback used by kevent. This converts kevent 261 * events to epoll events and copies them back to the 262 * userspace. This is also called on error on registering 263 * of the filter. 
264 */ 265static int 266epoll_kev_copyout(void *arg, struct kevent *kevp, int count) 267{ 268 struct epoll_copyout_args *args; 269 struct linux_pemuldata *pem; 270 struct epoll_emuldata *emd; 271 struct epoll_event *eep; 272 int error, fd, i; 273 274 args = (struct epoll_copyout_args*) arg; 275 eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); 276 277 pem = pem_find(args->p); 278 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); 279 LINUX_PEM_SLOCK(pem); 280 emd = pem->epoll; 281 KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); 282 283 for (i = 0; i < count; i++) { 284 kevent_to_epoll(&kevp[i], &eep[i]); 285 286 fd = kevp[i].ident; 287 KASSERT(fd <= emd->fdc, ("epoll user data vector" 288 " is too small.\n")); 289 eep[i].data = emd->udata[fd]; 290 } 291 LINUX_PEM_SUNLOCK(pem); 292 293 error = copyout(eep, args->leventlist, count * sizeof(*eep)); 294 if (error == 0) { 295 args->leventlist += count; 296 args->count += count; 297 } else if (args->error == 0) 298 args->error = error; 299 300 free(eep, M_EPOLL); 301 return (error); 302} 303 304/* 305 * Copyin callback used by kevent. This copies already 306 * converted filters from kernel memory to the kevent 307 * internal kernel memory. Hence the memcpy instead of 308 * copyin. 309 */ 310static int 311epoll_kev_copyin(void *arg, struct kevent *kevp, int count) 312{ 313 struct epoll_copyin_args *args; 314 315 args = (struct epoll_copyin_args*) arg; 316 317 memcpy(kevp, args->changelist, count * sizeof(*kevp)); 318 args->changelist += count; 319 320 return (0); 321} 322 323/* 324 * Load epoll filter, convert it to kevent filter 325 * and load it into kevent subsystem. 
326 */ 327int 328linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) 329{ 330 struct file *epfp, *fp; 331 struct epoll_copyin_args ciargs; 332 struct kevent kev[2]; 333 struct kevent_copyops k_ops = { &ciargs, 334 NULL, 335 epoll_kev_copyin}; 336 struct epoll_event le; 337 cap_rights_t rights; 338 int kev_flags; 339 int nchanges = 0; 340 int error; 341 342 if (args->op != LINUX_EPOLL_CTL_DEL) { 343 error = copyin(args->event, &le, sizeof(le)); 344 if (error != 0) 345 return (error); 346 } 347 348 error = fget(td, args->epfd, 349 cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); 350 if (error != 0) 351 return (error); 352 if (epfp->f_type != DTYPE_KQUEUE) 353 goto leave1; 354 355 /* Protect user data vector from incorrectly supplied fd. */ 356 error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); 357 if (error != 0) 358 goto leave1; 359 360 /* Linux disallows spying on himself */ 361 if (epfp == fp) { 362 error = EINVAL; 363 goto leave0; 364 } 365 366 ciargs.changelist = kev; 367 368 switch (args->op) { 369 case LINUX_EPOLL_CTL_MOD: 370 /* 371 * We don't memorize which events were set for this FD 372 * on this level, so just delete all we could have set: 373 * EVFILT_READ and EVFILT_WRITE, ignoring any errors 374 */ 375 error = epoll_delete_all_events(td, epfp, args->fd); 376 if (error) 377 goto leave0; 378 /* FALLTHROUGH */ 379 380 case LINUX_EPOLL_CTL_ADD: 381 kev_flags = EV_ADD | EV_ENABLE; 382 break; 383 384 case LINUX_EPOLL_CTL_DEL: 385 /* CTL_DEL means unregister this fd with this epoll */ 386 error = epoll_delete_all_events(td, epfp, args->fd); 387 goto leave0; 388 389 default: 390 error = EINVAL; 391 goto leave0; 392 } 393 394 error = epoll_to_kevent(td, epfp, args->fd, &le, &kev_flags, 395 kev, &nchanges); 396 if (error) 397 goto leave0; 398 399 epoll_fd_install(td, args->fd, le.data); 400 401 error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); 402 403leave0: 404 fdrop(fp, td); 405 406leave1: 407 
fdrop(epfp, td); 408 return (error); 409} 410 411/* 412 * Wait for a filter to be triggered on the epoll file descriptor. 413 */ 414int 415linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) 416{ 417 struct file *epfp; 418 struct timespec ts, *tsp; 419 cap_rights_t rights; 420 struct epoll_copyout_args coargs; 421 struct kevent_copyops k_ops = { &coargs, 422 epoll_kev_copyout, 423 NULL}; 424 int error; 425 426 if (args->maxevents <= 0 || args->maxevents > LINUX_MAX_EVENTS) 427 return (EINVAL); 428 429 error = fget(td, args->epfd, 430 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); 431 if (error != 0) 432 return (error); 433 434 coargs.leventlist = args->events; 435 coargs.p = td->td_proc; 436 coargs.count = 0; 437 coargs.error = 0; 438 439 if (args->timeout != -1) { 440 if (args->timeout < 0) { 441 error = EINVAL; 442 goto leave; 443 } 444 /* Convert from milliseconds to timespec. */ 445 ts.tv_sec = args->timeout / 1000; 446 ts.tv_nsec = (args->timeout % 1000) * 1000000; 447 tsp = &ts; 448 } else { 449 tsp = NULL; 450 } 451 452 error = kern_kevent_fp(td, epfp, 0, args->maxevents, &k_ops, tsp); 453 if (error == 0 && coargs.error != 0) 454 error = coargs.error; 455 456 /* 457 * kern_kevent might return ENOMEM which is not expected from epoll_wait. 458 * Maybe we should translate that but I don't think it matters at all. 
459 */ 460 if (error == 0) 461 td->td_retval[0] = coargs.count; 462leave: 463 fdrop(epfp, td); 464 return (error); 465} 466 467static int 468epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter) 469{ 470 struct epoll_copyin_args ciargs; 471 struct kevent kev; 472 struct kevent_copyops k_ops = { &ciargs, 473 NULL, 474 epoll_kev_copyin}; 475 int error; 476 477 ciargs.changelist = &kev; 478 EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0); 479 480 error = kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL); 481 482 /* 483 * here we ignore ENONT, because we don't keep track of events here 484 */ 485 if (error == ENOENT) 486 error = 0; 487 return (error); 488} 489 490static int 491epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) 492{ 493 int error1, error2; 494 495 error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ); 496 error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE); 497 498 /* report any errors we got */ 499 return (error1 == 0 ? error2 : error1); 500} 501