linux_sysvec.c revision 293535
1/*- 2 * Copyright (c) 2013 Dmitry Chagin 3 * Copyright (c) 2004 Tim J. Robbins 4 * Copyright (c) 2003 Peter Wemm 5 * Copyright (c) 2002 Doug Rabson 6 * Copyright (c) 1998-1999 Andrew Gallatin 7 * Copyright (c) 1994-1996 Søren Schmidt 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer 15 * in this position and unchanged. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. The name of the author may not be used to endorse or promote products 20 * derived from this software without specific prior written permission 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 23 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 25 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 27 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 31 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 */ 33 34#include <sys/cdefs.h> 35__FBSDID("$FreeBSD: stable/10/sys/amd64/linux/linux_sysvec.c 293535 2016-01-09 16:24:30Z dchagin $"); 36 37#include "opt_compat.h" 38 39#define __ELF_WORD_SIZE 64 40 41#include <sys/param.h> 42#include <sys/systm.h> 43#include <sys/exec.h> 44#include <sys/fcntl.h> 45#include <sys/imgact.h> 46#include <sys/imgact_elf.h> 47#include <sys/kernel.h> 48#include <sys/ktr.h> 49#include <sys/lock.h> 50#include <sys/malloc.h> 51#include <sys/module.h> 52#include <sys/mutex.h> 53#include <sys/proc.h> 54#include <sys/resourcevar.h> 55#include <sys/signalvar.h> 56#include <sys/sysctl.h> 57#include <sys/syscallsubr.h> 58#include <sys/sysent.h> 59#include <sys/sysproto.h> 60#include <sys/vnode.h> 61#include <sys/eventhandler.h> 62 63#include <vm/vm.h> 64#include <vm/pmap.h> 65#include <vm/vm_extern.h> 66#include <vm/vm_map.h> 67#include <vm/vm_object.h> 68#include <vm/vm_page.h> 69#include <vm/vm_param.h> 70 71#include <machine/cpu.h> 72#include <machine/md_var.h> 73#include <machine/pcb.h> 74#include <machine/specialreg.h> 75 76#include <amd64/linux/linux.h> 77#include <amd64/linux/linux_proto.h> 78#include <compat/linux/linux_emul.h> 79#include <compat/linux/linux_futex.h> 80#include <compat/linux/linux_ioctl.h> 81#include <compat/linux/linux_mib.h> 82#include <compat/linux/linux_misc.h> 83#include <compat/linux/linux_signal.h> 84#include <compat/linux/linux_sysproto.h> 85#include <compat/linux/linux_util.h> 86#include <compat/linux/linux_vdso.h> 87 88MODULE_VERSION(linux64, 1); 89 90#if BYTE_ORDER == LITTLE_ENDIAN 91#define SHELLMAGIC 0x2123 /* #! */ 92#else 93#define SHELLMAGIC 0x2321 94#endif 95 96#if defined(DEBUG) 97SYSCTL_PROC(_compat_linux, OID_AUTO, debug, 98 CTLTYPE_STRING | CTLFLAG_RW, 99 0, 0, linux_sysctl_debug, "A", 100 "Linux 64 debugging control"); 101#endif 102 103/* 104 * Allow these functions to use the ldebug() facility 105 * even though they are not syscalls themselves. Map them 106 * to syscall 0. 
This is slightly less bogus than using 107 * ldebug(sigreturn). 108 */ 109#define LINUX_SYS_linux_rt_sendsig 0 110 111const char *linux_kplatform; 112static int linux_szsigcode; 113static vm_object_t linux_shared_page_obj; 114static char *linux_shared_page_mapping; 115extern char _binary_linux_locore_o_start; 116extern char _binary_linux_locore_o_end; 117 118extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; 119 120SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); 121 122static register_t * linux_copyout_strings(struct image_params *imgp); 123static int elf_linux_fixup(register_t **stack_base, 124 struct image_params *iparams); 125static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel); 126static void linux_vdso_install(void *param); 127static void linux_vdso_deinstall(void *param); 128static void linux_set_syscall_retval(struct thread *td, int error); 129static int linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa); 130static void linux_exec_setregs(struct thread *td, struct image_params *imgp, 131 u_long stack); 132 133/* 134 * Linux syscalls return negative errno's, we do positive and map them 135 * Reference: 136 * FreeBSD: src/sys/sys/errno.h 137 * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h 138 * linux-2.6.17.8/include/asm-generic/errno.h 139 */ 140static int bsd_to_linux_errno[ELAST + 1] = { 141 -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, 142 -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, 143 -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, 144 -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, 145 -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, 146 -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, 147 -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, 148 -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, 149 -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, 150 -72, -67, -71 151}; 152 153int bsd_to_linux_signal[LINUX_SIGTBLSZ] = { 154 LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL, 155 
LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE, 156 LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS, 157 LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG, 158 LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD, 159 LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU, 160 LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH, 161 0, LINUX_SIGUSR1, LINUX_SIGUSR2 162}; 163 164int linux_to_bsd_signal[LINUX_SIGTBLSZ] = { 165 SIGHUP, SIGINT, SIGQUIT, SIGILL, 166 SIGTRAP, SIGABRT, SIGBUS, SIGFPE, 167 SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2, 168 SIGPIPE, SIGALRM, SIGTERM, SIGBUS, 169 SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP, 170 SIGTTIN, SIGTTOU, SIGURG, SIGXCPU, 171 SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH, 172 SIGIO, SIGURG, SIGSYS 173}; 174 175#define LINUX_T_UNKNOWN 255 176static int _bsd_to_linux_trapcode[] = { 177 LINUX_T_UNKNOWN, /* 0 */ 178 6, /* 1 T_PRIVINFLT */ 179 LINUX_T_UNKNOWN, /* 2 */ 180 3, /* 3 T_BPTFLT */ 181 LINUX_T_UNKNOWN, /* 4 */ 182 LINUX_T_UNKNOWN, /* 5 */ 183 16, /* 6 T_ARITHTRAP */ 184 254, /* 7 T_ASTFLT */ 185 LINUX_T_UNKNOWN, /* 8 */ 186 13, /* 9 T_PROTFLT */ 187 1, /* 10 T_TRCTRAP */ 188 LINUX_T_UNKNOWN, /* 11 */ 189 14, /* 12 T_PAGEFLT */ 190 LINUX_T_UNKNOWN, /* 13 */ 191 17, /* 14 T_ALIGNFLT */ 192 LINUX_T_UNKNOWN, /* 15 */ 193 LINUX_T_UNKNOWN, /* 16 */ 194 LINUX_T_UNKNOWN, /* 17 */ 195 0, /* 18 T_DIVIDE */ 196 2, /* 19 T_NMI */ 197 4, /* 20 T_OFLOW */ 198 5, /* 21 T_BOUND */ 199 7, /* 22 T_DNA */ 200 8, /* 23 T_DOUBLEFLT */ 201 9, /* 24 T_FPOPFLT */ 202 10, /* 25 T_TSSFLT */ 203 11, /* 26 T_SEGNPFLT */ 204 12, /* 27 T_STKFLT */ 205 18, /* 28 T_MCHK */ 206 19, /* 29 T_XMMFLT */ 207 15 /* 30 T_RESERVED */ 208}; 209#define bsd_to_linux_trapcode(code) \ 210 ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? 
\ 211 _bsd_to_linux_trapcode[(code)]: \ 212 LINUX_T_UNKNOWN) 213 214LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode); 215LINUX_VDSO_SYM_CHAR(linux_platform); 216 217/* 218 * If FreeBSD & Linux have a difference of opinion about what a trap 219 * means, deal with it here. 220 * 221 * MPSAFE 222 */ 223static int 224translate_traps(int signal, int trap_code) 225{ 226 227 if (signal != SIGBUS) 228 return signal; 229 switch (trap_code) { 230 case T_PROTFLT: 231 case T_TSSFLT: 232 case T_DOUBLEFLT: 233 case T_PAGEFLT: 234 return SIGSEGV; 235 default: 236 return signal; 237 } 238} 239 240static int 241linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa) 242{ 243 struct proc *p; 244 struct trapframe *frame; 245 246 p = td->td_proc; 247 frame = td->td_frame; 248 249 sa->args[0] = frame->tf_rdi; 250 sa->args[1] = frame->tf_rsi; 251 sa->args[2] = frame->tf_rdx; 252 sa->args[3] = frame->tf_rcx; 253 sa->args[4] = frame->tf_r8; 254 sa->args[5] = frame->tf_r9; 255 sa->code = frame->tf_rax; 256 257 if (sa->code >= p->p_sysent->sv_size) { 258 PROC_LOCK(p); 259 sigexit(td, SIGILL); 260 } else 261 sa->callp = &p->p_sysent->sv_table[sa->code]; 262 sa->narg = sa->callp->sy_narg; 263 264 td->td_retval[0] = 0; 265 return (0); 266} 267 268static void 269linux_set_syscall_retval(struct thread *td, int error) 270{ 271 struct trapframe *frame = td->td_frame; 272 273 /* 274 * On Linux only %rcx and %r11 values are not preserved across 275 * the syscall. 276 * So, do not clobber %rdx and %r10 277 */ 278 td->td_retval[1] = frame->tf_rdx; 279 frame->tf_r10 = frame->tf_rcx; 280 281 cpu_set_syscall_retval(td, error); 282 283 /* Restore all registers. 
*/ 284 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 285} 286 287static int 288elf_linux_fixup(register_t **stack_base, struct image_params *imgp) 289{ 290 Elf_Auxargs *args; 291 Elf_Addr *base; 292 Elf_Addr *pos; 293 struct ps_strings *arginfo; 294 struct proc *p; 295 296 p = imgp->proc; 297 arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; 298 299 KASSERT(curthread->td_proc == imgp->proc, 300 ("unsafe elf_linux_fixup(), should be curproc")); 301 base = (Elf64_Addr *)*stack_base; 302 args = (Elf64_Auxargs *)imgp->auxargs; 303 pos = base + (imgp->args->argc + imgp->args->envc + 2); 304 305 AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, 306 imgp->proc->p_sysent->sv_shared_page_base); 307 AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); 308 AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); 309 AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); 310 AUXARGS_ENTRY(pos, AT_PHENT, args->phent); 311 AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); 312 AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); 313 AUXARGS_ENTRY(pos, AT_BASE, args->base); 314 AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); 315 AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); 316 AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); 317 AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); 318 AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); 319 AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); 320 AUXARGS_ENTRY(pos, LINUX_AT_SECURE, 0); 321 AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); 322 AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, imgp->canary); 323 if (imgp->execpathp != 0) 324 AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, imgp->execpathp); 325 if (args->execfd != -1) 326 AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); 327 AUXARGS_ENTRY(pos, AT_NULL, 0); 328 free(imgp->auxargs, M_TEMP); 329 imgp->auxargs = NULL; 330 331 base--; 332 suword(base, (uint64_t)imgp->args->argc); 333 334 *stack_base = (register_t *)base; 335 return (0); 336} 337 338/* 339 * Copy strings out to the new process address space, 
constructing new arg 340 * and env vector tables. Return a pointer to the base so that it can be used 341 * as the initial stack pointer. 342 */ 343static register_t * 344linux_copyout_strings(struct image_params *imgp) 345{ 346 int argc, envc; 347 char **vectp; 348 char *stringp, *destp; 349 register_t *stack_base; 350 struct ps_strings *arginfo; 351 char canary[LINUX_AT_RANDOM_LEN]; 352 size_t execpath_len; 353 struct proc *p; 354 355 /* 356 * Calculate string base and vector table pointers. 357 */ 358 if (imgp->execpath != NULL && imgp->auxargs != NULL) 359 execpath_len = strlen(imgp->execpath) + 1; 360 else 361 execpath_len = 0; 362 363 p = imgp->proc; 364 arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; 365 destp = (caddr_t)arginfo - SPARE_USRSPACE - 366 roundup(sizeof(canary), sizeof(char *)) - 367 roundup(execpath_len, sizeof(char *)) - 368 roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *)); 369 370 if (execpath_len != 0) { 371 imgp->execpathp = (uintptr_t)arginfo - execpath_len; 372 copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len); 373 } 374 375 /* 376 * Prepare the canary for SSP. 377 */ 378 arc4rand(canary, sizeof(canary), 0); 379 imgp->canary = (uintptr_t)arginfo - 380 roundup(execpath_len, sizeof(char *)) - 381 roundup(sizeof(canary), sizeof(char *)); 382 copyout(canary, (void *)imgp->canary, sizeof(canary)); 383 384 /* 385 * If we have a valid auxargs ptr, prepare some room 386 * on the stack. 387 */ 388 if (imgp->auxargs) { 389 /* 390 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for 391 * lower compatibility. 392 */ 393 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : 394 (LINUX_AT_COUNT * 2); 395 396 /* 397 * The '+ 2' is for the null pointers at the end of each of 398 * the arg and env vector sets,and imgp->auxarg_size is room 399 * for argument of Runtime loader. 
400 */ 401 vectp = (char **)(destp - (imgp->args->argc + 402 imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *)); 403 404 } else { 405 /* 406 * The '+ 2' is for the null pointers at the end of each of 407 * the arg and env vector sets 408 */ 409 vectp = (char **)(destp - (imgp->args->argc + 410 imgp->args->envc + 2) * sizeof(char *)); 411 } 412 413 /* 414 * vectp also becomes our initial stack base 415 */ 416 stack_base = (register_t *)vectp; 417 418 stringp = imgp->args->begin_argv; 419 argc = imgp->args->argc; 420 envc = imgp->args->envc; 421 422 /* 423 * Copy out strings - arguments and environment. 424 */ 425 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace); 426 427 /* 428 * Fill in "ps_strings" struct for ps, w, etc. 429 */ 430 suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); 431 suword(&arginfo->ps_nargvstr, argc); 432 433 /* 434 * Fill in argument portion of vector table. 435 */ 436 for (; argc > 0; --argc) { 437 suword(vectp++, (long)(intptr_t)destp); 438 while (*stringp++ != 0) 439 destp++; 440 destp++; 441 } 442 443 /* a null vector table pointer separates the argp's from the envp's */ 444 suword(vectp++, 0); 445 446 suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); 447 suword(&arginfo->ps_nenvstr, envc); 448 449 /* 450 * Fill in environment portion of vector table. 451 */ 452 for (; envc > 0; --envc) { 453 suword(vectp++, (long)(intptr_t)destp); 454 while (*stringp++ != 0) 455 destp++; 456 destp++; 457 } 458 459 /* end of vector table is a null pointer */ 460 suword(vectp, 0); 461 return (stack_base); 462} 463 464/* 465 * Reset registers to default values on exec. 
466 */ 467static void 468linux_exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 469{ 470 struct trapframe *regs = td->td_frame; 471 struct pcb *pcb = td->td_pcb; 472 473 mtx_lock(&dt_lock); 474 if (td->td_proc->p_md.md_ldt != NULL) 475 user_ldt_free(td); 476 else 477 mtx_unlock(&dt_lock); 478 479 pcb->pcb_fsbase = 0; 480 pcb->pcb_gsbase = 0; 481 clear_pcb_flags(pcb, PCB_32BIT); 482 pcb->pcb_initial_fpucw = __LINUX_NPXCW__; 483 set_pcb_flags(pcb, PCB_FULL_IRET); 484 485 bzero((char *)regs, sizeof(struct trapframe)); 486 regs->tf_rip = imgp->entry_addr; 487 regs->tf_rsp = stack; 488 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 489 regs->tf_ss = _udatasel; 490 regs->tf_cs = _ucodesel; 491 regs->tf_ds = _udatasel; 492 regs->tf_es = _udatasel; 493 regs->tf_fs = _ufssel; 494 regs->tf_gs = _ugssel; 495 regs->tf_flags = TF_HASSEGS; 496 497 /* 498 * Reset the hardware debug registers if they were in use. 499 * They won't have any meaning for the newly exec'd process. 500 */ 501 if (pcb->pcb_flags & PCB_DBREGS) { 502 pcb->pcb_dr0 = 0; 503 pcb->pcb_dr1 = 0; 504 pcb->pcb_dr2 = 0; 505 pcb->pcb_dr3 = 0; 506 pcb->pcb_dr6 = 0; 507 pcb->pcb_dr7 = 0; 508 if (pcb == curpcb) { 509 /* 510 * Clear the debug registers on the running 511 * CPU, otherwise they will end up affecting 512 * the next process we switch to. 513 */ 514 reset_dbregs(); 515 } 516 clear_pcb_flags(pcb, PCB_DBREGS); 517 } 518 519 /* 520 * Drop the FP state if we hold it, so that the process gets a 521 * clean FP state if it uses the FPU again. 522 */ 523 fpstate_drop(td); 524} 525 526/* 527 * Copied from amd64/amd64/machdep.c 528 * 529 * XXX fpu state need? 
don't think so 530 */ 531int 532linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) 533{ 534 struct proc *p; 535 struct l_ucontext uc; 536 struct l_sigcontext *context; 537 struct trapframe *regs; 538 unsigned long rflags; 539 int error; 540 ksiginfo_t ksi; 541 542 regs = td->td_frame; 543 error = copyin((void *)regs->tf_rbx, &uc, sizeof(uc)); 544 if (error != 0) 545 return (error); 546 547 p = td->td_proc; 548 context = &uc.uc_mcontext; 549 rflags = context->sc_rflags; 550 551 /* 552 * Don't allow users to change privileged or reserved flags. 553 */ 554 /* 555 * XXX do allow users to change the privileged flag PSL_RF. 556 * The cpu sets PSL_RF in tf_rflags for faults. Debuggers 557 * should sometimes set it there too. tf_rflags is kept in 558 * the signal context during signal handling and there is no 559 * other place to remember it, so the PSL_RF bit may be 560 * corrupted by the signal handler without us knowing. 561 * Corruption of the PSL_RF bit at worst causes one more or 562 * one less debugger trap, so allowing it is fairly harmless. 563 */ 564 565#define RFLAG_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 566 if (!RFLAG_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { 567 printf("linux_rt_sigreturn: rflags = 0x%lx\n", rflags); 568 return (EINVAL); 569 } 570 571 /* 572 * Don't allow users to load a valid privileged %cs. Let the 573 * hardware check for invalid selectors, excess privilege in 574 * other selectors, invalid %eip's and invalid %esp's. 
575 */ 576#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 577 if (!CS_SECURE(context->sc_cs)) { 578 printf("linux_rt_sigreturn: cs = 0x%x\n", context->sc_cs); 579 ksiginfo_init_trap(&ksi); 580 ksi.ksi_signo = SIGBUS; 581 ksi.ksi_code = BUS_OBJERR; 582 ksi.ksi_trapno = T_PROTFLT; 583 ksi.ksi_addr = (void *)regs->tf_rip; 584 trapsignal(td, &ksi); 585 return (EINVAL); 586 } 587 588 PROC_LOCK(p); 589 linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); 590 SIG_CANTMASK(td->td_sigmask); 591 signotify(td); 592 PROC_UNLOCK(p); 593 594 regs->tf_rdi = context->sc_rdi; 595 regs->tf_rsi = context->sc_rsi; 596 regs->tf_rdx = context->sc_rdx; 597 regs->tf_rbp = context->sc_rbp; 598 regs->tf_rbx = context->sc_rbx; 599 regs->tf_rcx = context->sc_rcx; 600 regs->tf_rax = context->sc_rax; 601 regs->tf_rip = context->sc_rip; 602 regs->tf_rsp = context->sc_rsp; 603 regs->tf_r8 = context->sc_r8; 604 regs->tf_r9 = context->sc_r9; 605 regs->tf_r10 = context->sc_r10; 606 regs->tf_r11 = context->sc_r11; 607 regs->tf_r12 = context->sc_r12; 608 regs->tf_r13 = context->sc_r13; 609 regs->tf_r14 = context->sc_r14; 610 regs->tf_r15 = context->sc_r15; 611 regs->tf_cs = context->sc_cs; 612 regs->tf_err = context->sc_err; 613 regs->tf_rflags = rflags; 614 615 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 616 return (EJUSTRETURN); 617} 618 619/* 620 * copied from amd64/amd64/machdep.c 621 * 622 * Send an interrupt to process. 
623 */ 624static void 625linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 626{ 627 struct l_rt_sigframe sf, *sfp; 628 struct proc *p; 629 struct thread *td; 630 struct sigacts *psp; 631 caddr_t sp; 632 struct trapframe *regs; 633 int sig, code; 634 int oonstack; 635 636 td = curthread; 637 p = td->td_proc; 638 PROC_LOCK_ASSERT(p, MA_OWNED); 639 sig = ksi->ksi_signo; 640 psp = p->p_sigacts; 641 code = ksi->ksi_code; 642 mtx_assert(&psp->ps_mtx, MA_OWNED); 643 regs = td->td_frame; 644 oonstack = sigonstack(regs->tf_rsp); 645 646 LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u", 647 catcher, sig, mask, code); 648 649 /* Allocate space for the signal handler context. */ 650 if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 651 SIGISMEMBER(psp->ps_sigonstack, sig)) { 652 sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size - 653 sizeof(struct l_rt_sigframe); 654 } else 655 sp = (caddr_t)regs->tf_rsp - sizeof(struct l_rt_sigframe) - 128; 656 /* Align to 16 bytes. */ 657 sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul); 658 mtx_unlock(&psp->ps_mtx); 659 660 /* Translate the signal if appropriate. */ 661 sig = BSD_TO_LINUX_SIGNAL(sig); 662 663 /* Save user context. */ 664 bzero(&sf, sizeof(sf)); 665 bsd_to_linux_sigset(mask, &sf.sf_sc.uc_sigmask); 666 bsd_to_linux_sigset(mask, &sf.sf_sc.uc_mcontext.sc_mask); 667 668 sf.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); 669 sf.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; 670 sf.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 671 ? ((oonstack) ? 
LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; 672 PROC_UNLOCK(p); 673 674 sf.sf_sc.uc_mcontext.sc_rdi = regs->tf_rdi; 675 sf.sf_sc.uc_mcontext.sc_rsi = regs->tf_rsi; 676 sf.sf_sc.uc_mcontext.sc_rdx = regs->tf_rdx; 677 sf.sf_sc.uc_mcontext.sc_rbp = regs->tf_rbp; 678 sf.sf_sc.uc_mcontext.sc_rbx = regs->tf_rbx; 679 sf.sf_sc.uc_mcontext.sc_rcx = regs->tf_rcx; 680 sf.sf_sc.uc_mcontext.sc_rax = regs->tf_rax; 681 sf.sf_sc.uc_mcontext.sc_rip = regs->tf_rip; 682 sf.sf_sc.uc_mcontext.sc_rsp = regs->tf_rsp; 683 sf.sf_sc.uc_mcontext.sc_r8 = regs->tf_r8; 684 sf.sf_sc.uc_mcontext.sc_r9 = regs->tf_r9; 685 sf.sf_sc.uc_mcontext.sc_r10 = regs->tf_r10; 686 sf.sf_sc.uc_mcontext.sc_r11 = regs->tf_r11; 687 sf.sf_sc.uc_mcontext.sc_r12 = regs->tf_r12; 688 sf.sf_sc.uc_mcontext.sc_r13 = regs->tf_r13; 689 sf.sf_sc.uc_mcontext.sc_r14 = regs->tf_r14; 690 sf.sf_sc.uc_mcontext.sc_r15 = regs->tf_r15; 691 sf.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; 692 sf.sf_sc.uc_mcontext.sc_rflags = regs->tf_rflags; 693 sf.sf_sc.uc_mcontext.sc_err = regs->tf_err; 694 sf.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); 695 sf.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; 696 697 /* Build the argument list for the signal handler. */ 698 regs->tf_rdi = sig; /* arg 1 in %rdi */ 699 regs->tf_rax = 0; 700 regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ 701 regs->tf_rdx = (register_t)&sfp->sf_sc; /* arg 3 in %rdx */ 702 703 sf.sf_handler = catcher; 704 /* Fill in POSIX parts */ 705 ksiginfo_to_lsiginfo(ksi, &sf.sf_si, sig); 706 707 /* 708 * Copy the sigframe out to the user's stack. 
709 */ 710 if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 711#ifdef DEBUG 712 printf("process %ld has trashed its stack\n", (long)p->p_pid); 713#endif 714 PROC_LOCK(p); 715 sigexit(td, SIGILL); 716 } 717 718 regs->tf_rsp = (long)sfp; 719 regs->tf_rip = linux_rt_sigcode; 720 regs->tf_rflags &= ~(PSL_T | PSL_D); 721 regs->tf_cs = _ucodesel; 722 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 723 PROC_LOCK(p); 724 mtx_lock(&psp->ps_mtx); 725} 726 727/* 728 * If a linux binary is exec'ing something, try this image activator 729 * first. We override standard shell script execution in order to 730 * be able to modify the interpreter path. We only do this if a linux 731 * binary is doing the exec, so we do not create an EXEC module for it. 732 */ 733static int exec_linux_imgact_try(struct image_params *iparams); 734 735static int 736exec_linux_imgact_try(struct image_params *imgp) 737{ 738 const char *head = (const char *)imgp->image_header; 739 char *rpath; 740 int error = -1, len; 741 742 /* 743 * The interpreter for shell scripts run from a linux binary needs 744 * to be located in /compat/linux if possible in order to recursively 745 * maintain linux path emulation. 746 */ 747 if (((const short *)head)[0] == SHELLMAGIC) { 748 /* 749 * Run our normal shell image activator. If it succeeds 750 * attempt to use the alternate path for the interpreter. 751 * If an alternate path is found, use our stringspace 752 * to store it. 
753 */ 754 if ((error = exec_shell_imgact(imgp)) == 0) { 755 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), 756 imgp->interpreter_name, UIO_SYSSPACE, 757 &rpath, 0, AT_FDCWD); 758 if (rpath != NULL) { 759 len = strlen(rpath) + 1; 760 761 if (len <= MAXSHELLCMDLEN) 762 memcpy(imgp->interpreter_name, 763 rpath, len); 764 free(rpath, M_TEMP); 765 } 766 } 767 } 768 return(error); 769} 770 771struct sysentvec elf_linux_sysvec = { 772 .sv_size = LINUX_SYS_MAXSYSCALL, 773 .sv_table = linux_sysent, 774 .sv_mask = 0, 775 .sv_sigsize = LINUX_SIGTBLSZ, 776 .sv_sigtbl = bsd_to_linux_signal, 777 .sv_errsize = ELAST + 1, 778 .sv_errtbl = bsd_to_linux_errno, 779 .sv_transtrap = translate_traps, 780 .sv_fixup = elf_linux_fixup, 781 .sv_sendsig = linux_rt_sendsig, 782 .sv_sigcode = &_binary_linux_locore_o_start, 783 .sv_szsigcode = &linux_szsigcode, 784 .sv_prepsyscall = NULL, 785 .sv_name = "Linux ELF64", 786 .sv_coredump = elf64_coredump, 787 .sv_imgact_try = exec_linux_imgact_try, 788 .sv_minsigstksz = LINUX_MINSIGSTKSZ, 789 .sv_pagesize = PAGE_SIZE, 790 .sv_minuser = VM_MIN_ADDRESS, 791 .sv_maxuser = VM_MAXUSER_ADDRESS, 792 .sv_usrstack = USRSTACK, 793 .sv_psstrings = PS_STRINGS, 794 .sv_stackprot = VM_PROT_ALL, 795 .sv_copyout_strings = linux_copyout_strings, 796 .sv_setregs = linux_exec_setregs, 797 .sv_fixlimit = NULL, 798 .sv_maxssiz = NULL, 799 .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP, 800 .sv_set_syscall_retval = linux_set_syscall_retval, 801 .sv_fetch_syscall_args = linux_fetch_syscall_args, 802 .sv_syscallnames = NULL, 803 .sv_shared_page_base = SHAREDPAGE, 804 .sv_shared_page_len = PAGE_SIZE, 805 .sv_schedtail = linux_schedtail, 806 .sv_thread_detach = linux_thread_detach 807}; 808 809static void 810linux_vdso_install(void *param) 811{ 812 813 linux_szsigcode = (&_binary_linux_locore_o_end - 814 &_binary_linux_locore_o_start); 815 816 if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) 817 panic("Linux invalid vdso size\n"); 818 819 
__elfN(linux_vdso_fixup)(&elf_linux_sysvec); 820 821 linux_shared_page_obj = __elfN(linux_shared_page_init) 822 (&linux_shared_page_mapping); 823 824 __elfN(linux_vdso_reloc)(&elf_linux_sysvec, SHAREDPAGE); 825 826 bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, 827 linux_szsigcode); 828 elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; 829 830 linux_kplatform = linux_shared_page_mapping + 831 (linux_platform - (caddr_t)SHAREDPAGE); 832} 833SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, 834 (sysinit_cfunc_t)linux_vdso_install, NULL); 835 836static void 837linux_vdso_deinstall(void *param) 838{ 839 840 __elfN(linux_shared_page_fini)(linux_shared_page_obj); 841}; 842SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, 843 (sysinit_cfunc_t)linux_vdso_deinstall, NULL); 844 845static char GNULINUX_ABI_VENDOR[] = "GNU"; 846static int GNULINUX_ABI_DESC = 0; 847 848static boolean_t 849linux_trans_osrel(const Elf_Note *note, int32_t *osrel) 850{ 851 const Elf32_Word *desc; 852 uintptr_t p; 853 854 p = (uintptr_t)(note + 1); 855 p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); 856 857 desc = (const Elf32_Word *)p; 858 if (desc[0] != GNULINUX_ABI_DESC) 859 return (FALSE); 860 861 /* 862 * For linux we encode osrel as follows (see linux_mib.c): 863 * VVVMMMIII (version, major, minor), see linux_mib.c. 
864 */ 865 *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; 866 867 return (TRUE); 868} 869 870static Elf_Brandnote linux64_brandnote = { 871 .hdr.n_namesz = sizeof(GNULINUX_ABI_VENDOR), 872 .hdr.n_descsz = 16, 873 .hdr.n_type = 1, 874 .vendor = GNULINUX_ABI_VENDOR, 875 .flags = BN_TRANSLATE_OSREL, 876 .trans_osrel = linux_trans_osrel 877}; 878 879static Elf64_Brandinfo linux_glibc2brand = { 880 .brand = ELFOSABI_LINUX, 881 .machine = EM_X86_64, 882 .compat_3_brand = "Linux", 883 .emul_path = "/compat/linux", 884 .interp_path = "/lib64/ld-linux-x86-64.so.2", 885 .sysvec = &elf_linux_sysvec, 886 .interp_newpath = NULL, 887 .brand_note = &linux64_brandnote, 888 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE 889}; 890 891static Elf64_Brandinfo linux_glibc2brandshort = { 892 .brand = ELFOSABI_LINUX, 893 .machine = EM_X86_64, 894 .compat_3_brand = "Linux", 895 .emul_path = "/compat/linux", 896 .interp_path = "/lib64/ld-linux.so.2", 897 .sysvec = &elf_linux_sysvec, 898 .interp_newpath = NULL, 899 .brand_note = &linux64_brandnote, 900 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE 901}; 902 903Elf64_Brandinfo *linux_brandlist[] = { 904 &linux_glibc2brand, 905 &linux_glibc2brandshort, 906 NULL 907}; 908 909static int 910linux64_elf_modevent(module_t mod, int type, void *data) 911{ 912 Elf64_Brandinfo **brandinfo; 913 int error; 914 struct linux_ioctl_handler **lihp; 915 916 error = 0; 917 918 switch(type) { 919 case MOD_LOAD: 920 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 921 ++brandinfo) 922 if (elf64_insert_brand_entry(*brandinfo) < 0) 923 error = EINVAL; 924 if (error == 0) { 925 SET_FOREACH(lihp, linux_ioctl_handler_set) 926 linux_ioctl_register_handler(*lihp); 927 LIST_INIT(&futex_list); 928 mtx_init(&futex_mtx, "ftllk64", NULL, MTX_DEF); 929 stclohz = (stathz ? 
stathz : hz); 930 if (bootverbose) 931 printf("Linux x86-64 ELF exec handler installed\n"); 932 } else 933 printf("cannot insert Linux x86-64 ELF brand handler\n"); 934 break; 935 case MOD_UNLOAD: 936 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 937 ++brandinfo) 938 if (elf64_brand_inuse(*brandinfo)) 939 error = EBUSY; 940 if (error == 0) { 941 for (brandinfo = &linux_brandlist[0]; 942 *brandinfo != NULL; ++brandinfo) 943 if (elf64_remove_brand_entry(*brandinfo) < 0) 944 error = EINVAL; 945 } 946 if (error == 0) { 947 SET_FOREACH(lihp, linux_ioctl_handler_set) 948 linux_ioctl_unregister_handler(*lihp); 949 mtx_destroy(&futex_mtx); 950 if (bootverbose) 951 printf("Linux ELF exec handler removed\n"); 952 } else 953 printf("Could not deinstall ELF interpreter entry\n"); 954 break; 955 default: 956 return (EOPNOTSUPP); 957 } 958 return (error); 959} 960 961static moduledata_t linux64_elf_mod = { 962 "linux64elf", 963 linux64_elf_modevent, 964 0 965}; 966 967DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); 968MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); 969