linux_sysvec.c revision 293535
1/*-
2 * Copyright (c) 2013 Dmitry Chagin
3 * Copyright (c) 2004 Tim J. Robbins
4 * Copyright (c) 2003 Peter Wemm
5 * Copyright (c) 2002 Doug Rabson
6 * Copyright (c) 1998-1999 Andrew Gallatin
7 * Copyright (c) 1994-1996 Søren Schmidt
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer
15 *    in this position and unchanged.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. The name of the author may not be used to endorse or promote products
20 *    derived from this software without specific prior written permission
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <sys/cdefs.h>
35__FBSDID("$FreeBSD: stable/10/sys/amd64/linux/linux_sysvec.c 293535 2016-01-09 16:24:30Z dchagin $");
36
37#include "opt_compat.h"
38
39#define	__ELF_WORD_SIZE	64
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/exec.h>
44#include <sys/fcntl.h>
45#include <sys/imgact.h>
46#include <sys/imgact_elf.h>
47#include <sys/kernel.h>
48#include <sys/ktr.h>
49#include <sys/lock.h>
50#include <sys/malloc.h>
51#include <sys/module.h>
52#include <sys/mutex.h>
53#include <sys/proc.h>
54#include <sys/resourcevar.h>
55#include <sys/signalvar.h>
56#include <sys/sysctl.h>
57#include <sys/syscallsubr.h>
58#include <sys/sysent.h>
59#include <sys/sysproto.h>
60#include <sys/vnode.h>
61#include <sys/eventhandler.h>
62
63#include <vm/vm.h>
64#include <vm/pmap.h>
65#include <vm/vm_extern.h>
66#include <vm/vm_map.h>
67#include <vm/vm_object.h>
68#include <vm/vm_page.h>
69#include <vm/vm_param.h>
70
71#include <machine/cpu.h>
72#include <machine/md_var.h>
73#include <machine/pcb.h>
74#include <machine/specialreg.h>
75
76#include <amd64/linux/linux.h>
77#include <amd64/linux/linux_proto.h>
78#include <compat/linux/linux_emul.h>
79#include <compat/linux/linux_futex.h>
80#include <compat/linux/linux_ioctl.h>
81#include <compat/linux/linux_mib.h>
82#include <compat/linux/linux_misc.h>
83#include <compat/linux/linux_signal.h>
84#include <compat/linux/linux_sysproto.h>
85#include <compat/linux/linux_util.h>
86#include <compat/linux/linux_vdso.h>
87
88MODULE_VERSION(linux64, 1);
89
90#if BYTE_ORDER == LITTLE_ENDIAN
91#define SHELLMAGIC      0x2123 /* #! */
92#else
93#define SHELLMAGIC      0x2321
94#endif
95
96#if defined(DEBUG)
97SYSCTL_PROC(_compat_linux, OID_AUTO, debug,
98	    CTLTYPE_STRING | CTLFLAG_RW,
99	    0, 0, linux_sysctl_debug, "A",
100	    "Linux 64 debugging control");
101#endif
102
103/*
104 * Allow the this functions to use the ldebug() facility
105 * even though they are not syscalls themselves. Map them
106 * to syscall 0. This is slightly less bogus than using
107 * ldebug(sigreturn).
108 */
109#define	LINUX_SYS_linux_rt_sendsig	0
110
111const char *linux_kplatform;
112static int linux_szsigcode;
113static vm_object_t linux_shared_page_obj;
114static char *linux_shared_page_mapping;
115extern char _binary_linux_locore_o_start;
116extern char _binary_linux_locore_o_end;
117
118extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
119
120SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
121
122static register_t * linux_copyout_strings(struct image_params *imgp);
123static int	elf_linux_fixup(register_t **stack_base,
124		    struct image_params *iparams);
125static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel);
126static void	linux_vdso_install(void *param);
127static void	linux_vdso_deinstall(void *param);
128static void	linux_set_syscall_retval(struct thread *td, int error);
129static int	linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa);
130static void	linux_exec_setregs(struct thread *td, struct image_params *imgp,
131		    u_long stack);
132
133/*
134 * Linux syscalls return negative errno's, we do positive and map them
135 * Reference:
136 *   FreeBSD: src/sys/sys/errno.h
137 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
138 *            linux-2.6.17.8/include/asm-generic/errno.h
139 */
140static int bsd_to_linux_errno[ELAST + 1] = {
141	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
142	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
143	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
144	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
145	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
146	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
147	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
148	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
149	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
150	 -72, -67, -71
151};
152
153int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
154	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
155	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
156	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
157	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
158	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
159	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
160	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
161	0, LINUX_SIGUSR1, LINUX_SIGUSR2
162};
163
164int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
165	SIGHUP, SIGINT, SIGQUIT, SIGILL,
166	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
167	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
168	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
169	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
170	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
171	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
172	SIGIO, SIGURG, SIGSYS
173};
174
/* Value reported for any FreeBSD trap that has no entry below. */
#define LINUX_T_UNKNOWN  255

/*
 * Trap number translation table, indexed by the FreeBSD trap number named
 * in each comment.  Slots without a named trap yield LINUX_T_UNKNOWN.
 */
static int _bsd_to_linux_trapcode[] = {
	[0]  = LINUX_T_UNKNOWN,
	[1]  = 6,			/* T_PRIVINFLT */
	[2]  = LINUX_T_UNKNOWN,
	[3]  = 3,			/* T_BPTFLT */
	[4]  = LINUX_T_UNKNOWN,
	[5]  = LINUX_T_UNKNOWN,
	[6]  = 16,			/* T_ARITHTRAP */
	[7]  = 254,			/* T_ASTFLT */
	[8]  = LINUX_T_UNKNOWN,
	[9]  = 13,			/* T_PROTFLT */
	[10] = 1,			/* T_TRCTRAP */
	[11] = LINUX_T_UNKNOWN,
	[12] = 14,			/* T_PAGEFLT */
	[13] = LINUX_T_UNKNOWN,
	[14] = 17,			/* T_ALIGNFLT */
	[15] = LINUX_T_UNKNOWN,
	[16] = LINUX_T_UNKNOWN,
	[17] = LINUX_T_UNKNOWN,
	[18] = 0,			/* T_DIVIDE */
	[19] = 2,			/* T_NMI */
	[20] = 4,			/* T_OFLOW */
	[21] = 5,			/* T_BOUND */
	[22] = 7,			/* T_DNA */
	[23] = 8,			/* T_DOUBLEFLT */
	[24] = 9,			/* T_FPOPFLT */
	[25] = 10,			/* T_TSSFLT */
	[26] = 11,			/* T_SEGNPFLT */
	[27] = 12,			/* T_STKFLT */
	[28] = 18,			/* T_MCHK */
	[29] = 19,			/* T_XMMFLT */
	[30] = 15			/* T_RESERVED */
};

/*
 * Bounds-checked lookup: anything past the end of the table is reported as
 * LINUX_T_UNKNOWN.  Note that (code) is evaluated more than once.
 */
#define bsd_to_linux_trapcode(code)					\
    ((code) >= sizeof(_bsd_to_linux_trapcode) /			\
	sizeof(*_bsd_to_linux_trapcode) ?				\
     LINUX_T_UNKNOWN :							\
     _bsd_to_linux_trapcode[(code)])
213
/*
 * Symbols provided through the LINUX_VDSO_SYM_* macros: linux_platform is
 * used as a character pointer and linux_rt_sigcode as an address below.
 */
LINUX_VDSO_SYM_CHAR(linux_platform);
LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode);
216
217/*
218 * If FreeBSD & Linux have a difference of opinion about what a trap
219 * means, deal with it here.
220 *
221 * MPSAFE
222 */
223static int
224translate_traps(int signal, int trap_code)
225{
226
227	if (signal != SIGBUS)
228		return signal;
229	switch (trap_code) {
230	case T_PROTFLT:
231	case T_TSSFLT:
232	case T_DOUBLEFLT:
233	case T_PAGEFLT:
234		return SIGSEGV;
235	default:
236		return signal;
237	}
238}
239
240static int
241linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
242{
243	struct proc *p;
244	struct trapframe *frame;
245
246	p = td->td_proc;
247	frame = td->td_frame;
248
249	sa->args[0] = frame->tf_rdi;
250	sa->args[1] = frame->tf_rsi;
251	sa->args[2] = frame->tf_rdx;
252	sa->args[3] = frame->tf_rcx;
253	sa->args[4] = frame->tf_r8;
254	sa->args[5] = frame->tf_r9;
255	sa->code = frame->tf_rax;
256
257	if (sa->code >= p->p_sysent->sv_size) {
258		PROC_LOCK(p);
259		sigexit(td, SIGILL);
260	} else
261		sa->callp = &p->p_sysent->sv_table[sa->code];
262	sa->narg = sa->callp->sy_narg;
263
264	td->td_retval[0] = 0;
265	return (0);
266}
267
268static void
269linux_set_syscall_retval(struct thread *td, int error)
270{
271	struct trapframe *frame = td->td_frame;
272
273	/*
274	 * On Linux only %rcx and %r11 values are not preserved across
275	 * the syscall.
276	 * So, do not clobber %rdx and %r10
277	 */
278	td->td_retval[1] = frame->tf_rdx;
279	frame->tf_r10 = frame->tf_rcx;
280
281	cpu_set_syscall_retval(td, error);
282
283	 /* Restore all registers. */
284	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
285}
286
287static int
288elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
289{
290	Elf_Auxargs *args;
291	Elf_Addr *base;
292	Elf_Addr *pos;
293	struct ps_strings *arginfo;
294	struct proc *p;
295
296	p = imgp->proc;
297	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
298
299	KASSERT(curthread->td_proc == imgp->proc,
300	    ("unsafe elf_linux_fixup(), should be curproc"));
301	base = (Elf64_Addr *)*stack_base;
302	args = (Elf64_Auxargs *)imgp->auxargs;
303	pos = base + (imgp->args->argc + imgp->args->envc + 2);
304
305	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR,
306	    imgp->proc->p_sysent->sv_shared_page_base);
307	AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature);
308	AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz);
309	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
310	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
311	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
312	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
313	AUXARGS_ENTRY(pos, AT_BASE, args->base);
314	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
315	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
316	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
317	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
318	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
319	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
320	AUXARGS_ENTRY(pos, LINUX_AT_SECURE, 0);
321	AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform));
322	AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, imgp->canary);
323	if (imgp->execpathp != 0)
324		AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, imgp->execpathp);
325	if (args->execfd != -1)
326		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
327	AUXARGS_ENTRY(pos, AT_NULL, 0);
328	free(imgp->auxargs, M_TEMP);
329	imgp->auxargs = NULL;
330
331	base--;
332	suword(base, (uint64_t)imgp->args->argc);
333
334	*stack_base = (register_t *)base;
335	return (0);
336}
337
338/*
339 * Copy strings out to the new process address space, constructing new arg
340 * and env vector tables. Return a pointer to the base so that it can be used
341 * as the initial stack pointer.
342 */
343static register_t *
344linux_copyout_strings(struct image_params *imgp)
345{
346	int argc, envc;
347	char **vectp;
348	char *stringp, *destp;
349	register_t *stack_base;
350	struct ps_strings *arginfo;
351	char canary[LINUX_AT_RANDOM_LEN];
352	size_t execpath_len;
353	struct proc *p;
354
355	/*
356	 * Calculate string base and vector table pointers.
357	 */
358	if (imgp->execpath != NULL && imgp->auxargs != NULL)
359		execpath_len = strlen(imgp->execpath) + 1;
360	else
361		execpath_len = 0;
362
363	p = imgp->proc;
364	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
365	destp =	(caddr_t)arginfo - SPARE_USRSPACE -
366	    roundup(sizeof(canary), sizeof(char *)) -
367	    roundup(execpath_len, sizeof(char *)) -
368	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
369
370	if (execpath_len != 0) {
371		imgp->execpathp = (uintptr_t)arginfo - execpath_len;
372		copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len);
373	}
374
375	/*
376	 * Prepare the canary for SSP.
377	 */
378	arc4rand(canary, sizeof(canary), 0);
379	imgp->canary = (uintptr_t)arginfo -
380	    roundup(execpath_len, sizeof(char *)) -
381	    roundup(sizeof(canary), sizeof(char *));
382	copyout(canary, (void *)imgp->canary, sizeof(canary));
383
384	/*
385	 * If we have a valid auxargs ptr, prepare some room
386	 * on the stack.
387	 */
388	if (imgp->auxargs) {
389		/*
390		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
391		 * lower compatibility.
392		 */
393		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
394		    (LINUX_AT_COUNT * 2);
395
396		/*
397		 * The '+ 2' is for the null pointers at the end of each of
398		 * the arg and env vector sets,and imgp->auxarg_size is room
399		 * for argument of Runtime loader.
400		 */
401		vectp = (char **)(destp - (imgp->args->argc +
402		    imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *));
403
404	} else {
405		/*
406		 * The '+ 2' is for the null pointers at the end of each of
407		 * the arg and env vector sets
408		 */
409		vectp = (char **)(destp - (imgp->args->argc +
410		    imgp->args->envc + 2) * sizeof(char *));
411	}
412
413	/*
414	 * vectp also becomes our initial stack base
415	 */
416	stack_base = (register_t *)vectp;
417
418	stringp = imgp->args->begin_argv;
419	argc = imgp->args->argc;
420	envc = imgp->args->envc;
421
422	/*
423	 * Copy out strings - arguments and environment.
424	 */
425	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
426
427	/*
428	 * Fill in "ps_strings" struct for ps, w, etc.
429	 */
430	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
431	suword(&arginfo->ps_nargvstr, argc);
432
433	/*
434	 * Fill in argument portion of vector table.
435	 */
436	for (; argc > 0; --argc) {
437		suword(vectp++, (long)(intptr_t)destp);
438		while (*stringp++ != 0)
439			destp++;
440		destp++;
441	}
442
443	/* a null vector table pointer separates the argp's from the envp's */
444	suword(vectp++, 0);
445
446	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
447	suword(&arginfo->ps_nenvstr, envc);
448
449	/*
450	 * Fill in environment portion of vector table.
451	 */
452	for (; envc > 0; --envc) {
453		suword(vectp++, (long)(intptr_t)destp);
454		while (*stringp++ != 0)
455			destp++;
456		destp++;
457	}
458
459	/* end of vector table is a null pointer */
460	suword(vectp, 0);
461	return (stack_base);
462}
463
464/*
465 * Reset registers to default values on exec.
466 */
467static void
468linux_exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
469{
470	struct trapframe *regs = td->td_frame;
471	struct pcb *pcb = td->td_pcb;
472
473	mtx_lock(&dt_lock);
474	if (td->td_proc->p_md.md_ldt != NULL)
475		user_ldt_free(td);
476	else
477		mtx_unlock(&dt_lock);
478
479	pcb->pcb_fsbase = 0;
480	pcb->pcb_gsbase = 0;
481	clear_pcb_flags(pcb, PCB_32BIT);
482	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
483	set_pcb_flags(pcb, PCB_FULL_IRET);
484
485	bzero((char *)regs, sizeof(struct trapframe));
486	regs->tf_rip = imgp->entry_addr;
487	regs->tf_rsp = stack;
488	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
489	regs->tf_ss = _udatasel;
490	regs->tf_cs = _ucodesel;
491	regs->tf_ds = _udatasel;
492	regs->tf_es = _udatasel;
493	regs->tf_fs = _ufssel;
494	regs->tf_gs = _ugssel;
495	regs->tf_flags = TF_HASSEGS;
496
497	/*
498	 * Reset the hardware debug registers if they were in use.
499	 * They won't have any meaning for the newly exec'd process.
500	 */
501	if (pcb->pcb_flags & PCB_DBREGS) {
502		pcb->pcb_dr0 = 0;
503		pcb->pcb_dr1 = 0;
504		pcb->pcb_dr2 = 0;
505		pcb->pcb_dr3 = 0;
506		pcb->pcb_dr6 = 0;
507		pcb->pcb_dr7 = 0;
508		if (pcb == curpcb) {
509			/*
510			 * Clear the debug registers on the running
511			 * CPU, otherwise they will end up affecting
512			 * the next process we switch to.
513			 */
514			reset_dbregs();
515		}
516		clear_pcb_flags(pcb, PCB_DBREGS);
517	}
518
519	/*
520	 * Drop the FP state if we hold it, so that the process gets a
521	 * clean FP state if it uses the FPU again.
522	 */
523	fpstate_drop(td);
524}
525
526/*
527 * Copied from amd64/amd64/machdep.c
528 *
529 * XXX fpu state need? don't think so
530 */
531int
532linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
533{
534	struct proc *p;
535	struct l_ucontext uc;
536	struct l_sigcontext *context;
537	struct trapframe *regs;
538	unsigned long rflags;
539	int error;
540	ksiginfo_t ksi;
541
542	regs = td->td_frame;
543	error = copyin((void *)regs->tf_rbx, &uc, sizeof(uc));
544	if (error != 0)
545		return (error);
546
547	p = td->td_proc;
548	context = &uc.uc_mcontext;
549	rflags = context->sc_rflags;
550
551	/*
552	 * Don't allow users to change privileged or reserved flags.
553	 */
554	/*
555	 * XXX do allow users to change the privileged flag PSL_RF.
556	 * The cpu sets PSL_RF in tf_rflags for faults.  Debuggers
557	 * should sometimes set it there too.  tf_rflags is kept in
558	 * the signal context during signal handling and there is no
559	 * other place to remember it, so the PSL_RF bit may be
560	 * corrupted by the signal handler without us knowing.
561	 * Corruption of the PSL_RF bit at worst causes one more or
562	 * one less debugger trap, so allowing it is fairly harmless.
563	 */
564
565#define RFLAG_SECURE(ef, oef)     ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
566	if (!RFLAG_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
567		printf("linux_rt_sigreturn: rflags = 0x%lx\n", rflags);
568		return (EINVAL);
569	}
570
571	/*
572	 * Don't allow users to load a valid privileged %cs.  Let the
573	 * hardware check for invalid selectors, excess privilege in
574	 * other selectors, invalid %eip's and invalid %esp's.
575	 */
576#define CS_SECURE(cs)           (ISPL(cs) == SEL_UPL)
577	if (!CS_SECURE(context->sc_cs)) {
578		printf("linux_rt_sigreturn: cs = 0x%x\n", context->sc_cs);
579		ksiginfo_init_trap(&ksi);
580		ksi.ksi_signo = SIGBUS;
581		ksi.ksi_code = BUS_OBJERR;
582		ksi.ksi_trapno = T_PROTFLT;
583		ksi.ksi_addr = (void *)regs->tf_rip;
584		trapsignal(td, &ksi);
585		return (EINVAL);
586	}
587
588	PROC_LOCK(p);
589	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
590	SIG_CANTMASK(td->td_sigmask);
591	signotify(td);
592	PROC_UNLOCK(p);
593
594	regs->tf_rdi    = context->sc_rdi;
595	regs->tf_rsi    = context->sc_rsi;
596	regs->tf_rdx    = context->sc_rdx;
597	regs->tf_rbp    = context->sc_rbp;
598	regs->tf_rbx    = context->sc_rbx;
599	regs->tf_rcx    = context->sc_rcx;
600	regs->tf_rax    = context->sc_rax;
601	regs->tf_rip    = context->sc_rip;
602	regs->tf_rsp    = context->sc_rsp;
603	regs->tf_r8     = context->sc_r8;
604	regs->tf_r9     = context->sc_r9;
605	regs->tf_r10    = context->sc_r10;
606	regs->tf_r11    = context->sc_r11;
607	regs->tf_r12    = context->sc_r12;
608	regs->tf_r13    = context->sc_r13;
609	regs->tf_r14    = context->sc_r14;
610	regs->tf_r15    = context->sc_r15;
611	regs->tf_cs     = context->sc_cs;
612	regs->tf_err    = context->sc_err;
613	regs->tf_rflags = rflags;
614
615	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
616	return (EJUSTRETURN);
617}
618
619/*
620 * copied from amd64/amd64/machdep.c
621 *
622 * Send an interrupt to process.
623 */
624static void
625linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
626{
627	struct l_rt_sigframe sf, *sfp;
628	struct proc *p;
629	struct thread *td;
630	struct sigacts *psp;
631	caddr_t sp;
632	struct trapframe *regs;
633	int sig, code;
634	int oonstack;
635
636	td = curthread;
637	p = td->td_proc;
638	PROC_LOCK_ASSERT(p, MA_OWNED);
639	sig = ksi->ksi_signo;
640	psp = p->p_sigacts;
641	code = ksi->ksi_code;
642	mtx_assert(&psp->ps_mtx, MA_OWNED);
643	regs = td->td_frame;
644	oonstack = sigonstack(regs->tf_rsp);
645
646	LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u",
647	    catcher, sig, mask, code);
648
649	/* Allocate space for the signal handler context. */
650	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
651	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
652		sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size -
653		    sizeof(struct l_rt_sigframe);
654	} else
655		sp = (caddr_t)regs->tf_rsp - sizeof(struct l_rt_sigframe) - 128;
656	/* Align to 16 bytes. */
657	sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul);
658	mtx_unlock(&psp->ps_mtx);
659
660	/* Translate the signal if appropriate. */
661	sig = BSD_TO_LINUX_SIGNAL(sig);
662
663	/* Save user context. */
664	bzero(&sf, sizeof(sf));
665	bsd_to_linux_sigset(mask, &sf.sf_sc.uc_sigmask);
666	bsd_to_linux_sigset(mask, &sf.sf_sc.uc_mcontext.sc_mask);
667
668	sf.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
669	sf.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
670	sf.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
671	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
672	PROC_UNLOCK(p);
673
674	sf.sf_sc.uc_mcontext.sc_rdi    = regs->tf_rdi;
675	sf.sf_sc.uc_mcontext.sc_rsi    = regs->tf_rsi;
676	sf.sf_sc.uc_mcontext.sc_rdx    = regs->tf_rdx;
677	sf.sf_sc.uc_mcontext.sc_rbp    = regs->tf_rbp;
678	sf.sf_sc.uc_mcontext.sc_rbx    = regs->tf_rbx;
679	sf.sf_sc.uc_mcontext.sc_rcx    = regs->tf_rcx;
680	sf.sf_sc.uc_mcontext.sc_rax    = regs->tf_rax;
681	sf.sf_sc.uc_mcontext.sc_rip    = regs->tf_rip;
682	sf.sf_sc.uc_mcontext.sc_rsp    = regs->tf_rsp;
683	sf.sf_sc.uc_mcontext.sc_r8     = regs->tf_r8;
684	sf.sf_sc.uc_mcontext.sc_r9     = regs->tf_r9;
685	sf.sf_sc.uc_mcontext.sc_r10    = regs->tf_r10;
686	sf.sf_sc.uc_mcontext.sc_r11    = regs->tf_r11;
687	sf.sf_sc.uc_mcontext.sc_r12    = regs->tf_r12;
688	sf.sf_sc.uc_mcontext.sc_r13    = regs->tf_r13;
689	sf.sf_sc.uc_mcontext.sc_r14    = regs->tf_r14;
690	sf.sf_sc.uc_mcontext.sc_r15    = regs->tf_r15;
691	sf.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
692	sf.sf_sc.uc_mcontext.sc_rflags = regs->tf_rflags;
693	sf.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
694	sf.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
695	sf.sf_sc.uc_mcontext.sc_cr2    = (register_t)ksi->ksi_addr;
696
697	/* Build the argument list for the signal handler. */
698	regs->tf_rdi = sig;			/* arg 1 in %rdi */
699	regs->tf_rax = 0;
700	regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
701	regs->tf_rdx = (register_t)&sfp->sf_sc;	/* arg 3 in %rdx */
702
703	sf.sf_handler = catcher;
704	/* Fill in POSIX parts */
705	ksiginfo_to_lsiginfo(ksi, &sf.sf_si, sig);
706
707	/*
708	 * Copy the sigframe out to the user's stack.
709	 */
710	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
711#ifdef DEBUG
712		printf("process %ld has trashed its stack\n", (long)p->p_pid);
713#endif
714		PROC_LOCK(p);
715		sigexit(td, SIGILL);
716	}
717
718	regs->tf_rsp = (long)sfp;
719	regs->tf_rip = linux_rt_sigcode;
720	regs->tf_rflags &= ~(PSL_T | PSL_D);
721	regs->tf_cs = _ucodesel;
722	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
723	PROC_LOCK(p);
724	mtx_lock(&psp->ps_mtx);
725}
726
727/*
728 * If a linux binary is exec'ing something, try this image activator
729 * first.  We override standard shell script execution in order to
730 * be able to modify the interpreter path.  We only do this if a linux
731 * binary is doing the exec, so we do not create an EXEC module for it.
732 */
733static int exec_linux_imgact_try(struct image_params *iparams);
734
735static int
736exec_linux_imgact_try(struct image_params *imgp)
737{
738	const char *head = (const char *)imgp->image_header;
739	char *rpath;
740	int error = -1, len;
741
742	/*
743	 * The interpreter for shell scripts run from a linux binary needs
744	 * to be located in /compat/linux if possible in order to recursively
745	 * maintain linux path emulation.
746	 */
747	if (((const short *)head)[0] == SHELLMAGIC) {
748		/*
749		 * Run our normal shell image activator.  If it succeeds
750		 * attempt to use the alternate path for the interpreter.
751		 * If an alternate path is found, use our stringspace
752		 * to store it.
753		 */
754		if ((error = exec_shell_imgact(imgp)) == 0) {
755			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
756			    imgp->interpreter_name, UIO_SYSSPACE,
757			    &rpath, 0, AT_FDCWD);
758			if (rpath != NULL) {
759				len = strlen(rpath) + 1;
760
761				if (len <= MAXSHELLCMDLEN)
762					memcpy(imgp->interpreter_name,
763					    rpath, len);
764				free(rpath, M_TEMP);
765			}
766		}
767	}
768	return(error);
769}
770
771struct sysentvec elf_linux_sysvec = {
772	.sv_size	= LINUX_SYS_MAXSYSCALL,
773	.sv_table	= linux_sysent,
774	.sv_mask	= 0,
775	.sv_sigsize	= LINUX_SIGTBLSZ,
776	.sv_sigtbl	= bsd_to_linux_signal,
777	.sv_errsize	= ELAST + 1,
778	.sv_errtbl	= bsd_to_linux_errno,
779	.sv_transtrap	= translate_traps,
780	.sv_fixup	= elf_linux_fixup,
781	.sv_sendsig	= linux_rt_sendsig,
782	.sv_sigcode	= &_binary_linux_locore_o_start,
783	.sv_szsigcode	= &linux_szsigcode,
784	.sv_prepsyscall	= NULL,
785	.sv_name	= "Linux ELF64",
786	.sv_coredump	= elf64_coredump,
787	.sv_imgact_try	= exec_linux_imgact_try,
788	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
789	.sv_pagesize	= PAGE_SIZE,
790	.sv_minuser	= VM_MIN_ADDRESS,
791	.sv_maxuser	= VM_MAXUSER_ADDRESS,
792	.sv_usrstack	= USRSTACK,
793	.sv_psstrings	= PS_STRINGS,
794	.sv_stackprot	= VM_PROT_ALL,
795	.sv_copyout_strings = linux_copyout_strings,
796	.sv_setregs	= linux_exec_setregs,
797	.sv_fixlimit	= NULL,
798	.sv_maxssiz	= NULL,
799	.sv_flags	= SV_ABI_LINUX | SV_LP64 | SV_SHP,
800	.sv_set_syscall_retval = linux_set_syscall_retval,
801	.sv_fetch_syscall_args = linux_fetch_syscall_args,
802	.sv_syscallnames = NULL,
803	.sv_shared_page_base = SHAREDPAGE,
804	.sv_shared_page_len = PAGE_SIZE,
805	.sv_schedtail	= linux_schedtail,
806	.sv_thread_detach = linux_thread_detach
807};
808
809static void
810linux_vdso_install(void *param)
811{
812
813	linux_szsigcode = (&_binary_linux_locore_o_end -
814	    &_binary_linux_locore_o_start);
815
816	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
817		panic("Linux invalid vdso size\n");
818
819	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
820
821	linux_shared_page_obj = __elfN(linux_shared_page_init)
822	    (&linux_shared_page_mapping);
823
824	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, SHAREDPAGE);
825
826	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
827	    linux_szsigcode);
828	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
829
830	linux_kplatform = linux_shared_page_mapping +
831	    (linux_platform - (caddr_t)SHAREDPAGE);
832}
833SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
834    (sysinit_cfunc_t)linux_vdso_install, NULL);
835
836static void
837linux_vdso_deinstall(void *param)
838{
839
840	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
841};
842SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
843    (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
844
845static char GNULINUX_ABI_VENDOR[] = "GNU";
846static int GNULINUX_ABI_DESC = 0;
847
848static boolean_t
849linux_trans_osrel(const Elf_Note *note, int32_t *osrel)
850{
851	const Elf32_Word *desc;
852	uintptr_t p;
853
854	p = (uintptr_t)(note + 1);
855	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
856
857	desc = (const Elf32_Word *)p;
858	if (desc[0] != GNULINUX_ABI_DESC)
859		return (FALSE);
860
861	/*
862	 * For linux we encode osrel as follows (see linux_mib.c):
863	 * VVVMMMIII (version, major, minor), see linux_mib.c.
864	 */
865	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
866
867	return (TRUE);
868}
869
870static Elf_Brandnote linux64_brandnote = {
871	.hdr.n_namesz	= sizeof(GNULINUX_ABI_VENDOR),
872	.hdr.n_descsz	= 16,
873	.hdr.n_type	= 1,
874	.vendor		= GNULINUX_ABI_VENDOR,
875	.flags		= BN_TRANSLATE_OSREL,
876	.trans_osrel	= linux_trans_osrel
877};
878
879static Elf64_Brandinfo linux_glibc2brand = {
880	.brand		= ELFOSABI_LINUX,
881	.machine	= EM_X86_64,
882	.compat_3_brand	= "Linux",
883	.emul_path	= "/compat/linux",
884	.interp_path	= "/lib64/ld-linux-x86-64.so.2",
885	.sysvec		= &elf_linux_sysvec,
886	.interp_newpath	= NULL,
887	.brand_note	= &linux64_brandnote,
888	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
889};
890
891static Elf64_Brandinfo linux_glibc2brandshort = {
892	.brand		= ELFOSABI_LINUX,
893	.machine	= EM_X86_64,
894	.compat_3_brand	= "Linux",
895	.emul_path	= "/compat/linux",
896	.interp_path	= "/lib64/ld-linux.so.2",
897	.sysvec		= &elf_linux_sysvec,
898	.interp_newpath	= NULL,
899	.brand_note	= &linux64_brandnote,
900	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
901};
902
903Elf64_Brandinfo *linux_brandlist[] = {
904	&linux_glibc2brand,
905	&linux_glibc2brandshort,
906	NULL
907};
908
909static int
910linux64_elf_modevent(module_t mod, int type, void *data)
911{
912	Elf64_Brandinfo **brandinfo;
913	int error;
914	struct linux_ioctl_handler **lihp;
915
916	error = 0;
917
918	switch(type) {
919	case MOD_LOAD:
920		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
921		     ++brandinfo)
922			if (elf64_insert_brand_entry(*brandinfo) < 0)
923				error = EINVAL;
924		if (error == 0) {
925			SET_FOREACH(lihp, linux_ioctl_handler_set)
926				linux_ioctl_register_handler(*lihp);
927			LIST_INIT(&futex_list);
928			mtx_init(&futex_mtx, "ftllk64", NULL, MTX_DEF);
929			stclohz = (stathz ? stathz : hz);
930			if (bootverbose)
931				printf("Linux x86-64 ELF exec handler installed\n");
932		} else
933			printf("cannot insert Linux x86-64 ELF brand handler\n");
934		break;
935	case MOD_UNLOAD:
936		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
937		     ++brandinfo)
938			if (elf64_brand_inuse(*brandinfo))
939				error = EBUSY;
940		if (error == 0) {
941			for (brandinfo = &linux_brandlist[0];
942			     *brandinfo != NULL; ++brandinfo)
943				if (elf64_remove_brand_entry(*brandinfo) < 0)
944					error = EINVAL;
945		}
946		if (error == 0) {
947			SET_FOREACH(lihp, linux_ioctl_handler_set)
948				linux_ioctl_unregister_handler(*lihp);
949			mtx_destroy(&futex_mtx);
950			if (bootverbose)
951				printf("Linux ELF exec handler removed\n");
952		} else
953			printf("Could not deinstall ELF interpreter entry\n");
954		break;
955	default:
956		return (EOPNOTSUPP);
957	}
958	return (error);
959}
960
961static moduledata_t linux64_elf_mod = {
962	"linux64elf",
963	linux64_elf_modevent,
964	0
965};
966
967DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
968MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1);
969