linux32_sysvec.c revision 293514
1/*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2003 Peter Wemm
4 * Copyright (c) 2002 Doug Rabson
5 * Copyright (c) 1998-1999 Andrew Gallatin
6 * Copyright (c) 1994-1996 Søren Schmidt
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer
14 *    in this position and unchanged.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. The name of the author may not be used to endorse or promote products
19 *    derived from this software without specific prior written permission
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: stable/10/sys/amd64/linux32/linux32_sysvec.c 293514 2016-01-09 15:44:38Z dchagin $");
35#include "opt_compat.h"
36
37#ifndef COMPAT_FREEBSD32
38#error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39#endif
40
41#define	__ELF_WORD_SIZE	32
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/exec.h>
46#include <sys/fcntl.h>
47#include <sys/imgact.h>
48#include <sys/imgact_elf.h>
49#include <sys/kernel.h>
50#include <sys/lock.h>
51#include <sys/malloc.h>
52#include <sys/module.h>
53#include <sys/mutex.h>
54#include <sys/proc.h>
55#include <sys/resourcevar.h>
56#include <sys/signalvar.h>
57#include <sys/sysctl.h>
58#include <sys/syscallsubr.h>
59#include <sys/sysent.h>
60#include <sys/sysproto.h>
61#include <sys/vnode.h>
62#include <sys/eventhandler.h>
63
64#include <vm/vm.h>
65#include <vm/pmap.h>
66#include <vm/vm_extern.h>
67#include <vm/vm_map.h>
68#include <vm/vm_object.h>
69#include <vm/vm_page.h>
70#include <vm/vm_param.h>
71
72#include <machine/cpu.h>
73#include <machine/md_var.h>
74#include <machine/pcb.h>
75#include <machine/specialreg.h>
76
77#include <amd64/linux32/linux.h>
78#include <amd64/linux32/linux32_proto.h>
79#include <compat/linux/linux_emul.h>
80#include <compat/linux/linux_futex.h>
81#include <compat/linux/linux_ioctl.h>
82#include <compat/linux/linux_mib.h>
83#include <compat/linux/linux_misc.h>
84#include <compat/linux/linux_signal.h>
85#include <compat/linux/linux_util.h>
86#include <compat/linux/linux_vdso.h>
87
88MODULE_VERSION(linux, 1);
89
90MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
91
/*
 * Store one 32-bit auxiliary-vector entry at the user address pos: the
 * id word first, then the val word.  pos must be a modifiable pointer
 * lvalue; it is left pointing just past the two words written.  The
 * results of suword32() are not checked.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)		\
	do {					\
		suword32((pos)++, (id));	\
		suword32((pos)++, (val));	\
	} while (0)
97
/*
 * The two bytes "#!" that open a script, as they appear when the start
 * of the image header is read as one host-endian 16-bit value.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define	SHELLMAGIC	0x2123	/* #! */
#else
#define	SHELLMAGIC	0x2321
#endif
103
104/*
105 * Allow the sendsig functions to use the ldebug() facility
106 * even though they are not syscalls themselves. Map them
107 * to syscall 0. This is slightly less bogus than using
108 * ldebug(sigreturn).
109 */
110#define	LINUX_SYS_linux_rt_sendsig	0
111#define	LINUX_SYS_linux_sendsig		0
112
113const char *linux_platform = "i686";
114static int linux_szplatform;
115static int linux_szsigcode;
116static vm_object_t linux_shared_page_obj;
117static char *linux_shared_page_mapping;
118extern char _binary_linux32_locore_o_start;
119extern char _binary_linux32_locore_o_end;
120
121extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
122
123SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
124SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
125
126static int	elf_linux_fixup(register_t **stack_base,
127		    struct image_params *iparams);
128static register_t *linux_copyout_strings(struct image_params *imgp);
129static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
130static void	exec_linux_setregs(struct thread *td,
131				   struct image_params *imgp, u_long stack);
132static void	linux32_fixlimit(struct rlimit *rl, int which);
133static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
134static void	linux_vdso_install(void *param);
135static void	linux_vdso_deinstall(void *param);
136
137static eventhandler_tag linux_exit_tag;
138static eventhandler_tag linux_exec_tag;
139static eventhandler_tag linux_thread_dtor_tag;
140
/*
 * Linux syscalls return negative errno values, FreeBSD uses positive
 * ones.  This table is indexed by the FreeBSD errno and holds the
 * corresponding Linux errno, already negated.
 *
 * The table must carry one initializer for every errno up to and
 * including ELAST: a slot left without an initializer is zero-filled,
 * which would turn that error into "success" for the Linux process.
 *
 * Reference:
 *   FreeBSD: src/sys/sys/errno.h
 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
 *            linux-2.6.17.8/include/asm-generic/errno.h
 */
static int bsd_to_linux_errno[ELAST + 1] = {
	/*  0 */  -0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
	/* 10 */ -10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
	/* 20 */ -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
	/* 30 */ -30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
	/* 40 */ -90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
	/* 50 */-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
	/* 60 */-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
	/* 70 */-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
	/* 80 */  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
	/* 90 */ -72, -67, -71,
	/*
	 * 93 ENOTCAPABLE and 94 ECAPMODE have no Linux counterpart;
	 * report them as EPERM.
	 */
	  -1,  -1,
	/* 95 ENOTRECOVERABLE, 96 EOWNERDEAD */
	-131,-130
};
160
161int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
162	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
163	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
164	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
165	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
166	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
167	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
168	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
169	0, LINUX_SIGUSR1, LINUX_SIGUSR2
170};
171
172int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
173	SIGHUP, SIGINT, SIGQUIT, SIGILL,
174	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
175	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
176	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
177	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
178	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
179	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
180	SIGIO, SIGURG, SIGSYS
181};
182
/*
 * Trap number translation.  _bsd_to_linux_trapcode[] is indexed by the
 * native trap code and holds the value reported to Linux programs;
 * LINUX_T_UNKNOWN marks native codes without a counterpart.
 */
#define	LINUX_T_UNKNOWN		255
static int _bsd_to_linux_trapcode[] = {
	LINUX_T_UNKNOWN,	/*  0 */
	6,			/*  1: T_PRIVINFLT */
	LINUX_T_UNKNOWN,	/*  2 */
	3,			/*  3: T_BPTFLT */
	LINUX_T_UNKNOWN,	/*  4 */
	LINUX_T_UNKNOWN,	/*  5 */
	16,			/*  6: T_ARITHTRAP */
	254,			/*  7: T_ASTFLT */
	LINUX_T_UNKNOWN,	/*  8 */
	13,			/*  9: T_PROTFLT */
	1,			/* 10: T_TRCTRAP */
	LINUX_T_UNKNOWN,	/* 11 */
	14,			/* 12: T_PAGEFLT */
	LINUX_T_UNKNOWN,	/* 13 */
	17,			/* 14: T_ALIGNFLT */
	LINUX_T_UNKNOWN,	/* 15 */
	LINUX_T_UNKNOWN,	/* 16 */
	LINUX_T_UNKNOWN,	/* 17 */
	0,			/* 18: T_DIVIDE */
	2,			/* 19: T_NMI */
	4,			/* 20: T_OFLOW */
	5,			/* 21: T_BOUND */
	7,			/* 22: T_DNA */
	8,			/* 23: T_DOUBLEFLT */
	9,			/* 24: T_FPOPFLT */
	10,			/* 25: T_TSSFLT */
	11,			/* 26: T_SEGNPFLT */
	12,			/* 27: T_STKFLT */
	18,			/* 28: T_MCHK */
	19,			/* 29: T_XMMFLT */
	15			/* 30: T_RESERVED */
};

/*
 * Bounds-checked lookup.  The comparison is made against an unsigned
 * element count, so a negative int argument converts to a huge value
 * and yields LINUX_T_UNKNOWN just like any other out-of-range code.
 * The argument is evaluated more than once.
 */
#define	bsd_to_linux_trapcode(code)					\
	((code) < sizeof(_bsd_to_linux_trapcode) /			\
	    sizeof(_bsd_to_linux_trapcode[0]) ?				\
	    _bsd_to_linux_trapcode[(code)] : LINUX_T_UNKNOWN)
221
222struct linux32_ps_strings {
223	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
224	u_int ps_nargvstr;	/* the number of argument strings */
225	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
226	u_int ps_nenvstr;	/* the number of environment strings */
227};
228
229LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
230LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
231LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
232
233/*
234 * If FreeBSD & Linux have a difference of opinion about what a trap
235 * means, deal with it here.
236 *
237 * MPSAFE
238 */
239static int
240translate_traps(int signal, int trap_code)
241{
242	if (signal != SIGBUS)
243		return signal;
244	switch (trap_code) {
245	case T_PROTFLT:
246	case T_TSSFLT:
247	case T_DOUBLEFLT:
248	case T_PAGEFLT:
249		return SIGSEGV;
250	default:
251		return signal;
252	}
253}
254
255static int
256elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
257{
258	Elf32_Auxargs *args;
259	Elf32_Addr *base;
260	Elf32_Addr *pos, *uplatform;
261	struct linux32_ps_strings *arginfo;
262
263	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
264	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform);
265
266	KASSERT(curthread->td_proc == imgp->proc,
267	    ("unsafe elf_linux_fixup(), should be curproc"));
268	base = (Elf32_Addr *)*stack_base;
269	args = (Elf32_Auxargs *)imgp->auxargs;
270	pos = base + (imgp->args->argc + imgp->args->envc + 2);
271
272	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
273	    imgp->proc->p_sysent->sv_shared_page_base);
274	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
275	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
276
277	/*
278	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
279	 * as it has appeared in the 2.4.0-rc7 first time.
280	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
281	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
282	 * is not present.
283	 * Also see linux_times() implementation.
284	 */
285	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
286		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
287	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
288	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
289	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
290	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
291	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
292	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
293	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
294	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
295	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
296	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
297	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
298	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
299	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
300	if (args->execfd != -1)
301		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
302	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
303
304	free(imgp->auxargs, M_TEMP);
305	imgp->auxargs = NULL;
306
307	base--;
308	suword32(base, (uint32_t)imgp->args->argc);
309	*stack_base = (register_t *)base;
310	return (0);
311}
312
313static void
314linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
315{
316	struct thread *td = curthread;
317	struct proc *p = td->td_proc;
318	struct sigacts *psp;
319	struct trapframe *regs;
320	struct l_rt_sigframe *fp, frame;
321	int oonstack;
322	int sig;
323	int code;
324
325	sig = ksi->ksi_signo;
326	code = ksi->ksi_code;
327	PROC_LOCK_ASSERT(p, MA_OWNED);
328	psp = p->p_sigacts;
329	mtx_assert(&psp->ps_mtx, MA_OWNED);
330	regs = td->td_frame;
331	oonstack = sigonstack(regs->tf_rsp);
332
333#ifdef DEBUG
334	if (ldebug(rt_sendsig))
335		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
336		    catcher, sig, (void*)mask, code);
337#endif
338	/*
339	 * Allocate space for the signal handler context.
340	 */
341	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
342	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
343		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
344		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
345	} else
346		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
347	mtx_unlock(&psp->ps_mtx);
348
349	/*
350	 * Build the argument list for the signal handler.
351	 */
352	if (p->p_sysent->sv_sigtbl)
353		if (sig <= p->p_sysent->sv_sigsize)
354			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
355
356	bzero(&frame, sizeof(frame));
357
358	frame.sf_handler = PTROUT(catcher);
359	frame.sf_sig = sig;
360	frame.sf_siginfo = PTROUT(&fp->sf_si);
361	frame.sf_ucontext = PTROUT(&fp->sf_sc);
362
363	/* Fill in POSIX parts */
364	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
365
366	/*
367	 * Build the signal context to be used by sigreturn
368	 * and libgcc unwind.
369	 */
370	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
371	frame.sf_sc.uc_link = 0;		/* XXX ??? */
372
373	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
374	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
375	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
376	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
377	PROC_UNLOCK(p);
378
379	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
380
381	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
382	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
383	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
384	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
385	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
386	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_rsp;
387	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
388	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
389	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
390	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
391	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
392	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
393	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
394	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
395	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
396	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
397	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
398	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
399	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
400	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
401	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
402
403#ifdef DEBUG
404	if (ldebug(rt_sendsig))
405		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
406		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
407		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
408#endif
409
410	if (copyout(&frame, fp, sizeof(frame)) != 0) {
411		/*
412		 * Process has trashed its stack; give it an illegal
413		 * instruction to halt it in its tracks.
414		 */
415#ifdef DEBUG
416		if (ldebug(rt_sendsig))
417			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
418			    fp, oonstack);
419#endif
420		PROC_LOCK(p);
421		sigexit(td, SIGILL);
422	}
423
424	/*
425	 * Build context to run handler in.
426	 */
427	regs->tf_rsp = PTROUT(fp);
428	regs->tf_rip = linux32_rt_sigcode;
429	regs->tf_rflags &= ~(PSL_T | PSL_D);
430	regs->tf_cs = _ucode32sel;
431	regs->tf_ss = _udatasel;
432	regs->tf_ds = _udatasel;
433	regs->tf_es = _udatasel;
434	regs->tf_fs = _ufssel;
435	regs->tf_gs = _ugssel;
436	regs->tf_flags = TF_HASSEGS;
437	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
438	PROC_LOCK(p);
439	mtx_lock(&psp->ps_mtx);
440}
441
442
443/*
444 * Send an interrupt to process.
445 *
446 * Stack is set up to allow sigcode stored
447 * in u. to call routine, followed by kcall
448 * to sigreturn routine below.  After sigreturn
449 * resets the signal mask, the stack, and the
450 * frame pointer, it returns to the user
451 * specified pc, psl.
452 */
453static void
454linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
455{
456	struct thread *td = curthread;
457	struct proc *p = td->td_proc;
458	struct sigacts *psp;
459	struct trapframe *regs;
460	struct l_sigframe *fp, frame;
461	l_sigset_t lmask;
462	int oonstack, i;
463	int sig, code;
464
465	sig = ksi->ksi_signo;
466	code = ksi->ksi_code;
467	PROC_LOCK_ASSERT(p, MA_OWNED);
468	psp = p->p_sigacts;
469	mtx_assert(&psp->ps_mtx, MA_OWNED);
470	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
471		/* Signal handler installed with SA_SIGINFO. */
472		linux_rt_sendsig(catcher, ksi, mask);
473		return;
474	}
475
476	regs = td->td_frame;
477	oonstack = sigonstack(regs->tf_rsp);
478
479#ifdef DEBUG
480	if (ldebug(sendsig))
481		printf(ARGS(sendsig, "%p, %d, %p, %u"),
482		    catcher, sig, (void*)mask, code);
483#endif
484
485	/*
486	 * Allocate space for the signal handler context.
487	 */
488	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
489	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
490		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
491		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
492	} else
493		fp = (struct l_sigframe *)regs->tf_rsp - 1;
494	mtx_unlock(&psp->ps_mtx);
495	PROC_UNLOCK(p);
496
497	/*
498	 * Build the argument list for the signal handler.
499	 */
500	if (p->p_sysent->sv_sigtbl)
501		if (sig <= p->p_sysent->sv_sigsize)
502			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
503
504	bzero(&frame, sizeof(frame));
505
506	frame.sf_handler = PTROUT(catcher);
507	frame.sf_sig = sig;
508
509	bsd_to_linux_sigset(mask, &lmask);
510
511	/*
512	 * Build the signal context to be used by sigreturn.
513	 */
514	frame.sf_sc.sc_mask   = lmask.__bits[0];
515	frame.sf_sc.sc_gs     = regs->tf_gs;
516	frame.sf_sc.sc_fs     = regs->tf_fs;
517	frame.sf_sc.sc_es     = regs->tf_es;
518	frame.sf_sc.sc_ds     = regs->tf_ds;
519	frame.sf_sc.sc_edi    = regs->tf_rdi;
520	frame.sf_sc.sc_esi    = regs->tf_rsi;
521	frame.sf_sc.sc_ebp    = regs->tf_rbp;
522	frame.sf_sc.sc_ebx    = regs->tf_rbx;
523	frame.sf_sc.sc_esp    = regs->tf_rsp;
524	frame.sf_sc.sc_edx    = regs->tf_rdx;
525	frame.sf_sc.sc_ecx    = regs->tf_rcx;
526	frame.sf_sc.sc_eax    = regs->tf_rax;
527	frame.sf_sc.sc_eip    = regs->tf_rip;
528	frame.sf_sc.sc_cs     = regs->tf_cs;
529	frame.sf_sc.sc_eflags = regs->tf_rflags;
530	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
531	frame.sf_sc.sc_ss     = regs->tf_ss;
532	frame.sf_sc.sc_err    = regs->tf_err;
533	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
534	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
535
536	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
537		frame.sf_extramask[i] = lmask.__bits[i+1];
538
539	if (copyout(&frame, fp, sizeof(frame)) != 0) {
540		/*
541		 * Process has trashed its stack; give it an illegal
542		 * instruction to halt it in its tracks.
543		 */
544		PROC_LOCK(p);
545		sigexit(td, SIGILL);
546	}
547
548	/*
549	 * Build context to run handler in.
550	 */
551	regs->tf_rsp = PTROUT(fp);
552	regs->tf_rip = linux32_sigcode;
553	regs->tf_rflags &= ~(PSL_T | PSL_D);
554	regs->tf_cs = _ucode32sel;
555	regs->tf_ss = _udatasel;
556	regs->tf_ds = _udatasel;
557	regs->tf_es = _udatasel;
558	regs->tf_fs = _ufssel;
559	regs->tf_gs = _ugssel;
560	regs->tf_flags = TF_HASSEGS;
561	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
562	PROC_LOCK(p);
563	mtx_lock(&psp->ps_mtx);
564}
565
566/*
567 * System call to cleanup state after a signal
568 * has been taken.  Reset signal mask and
569 * stack state from context left by sendsig (above).
570 * Return to previous pc and psl as specified by
571 * context left by sendsig. Check carefully to
572 * make sure that the user has not modified the
573 * psl to gain improper privileges or to cause
574 * a machine fault.
575 */
576int
577linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
578{
579	struct l_sigframe frame;
580	struct trapframe *regs;
581	sigset_t bmask;
582	l_sigset_t lmask;
583	int eflags, i;
584	ksiginfo_t ksi;
585
586	regs = td->td_frame;
587
588#ifdef DEBUG
589	if (ldebug(sigreturn))
590		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
591#endif
592	/*
593	 * The trampoline code hands us the sigframe.
594	 * It is unsafe to keep track of it ourselves, in the event that a
595	 * program jumps out of a signal handler.
596	 */
597	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
598		return (EFAULT);
599
600	/*
601	 * Check for security violations.
602	 */
603#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
604	eflags = frame.sf_sc.sc_eflags;
605	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
606		return(EINVAL);
607
608	/*
609	 * Don't allow users to load a valid privileged %cs.  Let the
610	 * hardware check for invalid selectors, excess privilege in
611	 * other selectors, invalid %eip's and invalid %esp's.
612	 */
613#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
614	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
615		ksiginfo_init_trap(&ksi);
616		ksi.ksi_signo = SIGBUS;
617		ksi.ksi_code = BUS_OBJERR;
618		ksi.ksi_trapno = T_PROTFLT;
619		ksi.ksi_addr = (void *)regs->tf_rip;
620		trapsignal(td, &ksi);
621		return(EINVAL);
622	}
623
624	lmask.__bits[0] = frame.sf_sc.sc_mask;
625	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
626		lmask.__bits[i+1] = frame.sf_extramask[i];
627	linux_to_bsd_sigset(&lmask, &bmask);
628	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
629
630	/*
631	 * Restore signal context.
632	 */
633	regs->tf_rdi    = frame.sf_sc.sc_edi;
634	regs->tf_rsi    = frame.sf_sc.sc_esi;
635	regs->tf_rbp    = frame.sf_sc.sc_ebp;
636	regs->tf_rbx    = frame.sf_sc.sc_ebx;
637	regs->tf_rdx    = frame.sf_sc.sc_edx;
638	regs->tf_rcx    = frame.sf_sc.sc_ecx;
639	regs->tf_rax    = frame.sf_sc.sc_eax;
640	regs->tf_rip    = frame.sf_sc.sc_eip;
641	regs->tf_cs     = frame.sf_sc.sc_cs;
642	regs->tf_ds     = frame.sf_sc.sc_ds;
643	regs->tf_es     = frame.sf_sc.sc_es;
644	regs->tf_fs     = frame.sf_sc.sc_fs;
645	regs->tf_gs     = frame.sf_sc.sc_gs;
646	regs->tf_rflags = eflags;
647	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
648	regs->tf_ss     = frame.sf_sc.sc_ss;
649	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
650
651	return (EJUSTRETURN);
652}
653
654/*
655 * System call to cleanup state after a signal
656 * has been taken.  Reset signal mask and
657 * stack state from context left by rt_sendsig (above).
658 * Return to previous pc and psl as specified by
659 * context left by sendsig. Check carefully to
660 * make sure that the user has not modified the
661 * psl to gain improper privileges or to cause
662 * a machine fault.
663 */
664int
665linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
666{
667	struct l_ucontext uc;
668	struct l_sigcontext *context;
669	sigset_t bmask;
670	l_stack_t *lss;
671	stack_t ss;
672	struct trapframe *regs;
673	int eflags;
674	ksiginfo_t ksi;
675
676	regs = td->td_frame;
677
678#ifdef DEBUG
679	if (ldebug(rt_sigreturn))
680		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
681#endif
682	/*
683	 * The trampoline code hands us the ucontext.
684	 * It is unsafe to keep track of it ourselves, in the event that a
685	 * program jumps out of a signal handler.
686	 */
687	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
688		return (EFAULT);
689
690	context = &uc.uc_mcontext;
691
692	/*
693	 * Check for security violations.
694	 */
695#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
696	eflags = context->sc_eflags;
697	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
698		return(EINVAL);
699
700	/*
701	 * Don't allow users to load a valid privileged %cs.  Let the
702	 * hardware check for invalid selectors, excess privilege in
703	 * other selectors, invalid %eip's and invalid %esp's.
704	 */
705#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
706	if (!CS_SECURE(context->sc_cs)) {
707		ksiginfo_init_trap(&ksi);
708		ksi.ksi_signo = SIGBUS;
709		ksi.ksi_code = BUS_OBJERR;
710		ksi.ksi_trapno = T_PROTFLT;
711		ksi.ksi_addr = (void *)regs->tf_rip;
712		trapsignal(td, &ksi);
713		return(EINVAL);
714	}
715
716	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
717	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
718
719	/*
720	 * Restore signal context
721	 */
722	regs->tf_gs	= context->sc_gs;
723	regs->tf_fs	= context->sc_fs;
724	regs->tf_es	= context->sc_es;
725	regs->tf_ds	= context->sc_ds;
726	regs->tf_rdi    = context->sc_edi;
727	regs->tf_rsi    = context->sc_esi;
728	regs->tf_rbp    = context->sc_ebp;
729	regs->tf_rbx    = context->sc_ebx;
730	regs->tf_rdx    = context->sc_edx;
731	regs->tf_rcx    = context->sc_ecx;
732	regs->tf_rax    = context->sc_eax;
733	regs->tf_rip    = context->sc_eip;
734	regs->tf_cs     = context->sc_cs;
735	regs->tf_rflags = eflags;
736	regs->tf_rsp    = context->sc_esp_at_signal;
737	regs->tf_ss     = context->sc_ss;
738	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
739
740	/*
741	 * call sigaltstack & ignore results..
742	 */
743	lss = &uc.uc_stack;
744	ss.ss_sp = PTRIN(lss->ss_sp);
745	ss.ss_size = lss->ss_size;
746	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
747
748#ifdef DEBUG
749	if (ldebug(rt_sigreturn))
750		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
751		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
752#endif
753	(void)kern_sigaltstack(td, &ss, NULL);
754
755	return (EJUSTRETURN);
756}
757
758static int
759linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
760{
761	struct proc *p;
762	struct trapframe *frame;
763
764	p = td->td_proc;
765	frame = td->td_frame;
766
767	sa->args[0] = frame->tf_rbx;
768	sa->args[1] = frame->tf_rcx;
769	sa->args[2] = frame->tf_rdx;
770	sa->args[3] = frame->tf_rsi;
771	sa->args[4] = frame->tf_rdi;
772	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
773	sa->code = frame->tf_rax;
774
775	if (sa->code >= p->p_sysent->sv_size)
776		sa->callp = &p->p_sysent->sv_table[0];
777	else
778		sa->callp = &p->p_sysent->sv_table[sa->code];
779	sa->narg = sa->callp->sy_narg;
780
781	td->td_retval[0] = 0;
782	td->td_retval[1] = frame->tf_rdx;
783
784	return (0);
785}
786
787/*
788 * If a linux binary is exec'ing something, try this image activator
789 * first.  We override standard shell script execution in order to
790 * be able to modify the interpreter path.  We only do this if a linux
791 * binary is doing the exec, so we do not create an EXEC module for it.
792 */
793static int	exec_linux_imgact_try(struct image_params *iparams);
794
795static int
796exec_linux_imgact_try(struct image_params *imgp)
797{
798	const char *head = (const char *)imgp->image_header;
799	char *rpath;
800	int error = -1;
801
802	/*
803	* The interpreter for shell scripts run from a linux binary needs
804	* to be located in /compat/linux if possible in order to recursively
805	* maintain linux path emulation.
806	*/
807	if (((const short *)head)[0] == SHELLMAGIC) {
808		/*
809		* Run our normal shell image activator.  If it succeeds attempt
810		* to use the alternate path for the interpreter.  If an
811		* alternate * path is found, use our stringspace to store it.
812		*/
813		if ((error = exec_shell_imgact(imgp)) == 0) {
814			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
815			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
816			    AT_FDCWD);
817			if (rpath != NULL)
818				imgp->args->fname_buf =
819				    imgp->interpreter_name = rpath;
820		}
821	}
822	return (error);
823}
824
825/*
826 * Clear registers on exec
827 * XXX copied from ia32_signal.c.
828 */
829static void
830exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
831{
832	struct trapframe *regs = td->td_frame;
833	struct pcb *pcb = td->td_pcb;
834
835	mtx_lock(&dt_lock);
836	if (td->td_proc->p_md.md_ldt != NULL)
837		user_ldt_free(td);
838	else
839		mtx_unlock(&dt_lock);
840
841	critical_enter();
842	wrmsr(MSR_FSBASE, 0);
843	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
844	pcb->pcb_fsbase = 0;
845	pcb->pcb_gsbase = 0;
846	critical_exit();
847	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
848
849	bzero((char *)regs, sizeof(struct trapframe));
850	regs->tf_rip = imgp->entry_addr;
851	regs->tf_rsp = stack;
852	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
853	regs->tf_gs = _ugssel;
854	regs->tf_fs = _ufssel;
855	regs->tf_es = _udatasel;
856	regs->tf_ds = _udatasel;
857	regs->tf_ss = _udatasel;
858	regs->tf_flags = TF_HASSEGS;
859	regs->tf_cs = _ucode32sel;
860	regs->tf_rbx = imgp->ps_strings;
861
862	fpstate_drop(td);
863
864	/* Do full restore on return so that we can change to a different %cs */
865	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
866	td->td_retval[1] = 0;
867}
868
869/*
870 * XXX copied from ia32_sysvec.c.
871 */
872static register_t *
873linux_copyout_strings(struct image_params *imgp)
874{
875	int argc, envc;
876	u_int32_t *vectp;
877	char *stringp, *destp;
878	u_int32_t *stack_base;
879	struct linux32_ps_strings *arginfo;
880
881	/*
882	 * Calculate string base and vector table pointers.
883	 * Also deal with signal trampoline code for this exec type.
884	 */
885	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
886	destp =	(caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform -
887	    roundup((ARG_MAX - imgp->args->stringspace),
888	    sizeof(char *));
889
890	/*
891	 * Install LINUX_PLATFORM
892	 */
893	copyout(linux_platform, ((caddr_t)arginfo - linux_szplatform),
894	    linux_szplatform);
895
896	/*
897	 * If we have a valid auxargs ptr, prepare some room
898	 * on the stack.
899	 */
900	if (imgp->auxargs) {
901		/*
902		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
903		 * lower compatibility.
904		 */
905		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
906		    (LINUX_AT_COUNT * 2);
907		/*
908		 * The '+ 2' is for the null pointers at the end of each of
909		 * the arg and env vector sets,and imgp->auxarg_size is room
910		 * for argument of Runtime loader.
911		 */
912		vectp = (u_int32_t *) (destp - (imgp->args->argc +
913		    imgp->args->envc + 2 + imgp->auxarg_size) *
914		    sizeof(u_int32_t));
915
916	} else
917		/*
918		 * The '+ 2' is for the null pointers at the end of each of
919		 * the arg and env vector sets
920		 */
921		vectp = (u_int32_t *)(destp - (imgp->args->argc +
922		    imgp->args->envc + 2) * sizeof(u_int32_t));
923
924	/*
925	 * vectp also becomes our initial stack base
926	 */
927	stack_base = vectp;
928
929	stringp = imgp->args->begin_argv;
930	argc = imgp->args->argc;
931	envc = imgp->args->envc;
932	/*
933	 * Copy out strings - arguments and environment.
934	 */
935	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
936
937	/*
938	 * Fill in "ps_strings" struct for ps, w, etc.
939	 */
940	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
941	suword32(&arginfo->ps_nargvstr, argc);
942
943	/*
944	 * Fill in argument portion of vector table.
945	 */
946	for (; argc > 0; --argc) {
947		suword32(vectp++, (uint32_t)(intptr_t)destp);
948		while (*stringp++ != 0)
949			destp++;
950		destp++;
951	}
952
953	/* a null vector table pointer separates the argp's from the envp's */
954	suword32(vectp++, 0);
955
956	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
957	suword32(&arginfo->ps_nenvstr, envc);
958
959	/*
960	 * Fill in environment portion of vector table.
961	 */
962	for (; envc > 0; --envc) {
963		suword32(vectp++, (uint32_t)(intptr_t)destp);
964		while (*stringp++ != 0)
965			destp++;
966		destp++;
967	}
968
969	/* end of vector table is a null pointer */
970	suword32(vectp, 0);
971
972	return ((register_t *)stack_base);
973}
974
975static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
976    "32-bit Linux emulation");
977
978static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
979SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
980    &linux32_maxdsiz, 0, "");
981static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
982SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
983    &linux32_maxssiz, 0, "");
984static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
985SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
986    &linux32_maxvmem, 0, "");
987
988static void
989linux32_fixlimit(struct rlimit *rl, int which)
990{
991
992	switch (which) {
993	case RLIMIT_DATA:
994		if (linux32_maxdsiz != 0) {
995			if (rl->rlim_cur > linux32_maxdsiz)
996				rl->rlim_cur = linux32_maxdsiz;
997			if (rl->rlim_max > linux32_maxdsiz)
998				rl->rlim_max = linux32_maxdsiz;
999		}
1000		break;
1001	case RLIMIT_STACK:
1002		if (linux32_maxssiz != 0) {
1003			if (rl->rlim_cur > linux32_maxssiz)
1004				rl->rlim_cur = linux32_maxssiz;
1005			if (rl->rlim_max > linux32_maxssiz)
1006				rl->rlim_max = linux32_maxssiz;
1007		}
1008		break;
1009	case RLIMIT_VMEM:
1010		if (linux32_maxvmem != 0) {
1011			if (rl->rlim_cur > linux32_maxvmem)
1012				rl->rlim_cur = linux32_maxvmem;
1013			if (rl->rlim_max > linux32_maxvmem)
1014				rl->rlim_max = linux32_maxvmem;
1015		}
1016		break;
1017	}
1018}
1019
1020struct sysentvec elf_linux_sysvec = {
1021	.sv_size	= LINUX_SYS_MAXSYSCALL,
1022	.sv_table	= linux_sysent,
1023	.sv_mask	= 0,
1024	.sv_sigsize	= LINUX_SIGTBLSZ,
1025	.sv_sigtbl	= bsd_to_linux_signal,
1026	.sv_errsize	= ELAST + 1,
1027	.sv_errtbl	= bsd_to_linux_errno,
1028	.sv_transtrap	= translate_traps,
1029	.sv_fixup	= elf_linux_fixup,
1030	.sv_sendsig	= linux_sendsig,
1031	.sv_sigcode	= &_binary_linux32_locore_o_start,
1032	.sv_szsigcode	= &linux_szsigcode,
1033	.sv_prepsyscall	= NULL,
1034	.sv_name	= "Linux ELF32",
1035	.sv_coredump	= elf32_coredump,
1036	.sv_imgact_try	= exec_linux_imgact_try,
1037	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1038	.sv_pagesize	= PAGE_SIZE,
1039	.sv_minuser	= VM_MIN_ADDRESS,
1040	.sv_maxuser	= LINUX32_MAXUSER,
1041	.sv_usrstack	= LINUX32_USRSTACK,
1042	.sv_psstrings	= LINUX32_PS_STRINGS,
1043	.sv_stackprot	= VM_PROT_ALL,
1044	.sv_copyout_strings = linux_copyout_strings,
1045	.sv_setregs	= exec_linux_setregs,
1046	.sv_fixlimit	= linux32_fixlimit,
1047	.sv_maxssiz	= &linux32_maxssiz,
1048	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
1049	.sv_set_syscall_retval = cpu_set_syscall_retval,
1050	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1051	.sv_syscallnames = NULL,
1052	.sv_shared_page_base = LINUX32_SHAREDPAGE,
1053	.sv_shared_page_len = PAGE_SIZE,
1054	.sv_schedtail	= linux_schedtail,
1055	.sv_thread_detach = linux_thread_detach,
1056};
1057
1058static void
1059linux_vdso_install(void *param)
1060{
1061
1062	linux_szsigcode = (&_binary_linux32_locore_o_end -
1063	    &_binary_linux32_locore_o_start);
1064
1065	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1066		panic("Linux invalid vdso size\n");
1067
1068	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1069
1070	linux_shared_page_obj = __elfN(linux_shared_page_init)
1071	    (&linux_shared_page_mapping);
1072
1073	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE);
1074
1075	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1076	    linux_szsigcode);
1077	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1078}
1079SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1080    (sysinit_cfunc_t)linux_vdso_install, NULL);
1081
1082static void
1083linux_vdso_deinstall(void *param)
1084{
1085
1086	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1087};
1088SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1089    (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1090
1091static char GNU_ABI_VENDOR[] = "GNU";
1092static int GNULINUX_ABI_DESC = 0;
1093
1094static boolean_t
1095linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1096{
1097	const Elf32_Word *desc;
1098	uintptr_t p;
1099
1100	p = (uintptr_t)(note + 1);
1101	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1102
1103	desc = (const Elf32_Word *)p;
1104	if (desc[0] != GNULINUX_ABI_DESC)
1105		return (FALSE);
1106
1107	/*
1108	 * For linux we encode osrel as follows (see linux_mib.c):
1109	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1110	 */
1111	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1112
1113	return (TRUE);
1114}
1115
1116static Elf_Brandnote linux32_brandnote = {
1117	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1118	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1119	.hdr.n_type	= 1,
1120	.vendor		= GNU_ABI_VENDOR,
1121	.flags		= BN_TRANSLATE_OSREL,
1122	.trans_osrel	= linux32_trans_osrel
1123};
1124
1125static Elf32_Brandinfo linux_brand = {
1126	.brand		= ELFOSABI_LINUX,
1127	.machine	= EM_386,
1128	.compat_3_brand	= "Linux",
1129	.emul_path	= "/compat/linux",
1130	.interp_path	= "/lib/ld-linux.so.1",
1131	.sysvec		= &elf_linux_sysvec,
1132	.interp_newpath	= NULL,
1133	.brand_note	= &linux32_brandnote,
1134	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1135};
1136
1137static Elf32_Brandinfo linux_glibc2brand = {
1138	.brand		= ELFOSABI_LINUX,
1139	.machine	= EM_386,
1140	.compat_3_brand	= "Linux",
1141	.emul_path	= "/compat/linux",
1142	.interp_path	= "/lib/ld-linux.so.2",
1143	.sysvec		= &elf_linux_sysvec,
1144	.interp_newpath	= NULL,
1145	.brand_note	= &linux32_brandnote,
1146	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1147};
1148
1149Elf32_Brandinfo *linux_brandlist[] = {
1150	&linux_brand,
1151	&linux_glibc2brand,
1152	NULL
1153};
1154
1155static int
1156linux_elf_modevent(module_t mod, int type, void *data)
1157{
1158	Elf32_Brandinfo **brandinfo;
1159	int error;
1160	struct linux_ioctl_handler **lihp;
1161	struct linux_device_handler **ldhp;
1162
1163	error = 0;
1164
1165	switch(type) {
1166	case MOD_LOAD:
1167		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1168		     ++brandinfo)
1169			if (elf32_insert_brand_entry(*brandinfo) < 0)
1170				error = EINVAL;
1171		if (error == 0) {
1172			SET_FOREACH(lihp, linux_ioctl_handler_set)
1173				linux_ioctl_register_handler(*lihp);
1174			SET_FOREACH(ldhp, linux_device_handler_set)
1175				linux_device_register_handler(*ldhp);
1176			LIST_INIT(&futex_list);
1177			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1178			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1179			    linux_proc_exit, NULL, 1000);
1180			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1181			    linux_proc_exec, NULL, 1000);
1182			linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
1183			    linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
1184			linux_szplatform = roundup(strlen(linux_platform) + 1,
1185			    sizeof(char *));
1186			linux_osd_jail_register();
1187			stclohz = (stathz ? stathz : hz);
1188			if (bootverbose)
1189				printf("Linux ELF exec handler installed\n");
1190		} else
1191			printf("cannot insert Linux ELF brand handler\n");
1192		break;
1193	case MOD_UNLOAD:
1194		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1195		     ++brandinfo)
1196			if (elf32_brand_inuse(*brandinfo))
1197				error = EBUSY;
1198		if (error == 0) {
1199			for (brandinfo = &linux_brandlist[0];
1200			     *brandinfo != NULL; ++brandinfo)
1201				if (elf32_remove_brand_entry(*brandinfo) < 0)
1202					error = EINVAL;
1203		}
1204		if (error == 0) {
1205			SET_FOREACH(lihp, linux_ioctl_handler_set)
1206				linux_ioctl_unregister_handler(*lihp);
1207			SET_FOREACH(ldhp, linux_device_handler_set)
1208				linux_device_unregister_handler(*ldhp);
1209			mtx_destroy(&futex_mtx);
1210			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1211			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1212			EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag);
1213			linux_osd_jail_deregister();
1214			if (bootverbose)
1215				printf("Linux ELF exec handler removed\n");
1216		} else
1217			printf("Could not deinstall ELF interpreter entry\n");
1218		break;
1219	default:
1220		return (EOPNOTSUPP);
1221	}
1222	return (error);
1223}
1224
1225static moduledata_t linux_elf_mod = {
1226	"linuxelf",
1227	linux_elf_modevent,
1228	0
1229};
1230
1231DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1232