linux32_sysvec.c revision 293535
1/*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2003 Peter Wemm
4 * Copyright (c) 2002 Doug Rabson
5 * Copyright (c) 1998-1999 Andrew Gallatin
6 * Copyright (c) 1994-1996 Søren Schmidt
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer
14 *    in this position and unchanged.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. The name of the author may not be used to endorse or promote products
19 *    derived from this software without specific prior written permission
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33#include <sys/cdefs.h>
34__FBSDID("$FreeBSD: stable/10/sys/amd64/linux32/linux32_sysvec.c 293535 2016-01-09 16:24:30Z dchagin $");
35#include "opt_compat.h"
36
37#ifndef COMPAT_FREEBSD32
38#error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
39#endif
40
41#define	__ELF_WORD_SIZE	32
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/exec.h>
46#include <sys/fcntl.h>
47#include <sys/imgact.h>
48#include <sys/imgact_elf.h>
49#include <sys/kernel.h>
50#include <sys/lock.h>
51#include <sys/malloc.h>
52#include <sys/module.h>
53#include <sys/mutex.h>
54#include <sys/proc.h>
55#include <sys/resourcevar.h>
56#include <sys/signalvar.h>
57#include <sys/sysctl.h>
58#include <sys/syscallsubr.h>
59#include <sys/sysent.h>
60#include <sys/sysproto.h>
61#include <sys/vnode.h>
62#include <sys/eventhandler.h>
63
64#include <vm/vm.h>
65#include <vm/pmap.h>
66#include <vm/vm_extern.h>
67#include <vm/vm_map.h>
68#include <vm/vm_object.h>
69#include <vm/vm_page.h>
70#include <vm/vm_param.h>
71
72#include <machine/cpu.h>
73#include <machine/md_var.h>
74#include <machine/pcb.h>
75#include <machine/specialreg.h>
76
77#include <amd64/linux32/linux.h>
78#include <amd64/linux32/linux32_proto.h>
79#include <compat/linux/linux_emul.h>
80#include <compat/linux/linux_futex.h>
81#include <compat/linux/linux_ioctl.h>
82#include <compat/linux/linux_mib.h>
83#include <compat/linux/linux_misc.h>
84#include <compat/linux/linux_signal.h>
85#include <compat/linux/linux_util.h>
86#include <compat/linux/linux_vdso.h>
87
88MODULE_VERSION(linux, 1);
89
/*
 * Emit one (id, val) pair of the auxiliary vector: each half is written
 * with suword32() and 'pos' is post-incremented after each store, so it
 * ends up two elements further on.  'pos' must be a modifiable lvalue.
 */
#define	AUXARGS_ENTRY_32(pos, id, val)					\
	do {								\
		suword32((pos)++, (id));				\
		suword32((pos)++, (val));				\
	} while (0)
95
/*
 * The two characters "#!" as seen when the first two bytes of an image
 * are read as one 16-bit integer in host byte order.
 */
#if BYTE_ORDER != LITTLE_ENDIAN
#define SHELLMAGIC      0x2321
#else
#define SHELLMAGIC      0x2123 /* #! */
#endif
101
102/*
103 * Allow the sendsig functions to use the ldebug() facility
104 * even though they are not syscalls themselves. Map them
105 * to syscall 0. This is slightly less bogus than using
106 * ldebug(sigreturn).
107 */
108#define	LINUX_SYS_linux_rt_sendsig	0
109#define	LINUX_SYS_linux_sendsig		0
110
111const char *linux_kplatform;
112static int linux_szsigcode;
113static vm_object_t linux_shared_page_obj;
114static char *linux_shared_page_mapping;
115extern char _binary_linux32_locore_o_start;
116extern char _binary_linux32_locore_o_end;
117
118extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
119
120SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
121
122static int	elf_linux_fixup(register_t **stack_base,
123		    struct image_params *iparams);
124static register_t *linux_copyout_strings(struct image_params *imgp);
125static void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
126static void	exec_linux_setregs(struct thread *td,
127				   struct image_params *imgp, u_long stack);
128static void	linux32_fixlimit(struct rlimit *rl, int which);
129static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
130static void	linux_vdso_install(void *param);
131static void	linux_vdso_deinstall(void *param);
132
133/*
134 * Linux syscalls return negative errno's, we do positive and map them
135 * Reference:
136 *   FreeBSD: src/sys/sys/errno.h
137 *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
138 *            linux-2.6.17.8/include/asm-generic/errno.h
139 */
140static int bsd_to_linux_errno[ELAST + 1] = {
141	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
142	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
143	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
144	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
145	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
146	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
147	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
148	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
149	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
150	 -72, -67, -71
151};
152
153int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
154	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
155	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
156	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
157	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
158	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
159	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
160	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
161	0, LINUX_SIGUSR1, LINUX_SIGUSR2
162};
163
164int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
165	SIGHUP, SIGINT, SIGQUIT, SIGILL,
166	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
167	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
168	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
169	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
170	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
171	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
172	SIGIO, SIGURG, SIGSYS
173};
174
/*
 * FreeBSD trap type -> Linux trap number.  The table is indexed by the
 * FreeBSD T_* value; slots without a Linux counterpart hold
 * LINUX_T_UNKNOWN, and the lookup macro returns the same value for any
 * index that lies outside the table.
 */
#define LINUX_T_UNKNOWN  255
static int _bsd_to_linux_trapcode[] = {
	[0]  = LINUX_T_UNKNOWN,
	[1]  = 6,			/* T_PRIVINFLT */
	[2]  = LINUX_T_UNKNOWN,
	[3]  = 3,			/* T_BPTFLT */
	[4]  = LINUX_T_UNKNOWN,
	[5]  = LINUX_T_UNKNOWN,
	[6]  = 16,			/* T_ARITHTRAP */
	[7]  = 254,			/* T_ASTFLT */
	[8]  = LINUX_T_UNKNOWN,
	[9]  = 13,			/* T_PROTFLT */
	[10] = 1,			/* T_TRCTRAP */
	[11] = LINUX_T_UNKNOWN,
	[12] = 14,			/* T_PAGEFLT */
	[13] = LINUX_T_UNKNOWN,
	[14] = 17,			/* T_ALIGNFLT */
	[15] = LINUX_T_UNKNOWN,
	[16] = LINUX_T_UNKNOWN,
	[17] = LINUX_T_UNKNOWN,
	[18] = 0,			/* T_DIVIDE */
	[19] = 2,			/* T_NMI */
	[20] = 4,			/* T_OFLOW */
	[21] = 5,			/* T_BOUND */
	[22] = 7,			/* T_DNA */
	[23] = 8,			/* T_DOUBLEFLT */
	[24] = 9,			/* T_FPOPFLT */
	[25] = 10,			/* T_TSSFLT */
	[26] = 11,			/* T_SEGNPFLT */
	[27] = 12,			/* T_STKFLT */
	[28] = 18,			/* T_MCHK */
	[29] = 19,			/* T_XMMFLT */
	[30] = 15			/* T_RESERVED */
};
/*
 * The index is compared against a size_t element count, so a negative
 * 'code' converts to a huge unsigned value and is rejected as well.
 */
#define bsd_to_linux_trapcode(code) \
    ((code) >= sizeof(_bsd_to_linux_trapcode) / \
     sizeof(_bsd_to_linux_trapcode[0]) ? \
     LINUX_T_UNKNOWN : \
     _bsd_to_linux_trapcode[(code)])
213
/*
 * Argument/environment descriptor that linux_copyout_strings() fills in
 * at LINUX32_PS_STRINGS.  The two *str members are stored as 32-bit values
 * rather than native pointers.
 */
struct linux32_ps_strings {
	/* Argument strings: address of the first one, then how many. */
	uint32_t	ps_argvstr;
	unsigned int	ps_nargvstr;
	/* Environment strings: address of the first one, then how many. */
	uint32_t	ps_envstr;
	unsigned int	ps_nenvstr;
};
220
221LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
222LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
223LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
224LINUX_VDSO_SYM_CHAR(linux_platform);
225
226/*
227 * If FreeBSD & Linux have a difference of opinion about what a trap
228 * means, deal with it here.
229 *
230 * MPSAFE
231 */
232static int
233translate_traps(int signal, int trap_code)
234{
235	if (signal != SIGBUS)
236		return signal;
237	switch (trap_code) {
238	case T_PROTFLT:
239	case T_TSSFLT:
240	case T_DOUBLEFLT:
241	case T_PAGEFLT:
242		return SIGSEGV;
243	default:
244		return signal;
245	}
246}
247
248static int
249elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
250{
251	Elf32_Auxargs *args;
252	Elf32_Addr *base;
253	Elf32_Addr *pos;
254	struct linux32_ps_strings *arginfo;
255
256	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
257
258	KASSERT(curthread->td_proc == imgp->proc,
259	    ("unsafe elf_linux_fixup(), should be curproc"));
260	base = (Elf32_Addr *)*stack_base;
261	args = (Elf32_Auxargs *)imgp->auxargs;
262	pos = base + (imgp->args->argc + imgp->args->envc + 2);
263
264	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
265	    imgp->proc->p_sysent->sv_shared_page_base);
266	AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
267	AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
268
269	/*
270	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
271	 * as it has appeared in the 2.4.0-rc7 first time.
272	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
273	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
274	 * is not present.
275	 * Also see linux_times() implementation.
276	 */
277	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
278		AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
279	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
280	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
281	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
282	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
283	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
284	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
285	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
286	AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
287	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
288	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
289	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
290	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
291	AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform));
292	AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, PTROUT(imgp->canary));
293	if (imgp->execpathp != 0)
294		AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, PTROUT(imgp->execpathp));
295	if (args->execfd != -1)
296		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
297	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
298
299	free(imgp->auxargs, M_TEMP);
300	imgp->auxargs = NULL;
301
302	base--;
303	suword32(base, (uint32_t)imgp->args->argc);
304	*stack_base = (register_t *)base;
305	return (0);
306}
307
308static void
309linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
310{
311	struct thread *td = curthread;
312	struct proc *p = td->td_proc;
313	struct sigacts *psp;
314	struct trapframe *regs;
315	struct l_rt_sigframe *fp, frame;
316	int oonstack;
317	int sig;
318	int code;
319
320	sig = ksi->ksi_signo;
321	code = ksi->ksi_code;
322	PROC_LOCK_ASSERT(p, MA_OWNED);
323	psp = p->p_sigacts;
324	mtx_assert(&psp->ps_mtx, MA_OWNED);
325	regs = td->td_frame;
326	oonstack = sigonstack(regs->tf_rsp);
327
328#ifdef DEBUG
329	if (ldebug(rt_sendsig))
330		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
331		    catcher, sig, (void*)mask, code);
332#endif
333	/*
334	 * Allocate space for the signal handler context.
335	 */
336	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
337	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
338		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
339		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
340	} else
341		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
342	mtx_unlock(&psp->ps_mtx);
343
344	/*
345	 * Build the argument list for the signal handler.
346	 */
347	if (p->p_sysent->sv_sigtbl)
348		if (sig <= p->p_sysent->sv_sigsize)
349			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
350
351	bzero(&frame, sizeof(frame));
352
353	frame.sf_handler = PTROUT(catcher);
354	frame.sf_sig = sig;
355	frame.sf_siginfo = PTROUT(&fp->sf_si);
356	frame.sf_ucontext = PTROUT(&fp->sf_sc);
357
358	/* Fill in POSIX parts */
359	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
360
361	/*
362	 * Build the signal context to be used by sigreturn
363	 * and libgcc unwind.
364	 */
365	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
366	frame.sf_sc.uc_link = 0;		/* XXX ??? */
367
368	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
369	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
370	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
371	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
372	PROC_UNLOCK(p);
373
374	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
375
376	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
377	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
378	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
379	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
380	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
381	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_rsp;
382	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
383	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
384	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
385	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
386	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
387	frame.sf_sc.uc_mcontext.sc_gs     = regs->tf_gs;
388	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
389	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
390	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
391	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
392	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
393	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
394	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
395	frame.sf_sc.uc_mcontext.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
396	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
397
398#ifdef DEBUG
399	if (ldebug(rt_sendsig))
400		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
401		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
402		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
403#endif
404
405	if (copyout(&frame, fp, sizeof(frame)) != 0) {
406		/*
407		 * Process has trashed its stack; give it an illegal
408		 * instruction to halt it in its tracks.
409		 */
410#ifdef DEBUG
411		if (ldebug(rt_sendsig))
412			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
413			    fp, oonstack);
414#endif
415		PROC_LOCK(p);
416		sigexit(td, SIGILL);
417	}
418
419	/*
420	 * Build context to run handler in.
421	 */
422	regs->tf_rsp = PTROUT(fp);
423	regs->tf_rip = linux32_rt_sigcode;
424	regs->tf_rflags &= ~(PSL_T | PSL_D);
425	regs->tf_cs = _ucode32sel;
426	regs->tf_ss = _udatasel;
427	regs->tf_ds = _udatasel;
428	regs->tf_es = _udatasel;
429	regs->tf_fs = _ufssel;
430	regs->tf_gs = _ugssel;
431	regs->tf_flags = TF_HASSEGS;
432	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
433	PROC_LOCK(p);
434	mtx_lock(&psp->ps_mtx);
435}
436
437
438/*
439 * Send an interrupt to process.
440 *
441 * Stack is set up to allow sigcode stored
442 * in u. to call routine, followed by kcall
443 * to sigreturn routine below.  After sigreturn
444 * resets the signal mask, the stack, and the
445 * frame pointer, it returns to the user
446 * specified pc, psl.
447 */
448static void
449linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
450{
451	struct thread *td = curthread;
452	struct proc *p = td->td_proc;
453	struct sigacts *psp;
454	struct trapframe *regs;
455	struct l_sigframe *fp, frame;
456	l_sigset_t lmask;
457	int oonstack, i;
458	int sig, code;
459
460	sig = ksi->ksi_signo;
461	code = ksi->ksi_code;
462	PROC_LOCK_ASSERT(p, MA_OWNED);
463	psp = p->p_sigacts;
464	mtx_assert(&psp->ps_mtx, MA_OWNED);
465	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
466		/* Signal handler installed with SA_SIGINFO. */
467		linux_rt_sendsig(catcher, ksi, mask);
468		return;
469	}
470
471	regs = td->td_frame;
472	oonstack = sigonstack(regs->tf_rsp);
473
474#ifdef DEBUG
475	if (ldebug(sendsig))
476		printf(ARGS(sendsig, "%p, %d, %p, %u"),
477		    catcher, sig, (void*)mask, code);
478#endif
479
480	/*
481	 * Allocate space for the signal handler context.
482	 */
483	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
484	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
485		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
486		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
487	} else
488		fp = (struct l_sigframe *)regs->tf_rsp - 1;
489	mtx_unlock(&psp->ps_mtx);
490	PROC_UNLOCK(p);
491
492	/*
493	 * Build the argument list for the signal handler.
494	 */
495	if (p->p_sysent->sv_sigtbl)
496		if (sig <= p->p_sysent->sv_sigsize)
497			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
498
499	bzero(&frame, sizeof(frame));
500
501	frame.sf_handler = PTROUT(catcher);
502	frame.sf_sig = sig;
503
504	bsd_to_linux_sigset(mask, &lmask);
505
506	/*
507	 * Build the signal context to be used by sigreturn.
508	 */
509	frame.sf_sc.sc_mask   = lmask.__bits[0];
510	frame.sf_sc.sc_gs     = regs->tf_gs;
511	frame.sf_sc.sc_fs     = regs->tf_fs;
512	frame.sf_sc.sc_es     = regs->tf_es;
513	frame.sf_sc.sc_ds     = regs->tf_ds;
514	frame.sf_sc.sc_edi    = regs->tf_rdi;
515	frame.sf_sc.sc_esi    = regs->tf_rsi;
516	frame.sf_sc.sc_ebp    = regs->tf_rbp;
517	frame.sf_sc.sc_ebx    = regs->tf_rbx;
518	frame.sf_sc.sc_esp    = regs->tf_rsp;
519	frame.sf_sc.sc_edx    = regs->tf_rdx;
520	frame.sf_sc.sc_ecx    = regs->tf_rcx;
521	frame.sf_sc.sc_eax    = regs->tf_rax;
522	frame.sf_sc.sc_eip    = regs->tf_rip;
523	frame.sf_sc.sc_cs     = regs->tf_cs;
524	frame.sf_sc.sc_eflags = regs->tf_rflags;
525	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
526	frame.sf_sc.sc_ss     = regs->tf_ss;
527	frame.sf_sc.sc_err    = regs->tf_err;
528	frame.sf_sc.sc_cr2    = (u_int32_t)(uintptr_t)ksi->ksi_addr;
529	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
530
531	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
532		frame.sf_extramask[i] = lmask.__bits[i+1];
533
534	if (copyout(&frame, fp, sizeof(frame)) != 0) {
535		/*
536		 * Process has trashed its stack; give it an illegal
537		 * instruction to halt it in its tracks.
538		 */
539		PROC_LOCK(p);
540		sigexit(td, SIGILL);
541	}
542
543	/*
544	 * Build context to run handler in.
545	 */
546	regs->tf_rsp = PTROUT(fp);
547	regs->tf_rip = linux32_sigcode;
548	regs->tf_rflags &= ~(PSL_T | PSL_D);
549	regs->tf_cs = _ucode32sel;
550	regs->tf_ss = _udatasel;
551	regs->tf_ds = _udatasel;
552	regs->tf_es = _udatasel;
553	regs->tf_fs = _ufssel;
554	regs->tf_gs = _ugssel;
555	regs->tf_flags = TF_HASSEGS;
556	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
557	PROC_LOCK(p);
558	mtx_lock(&psp->ps_mtx);
559}
560
561/*
562 * System call to cleanup state after a signal
563 * has been taken.  Reset signal mask and
564 * stack state from context left by sendsig (above).
565 * Return to previous pc and psl as specified by
566 * context left by sendsig. Check carefully to
567 * make sure that the user has not modified the
568 * psl to gain improper privileges or to cause
569 * a machine fault.
570 */
571int
572linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
573{
574	struct l_sigframe frame;
575	struct trapframe *regs;
576	sigset_t bmask;
577	l_sigset_t lmask;
578	int eflags, i;
579	ksiginfo_t ksi;
580
581	regs = td->td_frame;
582
583#ifdef DEBUG
584	if (ldebug(sigreturn))
585		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
586#endif
587	/*
588	 * The trampoline code hands us the sigframe.
589	 * It is unsafe to keep track of it ourselves, in the event that a
590	 * program jumps out of a signal handler.
591	 */
592	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
593		return (EFAULT);
594
595	/*
596	 * Check for security violations.
597	 */
598#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
599	eflags = frame.sf_sc.sc_eflags;
600	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
601		return(EINVAL);
602
603	/*
604	 * Don't allow users to load a valid privileged %cs.  Let the
605	 * hardware check for invalid selectors, excess privilege in
606	 * other selectors, invalid %eip's and invalid %esp's.
607	 */
608#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
609	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
610		ksiginfo_init_trap(&ksi);
611		ksi.ksi_signo = SIGBUS;
612		ksi.ksi_code = BUS_OBJERR;
613		ksi.ksi_trapno = T_PROTFLT;
614		ksi.ksi_addr = (void *)regs->tf_rip;
615		trapsignal(td, &ksi);
616		return(EINVAL);
617	}
618
619	lmask.__bits[0] = frame.sf_sc.sc_mask;
620	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
621		lmask.__bits[i+1] = frame.sf_extramask[i];
622	linux_to_bsd_sigset(&lmask, &bmask);
623	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
624
625	/*
626	 * Restore signal context.
627	 */
628	regs->tf_rdi    = frame.sf_sc.sc_edi;
629	regs->tf_rsi    = frame.sf_sc.sc_esi;
630	regs->tf_rbp    = frame.sf_sc.sc_ebp;
631	regs->tf_rbx    = frame.sf_sc.sc_ebx;
632	regs->tf_rdx    = frame.sf_sc.sc_edx;
633	regs->tf_rcx    = frame.sf_sc.sc_ecx;
634	regs->tf_rax    = frame.sf_sc.sc_eax;
635	regs->tf_rip    = frame.sf_sc.sc_eip;
636	regs->tf_cs     = frame.sf_sc.sc_cs;
637	regs->tf_ds     = frame.sf_sc.sc_ds;
638	regs->tf_es     = frame.sf_sc.sc_es;
639	regs->tf_fs     = frame.sf_sc.sc_fs;
640	regs->tf_gs     = frame.sf_sc.sc_gs;
641	regs->tf_rflags = eflags;
642	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
643	regs->tf_ss     = frame.sf_sc.sc_ss;
644	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
645
646	return (EJUSTRETURN);
647}
648
649/*
650 * System call to cleanup state after a signal
651 * has been taken.  Reset signal mask and
652 * stack state from context left by rt_sendsig (above).
653 * Return to previous pc and psl as specified by
654 * context left by sendsig. Check carefully to
655 * make sure that the user has not modified the
656 * psl to gain improper privileges or to cause
657 * a machine fault.
658 */
659int
660linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
661{
662	struct l_ucontext uc;
663	struct l_sigcontext *context;
664	sigset_t bmask;
665	l_stack_t *lss;
666	stack_t ss;
667	struct trapframe *regs;
668	int eflags;
669	ksiginfo_t ksi;
670
671	regs = td->td_frame;
672
673#ifdef DEBUG
674	if (ldebug(rt_sigreturn))
675		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
676#endif
677	/*
678	 * The trampoline code hands us the ucontext.
679	 * It is unsafe to keep track of it ourselves, in the event that a
680	 * program jumps out of a signal handler.
681	 */
682	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
683		return (EFAULT);
684
685	context = &uc.uc_mcontext;
686
687	/*
688	 * Check for security violations.
689	 */
690#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
691	eflags = context->sc_eflags;
692	if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
693		return(EINVAL);
694
695	/*
696	 * Don't allow users to load a valid privileged %cs.  Let the
697	 * hardware check for invalid selectors, excess privilege in
698	 * other selectors, invalid %eip's and invalid %esp's.
699	 */
700#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
701	if (!CS_SECURE(context->sc_cs)) {
702		ksiginfo_init_trap(&ksi);
703		ksi.ksi_signo = SIGBUS;
704		ksi.ksi_code = BUS_OBJERR;
705		ksi.ksi_trapno = T_PROTFLT;
706		ksi.ksi_addr = (void *)regs->tf_rip;
707		trapsignal(td, &ksi);
708		return(EINVAL);
709	}
710
711	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
712	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
713
714	/*
715	 * Restore signal context
716	 */
717	regs->tf_gs	= context->sc_gs;
718	regs->tf_fs	= context->sc_fs;
719	regs->tf_es	= context->sc_es;
720	regs->tf_ds	= context->sc_ds;
721	regs->tf_rdi    = context->sc_edi;
722	regs->tf_rsi    = context->sc_esi;
723	regs->tf_rbp    = context->sc_ebp;
724	regs->tf_rbx    = context->sc_ebx;
725	regs->tf_rdx    = context->sc_edx;
726	regs->tf_rcx    = context->sc_ecx;
727	regs->tf_rax    = context->sc_eax;
728	regs->tf_rip    = context->sc_eip;
729	regs->tf_cs     = context->sc_cs;
730	regs->tf_rflags = eflags;
731	regs->tf_rsp    = context->sc_esp_at_signal;
732	regs->tf_ss     = context->sc_ss;
733	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
734
735	/*
736	 * call sigaltstack & ignore results..
737	 */
738	lss = &uc.uc_stack;
739	ss.ss_sp = PTRIN(lss->ss_sp);
740	ss.ss_size = lss->ss_size;
741	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
742
743#ifdef DEBUG
744	if (ldebug(rt_sigreturn))
745		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
746		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
747#endif
748	(void)kern_sigaltstack(td, &ss, NULL);
749
750	return (EJUSTRETURN);
751}
752
753static int
754linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
755{
756	struct proc *p;
757	struct trapframe *frame;
758
759	p = td->td_proc;
760	frame = td->td_frame;
761
762	sa->args[0] = frame->tf_rbx;
763	sa->args[1] = frame->tf_rcx;
764	sa->args[2] = frame->tf_rdx;
765	sa->args[3] = frame->tf_rsi;
766	sa->args[4] = frame->tf_rdi;
767	sa->args[5] = frame->tf_rbp;	/* Unconfirmed */
768	sa->code = frame->tf_rax;
769
770	if (sa->code >= p->p_sysent->sv_size)
771		sa->callp = &p->p_sysent->sv_table[0];
772	else
773		sa->callp = &p->p_sysent->sv_table[sa->code];
774	sa->narg = sa->callp->sy_narg;
775
776	td->td_retval[0] = 0;
777	td->td_retval[1] = frame->tf_rdx;
778
779	return (0);
780}
781
782/*
783 * If a linux binary is exec'ing something, try this image activator
784 * first.  We override standard shell script execution in order to
785 * be able to modify the interpreter path.  We only do this if a linux
786 * binary is doing the exec, so we do not create an EXEC module for it.
787 */
788static int	exec_linux_imgact_try(struct image_params *iparams);
789
790static int
791exec_linux_imgact_try(struct image_params *imgp)
792{
793	const char *head = (const char *)imgp->image_header;
794	char *rpath;
795	int error = -1;
796
797	/*
798	* The interpreter for shell scripts run from a linux binary needs
799	* to be located in /compat/linux if possible in order to recursively
800	* maintain linux path emulation.
801	*/
802	if (((const short *)head)[0] == SHELLMAGIC) {
803		/*
804		* Run our normal shell image activator.  If it succeeds attempt
805		* to use the alternate path for the interpreter.  If an
806		* alternate * path is found, use our stringspace to store it.
807		*/
808		if ((error = exec_shell_imgact(imgp)) == 0) {
809			linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
810			    imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
811			    AT_FDCWD);
812			if (rpath != NULL)
813				imgp->args->fname_buf =
814				    imgp->interpreter_name = rpath;
815		}
816	}
817	return (error);
818}
819
820/*
821 * Clear registers on exec
822 * XXX copied from ia32_signal.c.
823 */
824static void
825exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
826{
827	struct trapframe *regs = td->td_frame;
828	struct pcb *pcb = td->td_pcb;
829
830	mtx_lock(&dt_lock);
831	if (td->td_proc->p_md.md_ldt != NULL)
832		user_ldt_free(td);
833	else
834		mtx_unlock(&dt_lock);
835
836	critical_enter();
837	wrmsr(MSR_FSBASE, 0);
838	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
839	pcb->pcb_fsbase = 0;
840	pcb->pcb_gsbase = 0;
841	critical_exit();
842	pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
843
844	bzero((char *)regs, sizeof(struct trapframe));
845	regs->tf_rip = imgp->entry_addr;
846	regs->tf_rsp = stack;
847	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
848	regs->tf_gs = _ugssel;
849	regs->tf_fs = _ufssel;
850	regs->tf_es = _udatasel;
851	regs->tf_ds = _udatasel;
852	regs->tf_ss = _udatasel;
853	regs->tf_flags = TF_HASSEGS;
854	regs->tf_cs = _ucode32sel;
855	regs->tf_rbx = imgp->ps_strings;
856
857	fpstate_drop(td);
858
859	/* Do full restore on return so that we can change to a different %cs */
860	set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
861	td->td_retval[1] = 0;
862}
863
864/*
865 * XXX copied from ia32_sysvec.c.
866 */
867static register_t *
868linux_copyout_strings(struct image_params *imgp)
869{
870	int argc, envc;
871	u_int32_t *vectp;
872	char *stringp, *destp;
873	u_int32_t *stack_base;
874	struct linux32_ps_strings *arginfo;
875	char canary[LINUX_AT_RANDOM_LEN];
876	size_t execpath_len;
877
878	/*
879	 * Calculate string base and vector table pointers.
880	 */
881	if (imgp->execpath != NULL && imgp->auxargs != NULL)
882		execpath_len = strlen(imgp->execpath) + 1;
883	else
884		execpath_len = 0;
885
886	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
887	destp =	(caddr_t)arginfo - SPARE_USRSPACE -
888	    roundup(sizeof(canary), sizeof(char *)) -
889	    roundup(execpath_len, sizeof(char *)) -
890	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
891
892	if (execpath_len != 0) {
893		imgp->execpathp = (uintptr_t)arginfo - execpath_len;
894		copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len);
895	}
896
897	/*
898	 * Prepare the canary for SSP.
899	 */
900	arc4rand(canary, sizeof(canary), 0);
901	imgp->canary = (uintptr_t)arginfo -
902	    roundup(execpath_len, sizeof(char *)) -
903	    roundup(sizeof(canary), sizeof(char *));
904	copyout(canary, (void *)imgp->canary, sizeof(canary));
905
906	/*
907	 * If we have a valid auxargs ptr, prepare some room
908	 * on the stack.
909	 */
910	if (imgp->auxargs) {
911		/*
912		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
913		 * lower compatibility.
914		 */
915		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
916		    (LINUX_AT_COUNT * 2);
917		/*
918		 * The '+ 2' is for the null pointers at the end of each of
919		 * the arg and env vector sets,and imgp->auxarg_size is room
920		 * for argument of Runtime loader.
921		 */
922		vectp = (u_int32_t *) (destp - (imgp->args->argc +
923		    imgp->args->envc + 2 + imgp->auxarg_size) *
924		    sizeof(u_int32_t));
925
926	} else
927		/*
928		 * The '+ 2' is for the null pointers at the end of each of
929		 * the arg and env vector sets
930		 */
931		vectp = (u_int32_t *)(destp - (imgp->args->argc +
932		    imgp->args->envc + 2) * sizeof(u_int32_t));
933
934	/*
935	 * vectp also becomes our initial stack base
936	 */
937	stack_base = vectp;
938
939	stringp = imgp->args->begin_argv;
940	argc = imgp->args->argc;
941	envc = imgp->args->envc;
942	/*
943	 * Copy out strings - arguments and environment.
944	 */
945	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
946
947	/*
948	 * Fill in "ps_strings" struct for ps, w, etc.
949	 */
950	suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
951	suword32(&arginfo->ps_nargvstr, argc);
952
953	/*
954	 * Fill in argument portion of vector table.
955	 */
956	for (; argc > 0; --argc) {
957		suword32(vectp++, (uint32_t)(intptr_t)destp);
958		while (*stringp++ != 0)
959			destp++;
960		destp++;
961	}
962
963	/* a null vector table pointer separates the argp's from the envp's */
964	suword32(vectp++, 0);
965
966	suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
967	suword32(&arginfo->ps_nenvstr, envc);
968
969	/*
970	 * Fill in environment portion of vector table.
971	 */
972	for (; envc > 0; --envc) {
973		suword32(vectp++, (uint32_t)(intptr_t)destp);
974		while (*stringp++ != 0)
975			destp++;
976		destp++;
977	}
978
979	/* end of vector table is a null pointer */
980	suword32(vectp, 0);
981
982	return ((register_t *)stack_base);
983}
984
985static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
986    "32-bit Linux emulation");
987
988static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
989SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
990    &linux32_maxdsiz, 0, "");
991static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
992SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
993    &linux32_maxssiz, 0, "");
994static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
995SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
996    &linux32_maxvmem, 0, "");
997
998#if defined(DEBUG)
999SYSCTL_PROC(_compat_linux32, OID_AUTO, debug,
1000            CTLTYPE_STRING | CTLFLAG_RW,
1001            0, 0, linux_sysctl_debug, "A",
1002            "Linux debugging control");
1003#endif
1004
1005static void
1006linux32_fixlimit(struct rlimit *rl, int which)
1007{
1008
1009	switch (which) {
1010	case RLIMIT_DATA:
1011		if (linux32_maxdsiz != 0) {
1012			if (rl->rlim_cur > linux32_maxdsiz)
1013				rl->rlim_cur = linux32_maxdsiz;
1014			if (rl->rlim_max > linux32_maxdsiz)
1015				rl->rlim_max = linux32_maxdsiz;
1016		}
1017		break;
1018	case RLIMIT_STACK:
1019		if (linux32_maxssiz != 0) {
1020			if (rl->rlim_cur > linux32_maxssiz)
1021				rl->rlim_cur = linux32_maxssiz;
1022			if (rl->rlim_max > linux32_maxssiz)
1023				rl->rlim_max = linux32_maxssiz;
1024		}
1025		break;
1026	case RLIMIT_VMEM:
1027		if (linux32_maxvmem != 0) {
1028			if (rl->rlim_cur > linux32_maxvmem)
1029				rl->rlim_cur = linux32_maxvmem;
1030			if (rl->rlim_max > linux32_maxvmem)
1031				rl->rlim_max = linux32_maxvmem;
1032		}
1033		break;
1034	}
1035}
1036
1037struct sysentvec elf_linux_sysvec = {
1038	.sv_size	= LINUX_SYS_MAXSYSCALL,
1039	.sv_table	= linux_sysent,
1040	.sv_mask	= 0,
1041	.sv_sigsize	= LINUX_SIGTBLSZ,
1042	.sv_sigtbl	= bsd_to_linux_signal,
1043	.sv_errsize	= ELAST + 1,
1044	.sv_errtbl	= bsd_to_linux_errno,
1045	.sv_transtrap	= translate_traps,
1046	.sv_fixup	= elf_linux_fixup,
1047	.sv_sendsig	= linux_sendsig,
1048	.sv_sigcode	= &_binary_linux32_locore_o_start,
1049	.sv_szsigcode	= &linux_szsigcode,
1050	.sv_prepsyscall	= NULL,
1051	.sv_name	= "Linux ELF32",
1052	.sv_coredump	= elf32_coredump,
1053	.sv_imgact_try	= exec_linux_imgact_try,
1054	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1055	.sv_pagesize	= PAGE_SIZE,
1056	.sv_minuser	= VM_MIN_ADDRESS,
1057	.sv_maxuser	= LINUX32_MAXUSER,
1058	.sv_usrstack	= LINUX32_USRSTACK,
1059	.sv_psstrings	= LINUX32_PS_STRINGS,
1060	.sv_stackprot	= VM_PROT_ALL,
1061	.sv_copyout_strings = linux_copyout_strings,
1062	.sv_setregs	= exec_linux_setregs,
1063	.sv_fixlimit	= linux32_fixlimit,
1064	.sv_maxssiz	= &linux32_maxssiz,
1065	.sv_flags	= SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
1066	.sv_set_syscall_retval = cpu_set_syscall_retval,
1067	.sv_fetch_syscall_args = linux32_fetch_syscall_args,
1068	.sv_syscallnames = NULL,
1069	.sv_shared_page_base = LINUX32_SHAREDPAGE,
1070	.sv_shared_page_len = PAGE_SIZE,
1071	.sv_schedtail	= linux_schedtail,
1072	.sv_thread_detach = linux_thread_detach,
1073};
1074
1075static void
1076linux_vdso_install(void *param)
1077{
1078
1079	linux_szsigcode = (&_binary_linux32_locore_o_end -
1080	    &_binary_linux32_locore_o_start);
1081
1082	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1083		panic("Linux invalid vdso size\n");
1084
1085	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1086
1087	linux_shared_page_obj = __elfN(linux_shared_page_init)
1088	    (&linux_shared_page_mapping);
1089
1090	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE);
1091
1092	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1093	    linux_szsigcode);
1094	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1095
1096	linux_kplatform = linux_shared_page_mapping +
1097	    (linux_platform - (caddr_t)LINUX32_SHAREDPAGE);
1098}
1099SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1100    (sysinit_cfunc_t)linux_vdso_install, NULL);
1101
1102static void
1103linux_vdso_deinstall(void *param)
1104{
1105
1106	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1107};
1108SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1109    (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1110
1111static char GNU_ABI_VENDOR[] = "GNU";
1112static int GNULINUX_ABI_DESC = 0;
1113
1114static boolean_t
1115linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1116{
1117	const Elf32_Word *desc;
1118	uintptr_t p;
1119
1120	p = (uintptr_t)(note + 1);
1121	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1122
1123	desc = (const Elf32_Word *)p;
1124	if (desc[0] != GNULINUX_ABI_DESC)
1125		return (FALSE);
1126
1127	/*
1128	 * For linux we encode osrel as follows (see linux_mib.c):
1129	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1130	 */
1131	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1132
1133	return (TRUE);
1134}
1135
1136static Elf_Brandnote linux32_brandnote = {
1137	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1138	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1139	.hdr.n_type	= 1,
1140	.vendor		= GNU_ABI_VENDOR,
1141	.flags		= BN_TRANSLATE_OSREL,
1142	.trans_osrel	= linux32_trans_osrel
1143};
1144
1145static Elf32_Brandinfo linux_brand = {
1146	.brand		= ELFOSABI_LINUX,
1147	.machine	= EM_386,
1148	.compat_3_brand	= "Linux",
1149	.emul_path	= "/compat/linux",
1150	.interp_path	= "/lib/ld-linux.so.1",
1151	.sysvec		= &elf_linux_sysvec,
1152	.interp_newpath	= NULL,
1153	.brand_note	= &linux32_brandnote,
1154	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1155};
1156
1157static Elf32_Brandinfo linux_glibc2brand = {
1158	.brand		= ELFOSABI_LINUX,
1159	.machine	= EM_386,
1160	.compat_3_brand	= "Linux",
1161	.emul_path	= "/compat/linux",
1162	.interp_path	= "/lib/ld-linux.so.2",
1163	.sysvec		= &elf_linux_sysvec,
1164	.interp_newpath	= NULL,
1165	.brand_note	= &linux32_brandnote,
1166	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1167};
1168
1169Elf32_Brandinfo *linux_brandlist[] = {
1170	&linux_brand,
1171	&linux_glibc2brand,
1172	NULL
1173};
1174
1175static int
1176linux_elf_modevent(module_t mod, int type, void *data)
1177{
1178	Elf32_Brandinfo **brandinfo;
1179	int error;
1180	struct linux_ioctl_handler **lihp;
1181
1182	error = 0;
1183
1184	switch(type) {
1185	case MOD_LOAD:
1186		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1187		     ++brandinfo)
1188			if (elf32_insert_brand_entry(*brandinfo) < 0)
1189				error = EINVAL;
1190		if (error == 0) {
1191			SET_FOREACH(lihp, linux_ioctl_handler_set)
1192				linux_ioctl_register_handler(*lihp);
1193			LIST_INIT(&futex_list);
1194			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1195			stclohz = (stathz ? stathz : hz);
1196			if (bootverbose)
1197				printf("Linux ELF exec handler installed\n");
1198		} else
1199			printf("cannot insert Linux ELF brand handler\n");
1200		break;
1201	case MOD_UNLOAD:
1202		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1203		     ++brandinfo)
1204			if (elf32_brand_inuse(*brandinfo))
1205				error = EBUSY;
1206		if (error == 0) {
1207			for (brandinfo = &linux_brandlist[0];
1208			     *brandinfo != NULL; ++brandinfo)
1209				if (elf32_remove_brand_entry(*brandinfo) < 0)
1210					error = EINVAL;
1211		}
1212		if (error == 0) {
1213			SET_FOREACH(lihp, linux_ioctl_handler_set)
1214				linux_ioctl_unregister_handler(*lihp);
1215			mtx_destroy(&futex_mtx);
1216			if (bootverbose)
1217				printf("Linux ELF exec handler removed\n");
1218		} else
1219			printf("Could not deinstall ELF interpreter entry\n");
1220		break;
1221	default:
1222		return (EOPNOTSUPP);
1223	}
1224	return (error);
1225}
1226
1227static moduledata_t linux_elf_mod = {
1228	"linuxelf",
1229	linux_elf_modevent,
1230	0
1231};
1232
1233DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1234MODULE_DEPEND(linuxelf, linux_common, 1, 1, 1);
1235