linux_sysvec.c revision 293535
1131554Stjr/*-
 * Copyright (c) 1994-1996 Søren Schmidt
3131554Stjr * All rights reserved.
4131554Stjr *
5131554Stjr * Redistribution and use in source and binary forms, with or without
6131554Stjr * modification, are permitted provided that the following conditions
7131554Stjr * are met:
8131554Stjr * 1. Redistributions of source code must retain the above copyright
9131554Stjr *    notice, this list of conditions and the following disclaimer
10131554Stjr *    in this position and unchanged.
11131554Stjr * 2. Redistributions in binary form must reproduce the above copyright
12131554Stjr *    notice, this list of conditions and the following disclaimer in the
13131554Stjr *    documentation and/or other materials provided with the distribution.
14131554Stjr * 3. The name of the author may not be used to endorse or promote products
15131554Stjr *    derived from this software without specific prior written permission
16131554Stjr *
17131554Stjr * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18131554Stjr * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19131554Stjr * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20131554Stjr * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21131554Stjr * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22131554Stjr * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23131554Stjr * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24131554Stjr * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25131554Stjr * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26131554Stjr * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27131554Stjr */
28131554Stjr
29131554Stjr#include <sys/cdefs.h>
30131554Stjr__FBSDID("$FreeBSD: stable/10/sys/i386/linux/linux_sysvec.c 293535 2016-01-09 16:24:30Z dchagin $");
31131554Stjr
32131554Stjr#include <sys/param.h>
33131554Stjr#include <sys/systm.h>
34131554Stjr#include <sys/exec.h>
35131554Stjr#include <sys/fcntl.h>
36131554Stjr#include <sys/imgact.h>
37131554Stjr#include <sys/imgact_aout.h>
38131554Stjr#include <sys/imgact_elf.h>
39131554Stjr#include <sys/kernel.h>
40131554Stjr#include <sys/lock.h>
41131554Stjr#include <sys/malloc.h>
42131554Stjr#include <sys/module.h>
43131554Stjr#include <sys/mutex.h>
44131554Stjr#include <sys/proc.h>
45131554Stjr#include <sys/signalvar.h>
46131554Stjr#include <sys/syscallsubr.h>
47131554Stjr#include <sys/sysctl.h>
48131554Stjr#include <sys/sysent.h>
49131554Stjr#include <sys/sysproto.h>
50131554Stjr#include <sys/vnode.h>
51131554Stjr#include <sys/eventhandler.h>
52131554Stjr
53131554Stjr#include <vm/vm.h>
54131554Stjr#include <vm/pmap.h>
55131554Stjr#include <vm/vm_extern.h>
56131554Stjr#include <vm/vm_map.h>
57131554Stjr#include <vm/vm_object.h>
58131554Stjr#include <vm/vm_page.h>
59131554Stjr#include <vm/vm_param.h>
60131554Stjr
61131554Stjr#include <machine/cpu.h>
62131554Stjr#include <machine/cputypes.h>
63131554Stjr#include <machine/md_var.h>
64131554Stjr#include <machine/pcb.h>
65131554Stjr
66131554Stjr#include <i386/linux/linux.h>
67131554Stjr#include <i386/linux/linux_proto.h>
68131554Stjr#include <compat/linux/linux_emul.h>
69131554Stjr#include <compat/linux/linux_futex.h>
70131554Stjr#include <compat/linux/linux_ioctl.h>
71131554Stjr#include <compat/linux/linux_mib.h>
72131554Stjr#include <compat/linux/linux_misc.h>
73131554Stjr#include <compat/linux/linux_signal.h>
74131554Stjr#include <compat/linux/linux_util.h>
75131554Stjr#include <compat/linux/linux_vdso.h>
76131554Stjr
77131554StjrMODULE_VERSION(linux, 1);
78131554Stjr
79131554Stjr#if BYTE_ORDER == LITTLE_ENDIAN
80131554Stjr#define SHELLMAGIC      0x2123 /* #! */
81131554Stjr#else
82131554Stjr#define SHELLMAGIC      0x2321
83131554Stjr#endif
84131554Stjr
85131554Stjr#if defined(DEBUG)
86131554StjrSYSCTL_PROC(_compat_linux, OID_AUTO, debug,
87131554Stjr            CTLTYPE_STRING | CTLFLAG_RW,
88131554Stjr            0, 0, linux_sysctl_debug, "A",
89131554Stjr            "Linux debugging control");
90131554Stjr#endif
91131554Stjr
92131554Stjr/*
93131554Stjr * Allow the sendsig functions to use the ldebug() facility
94131554Stjr * even though they are not syscalls themselves. Map them
95131554Stjr * to syscall 0. This is slightly less bogus than using
96131554Stjr * ldebug(sigreturn).
97131554Stjr */
98131554Stjr#define	LINUX_SYS_linux_rt_sendsig	0
99131554Stjr#define	LINUX_SYS_linux_sendsig		0
100131554Stjr
101131554Stjr#define	LINUX_PS_STRINGS	(LINUX_USRSTACK - sizeof(struct ps_strings))
102131554Stjr
103131554Stjrstatic int linux_szsigcode;
104131554Stjrstatic vm_object_t linux_shared_page_obj;
105131554Stjrstatic char *linux_shared_page_mapping;
106131554Stjrextern char _binary_linux_locore_o_start;
107131554Stjrextern char _binary_linux_locore_o_end;
108131554Stjr
109131554Stjrextern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
110131554Stjr
111131554StjrSET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
112131554Stjr
113131554Stjrstatic int	linux_fixup(register_t **stack_base,
114131554Stjr		    struct image_params *iparams);
115131554Stjrstatic int	elf_linux_fixup(register_t **stack_base,
116131554Stjr		    struct image_params *iparams);
117131554Stjrstatic void     linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
118131554Stjrstatic void	exec_linux_setregs(struct thread *td,
119131554Stjr		    struct image_params *imgp, u_long stack);
120131554Stjrstatic register_t *linux_copyout_strings(struct image_params *imgp);
121131554Stjrstatic boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel);
122131554Stjrstatic void	linux_vdso_install(void *param);
123131554Stjrstatic void	linux_vdso_deinstall(void *param);
124131554Stjr
125131554Stjrstatic int linux_szplatform;
126131554Stjrconst char *linux_kplatform;
127131554Stjr
128131554Stjrstatic eventhandler_tag linux_exit_tag;
129131554Stjrstatic eventhandler_tag linux_exec_tag;
130131554Stjrstatic eventhandler_tag linux_thread_dtor_tag;
131131554Stjr
132131554Stjr/*
133131554Stjr * Linux syscalls return negative errno's, we do positive and map them
134131554Stjr * Reference:
135131554Stjr *   FreeBSD: src/sys/sys/errno.h
136131554Stjr *   Linux:   linux-2.6.17.8/include/asm-generic/errno-base.h
137131554Stjr *            linux-2.6.17.8/include/asm-generic/errno.h
138131554Stjr */
139131554Stjrstatic int bsd_to_linux_errno[ELAST + 1] = {
140131554Stjr	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
141131554Stjr	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
142131554Stjr	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
143131554Stjr	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
144131554Stjr	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
145131554Stjr	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
146131554Stjr	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
147131554Stjr	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
148131554Stjr	  -6,  -6, -43, -42, -75,-125, -84, -95, -16, -74,
149131554Stjr	 -72, -67, -71
150131554Stjr};
151131554Stjr
152131554Stjrint bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
153131554Stjr	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
154131554Stjr	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
155131554Stjr	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
156131554Stjr	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
157131554Stjr	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
158131554Stjr	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
159131554Stjr	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
160131554Stjr	0, LINUX_SIGUSR1, LINUX_SIGUSR2
161131554Stjr};
162131554Stjr
163131554Stjrint linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
164131554Stjr	SIGHUP, SIGINT, SIGQUIT, SIGILL,
165131554Stjr	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
166131554Stjr	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
167131554Stjr	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
168131554Stjr	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
169131554Stjr	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
170131554Stjr	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
171131554Stjr	SIGIO, SIGURG, SIGSYS
172131554Stjr};
173131554Stjr
/* Value reported for any trap number that has no Linux equivalent. */
#define LINUX_T_UNKNOWN  255

/*
 * FreeBSD trap number -> Linux trap number, indexed by the FreeBSD T_*
 * value named beside each entry.  Slots without a name have no mapping.
 */
static int _bsd_to_linux_trapcode[] = {
	[0]  = LINUX_T_UNKNOWN,
	[1]  = 6,			/* T_PRIVINFLT */
	[2]  = LINUX_T_UNKNOWN,
	[3]  = 3,			/* T_BPTFLT */
	[4]  = LINUX_T_UNKNOWN,
	[5]  = LINUX_T_UNKNOWN,
	[6]  = 16,			/* T_ARITHTRAP */
	[7]  = 254,			/* T_ASTFLT */
	[8]  = LINUX_T_UNKNOWN,
	[9]  = 13,			/* T_PROTFLT */
	[10] = 1,			/* T_TRCTRAP */
	[11] = LINUX_T_UNKNOWN,
	[12] = 14,			/* T_PAGEFLT */
	[13] = LINUX_T_UNKNOWN,
	[14] = 17,			/* T_ALIGNFLT */
	[15] = LINUX_T_UNKNOWN,
	[16] = LINUX_T_UNKNOWN,
	[17] = LINUX_T_UNKNOWN,
	[18] = 0,			/* T_DIVIDE */
	[19] = 2,			/* T_NMI */
	[20] = 4,			/* T_OFLOW */
	[21] = 5,			/* T_BOUND */
	[22] = 7,			/* T_DNA */
	[23] = 8,			/* T_DOUBLEFLT */
	[24] = 9,			/* T_FPOPFLT */
	[25] = 10,			/* T_TSSFLT */
	[26] = 11,			/* T_SEGNPFLT */
	[27] = 12,			/* T_STKFLT */
	[28] = 18,			/* T_MCHK */
	[29] = 19,			/* T_XMMFLT */
	[30] = 15			/* T_RESERVED */
};

/*
 * Translate a FreeBSD trap number into its Linux counterpart.  Anything
 * outside the table (negative or too large) yields LINUX_T_UNKNOWN.
 */
static inline int
bsd_to_linux_trapcode(int code)
{
	const size_t nentries = sizeof(_bsd_to_linux_trapcode) /
	    sizeof(_bsd_to_linux_trapcode[0]);

	if (code < 0 || (size_t)code >= nentries)
		return (LINUX_T_UNKNOWN);
	return (_bsd_to_linux_trapcode[code]);
}
212131554Stjr
213131554StjrLINUX_VDSO_SYM_INTPTR(linux_sigcode);
214131554StjrLINUX_VDSO_SYM_INTPTR(linux_rt_sigcode);
215131554StjrLINUX_VDSO_SYM_INTPTR(linux_vsyscall);
216131554Stjr
217131554Stjr/*
218131554Stjr * If FreeBSD & Linux have a difference of opinion about what a trap
219131554Stjr * means, deal with it here.
220131554Stjr *
221131554Stjr * MPSAFE
222131554Stjr */
223131554Stjrstatic int
224131554Stjrtranslate_traps(int signal, int trap_code)
225131554Stjr{
226131554Stjr	if (signal != SIGBUS)
227131554Stjr		return (signal);
228131554Stjr	switch (trap_code) {
229131554Stjr	case T_PROTFLT:
230131554Stjr	case T_TSSFLT:
231131554Stjr	case T_DOUBLEFLT:
232131554Stjr	case T_PAGEFLT:
233131554Stjr		return (SIGSEGV);
234131554Stjr	default:
235131554Stjr		return (signal);
236131554Stjr	}
237131554Stjr}
238131554Stjr
239131554Stjrstatic int
240131554Stjrlinux_fixup(register_t **stack_base, struct image_params *imgp)
241131554Stjr{
242131554Stjr	register_t *argv, *envp;
243131554Stjr
244131554Stjr	argv = *stack_base;
245131554Stjr	envp = *stack_base + (imgp->args->argc + 1);
246131554Stjr	(*stack_base)--;
247131554Stjr	suword(*stack_base, (intptr_t)(void *)envp);
248131554Stjr	(*stack_base)--;
249131554Stjr	suword(*stack_base, (intptr_t)(void *)argv);
250131554Stjr	(*stack_base)--;
251131554Stjr	suword(*stack_base, imgp->args->argc);
252131554Stjr	return (0);
253131554Stjr}
254131554Stjr
255131554Stjrstatic int
256131554Stjrelf_linux_fixup(register_t **stack_base, struct image_params *imgp)
257131554Stjr{
258131554Stjr	struct proc *p;
259131554Stjr	Elf32_Auxargs *args;
260131554Stjr	Elf32_Addr *uplatform;
261131554Stjr	struct ps_strings *arginfo;
262131554Stjr	register_t *pos;
263131554Stjr
264131554Stjr	KASSERT(curthread->td_proc == imgp->proc,
265131554Stjr	    ("unsafe elf_linux_fixup(), should be curproc"));
266131554Stjr
267131554Stjr	p = imgp->proc;
268131554Stjr	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
269131554Stjr	uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform);
270131554Stjr	args = (Elf32_Auxargs *)imgp->auxargs;
271131554Stjr	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
272131554Stjr
273131554Stjr	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR,
274131554Stjr	    imgp->proc->p_sysent->sv_shared_page_base);
275131554Stjr	AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO, linux_vsyscall);
276131554Stjr	AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature);
277131554Stjr
278131554Stjr	/*
279131554Stjr	 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
280131554Stjr	 * as it has appeared in the 2.4.0-rc7 first time.
281131554Stjr	 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
282131554Stjr	 * glibc falls back to the hard-coded CLK_TCK value when aux entry
283131554Stjr	 * is not present.
284131554Stjr	 * Also see linux_times() implementation.
285131554Stjr	 */
286131554Stjr	if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
287131554Stjr		AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz);
288131554Stjr	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
289131554Stjr	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
290131554Stjr	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
291131554Stjr	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
292131554Stjr	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
293131554Stjr	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
294131554Stjr	AUXARGS_ENTRY(pos, AT_BASE, args->base);
295131554Stjr	AUXARGS_ENTRY(pos, LINUX_AT_SECURE, 0);
296131554Stjr	AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
297131554Stjr	AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
298131554Stjr	AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
299131554Stjr	AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
300131554Stjr	AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
301131554Stjr	AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, imgp->canary);
302131554Stjr	if (imgp->execpathp != 0)
303131554Stjr		AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, imgp->execpathp);
304131554Stjr	if (args->execfd != -1)
305131554Stjr		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
306131554Stjr	AUXARGS_ENTRY(pos, AT_NULL, 0);
307131554Stjr
308131554Stjr	free(imgp->auxargs, M_TEMP);
309131554Stjr	imgp->auxargs = NULL;
310131554Stjr
311131554Stjr	(*stack_base)--;
312131554Stjr	suword(*stack_base, (register_t)imgp->args->argc);
313131554Stjr	return (0);
314131554Stjr}
315131554Stjr
316131554Stjr/*
317131554Stjr * Copied from kern/kern_exec.c
318131554Stjr */
319131554Stjrstatic register_t *
320131554Stjrlinux_copyout_strings(struct image_params *imgp)
321131554Stjr{
322131554Stjr	int argc, envc;
323131554Stjr	char **vectp;
324131554Stjr	char *stringp, *destp;
325131554Stjr	register_t *stack_base;
326131554Stjr	struct ps_strings *arginfo;
327131554Stjr	char canary[LINUX_AT_RANDOM_LEN];
328131554Stjr	size_t execpath_len;
329131554Stjr	struct proc *p;
330131554Stjr
331131554Stjr	/*
332131554Stjr	 * Calculate string base and vector table pointers.
333131554Stjr	 */
334131554Stjr	p = imgp->proc;
335131554Stjr	if (imgp->execpath != NULL && imgp->auxargs != NULL)
336131554Stjr		execpath_len = strlen(imgp->execpath) + 1;
337131554Stjr	else
338131554Stjr		execpath_len = 0;
339131554Stjr	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
340131554Stjr	destp = (caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform -
341131554Stjr	    roundup(sizeof(canary), sizeof(char *)) -
342131554Stjr	    roundup(execpath_len, sizeof(char *)) -
343131554Stjr	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
344131554Stjr
345131554Stjr	/*
346131554Stjr	 * install LINUX_PLATFORM
347131554Stjr	 */
348131554Stjr	copyout(linux_kplatform, ((caddr_t)arginfo - linux_szplatform),
349131554Stjr	    linux_szplatform);
350131554Stjr
351131554Stjr	if (execpath_len != 0) {
352131554Stjr		imgp->execpathp = (uintptr_t)arginfo -
353131554Stjr		linux_szplatform - execpath_len;
354131554Stjr		copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len);
355131554Stjr	}
356131554Stjr
357131554Stjr	/*
358131554Stjr	 * Prepare the canary for SSP.
359131554Stjr	 */
360131554Stjr	arc4rand(canary, sizeof(canary), 0);
361131554Stjr	imgp->canary = (uintptr_t)arginfo - linux_szplatform -
362131554Stjr	    roundup(execpath_len, sizeof(char *)) -
363131554Stjr	    roundup(sizeof(canary), sizeof(char *));
364131554Stjr	copyout(canary, (void *)imgp->canary, sizeof(canary));
365131554Stjr
366131554Stjr	/*
367131554Stjr	 * If we have a valid auxargs ptr, prepare some room
368131554Stjr	 * on the stack.
369131554Stjr	 */
370131554Stjr	if (imgp->auxargs) {
371131554Stjr		/*
372131554Stjr		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
373131554Stjr		 * lower compatibility.
374131554Stjr		 */
375131554Stjr		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
376131554Stjr		    (LINUX_AT_COUNT * 2);
377131554Stjr		/*
378131554Stjr		 * The '+ 2' is for the null pointers at the end of each of
379131554Stjr		 * the arg and env vector sets,and imgp->auxarg_size is room
380131554Stjr		 * for argument of Runtime loader.
381131554Stjr		 */
382131554Stjr		vectp = (char **)(destp - (imgp->args->argc +
383131554Stjr		    imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *));
384131554Stjr	} else {
385131554Stjr		/*
386131554Stjr		 * The '+ 2' is for the null pointers at the end of each of
387131554Stjr		 * the arg and env vector sets
388131554Stjr		 */
389131554Stjr		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
390131554Stjr		    sizeof(char *));
391131554Stjr	}
392131554Stjr
393131554Stjr	/*
394131554Stjr	 * vectp also becomes our initial stack base
395131554Stjr	 */
396131554Stjr	stack_base = (register_t *)vectp;
397131554Stjr
398131554Stjr	stringp = imgp->args->begin_argv;
399131554Stjr	argc = imgp->args->argc;
400131554Stjr	envc = imgp->args->envc;
401131554Stjr
402131554Stjr	/*
403131554Stjr	 * Copy out strings - arguments and environment.
404131554Stjr	 */
405131554Stjr	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
406131554Stjr
407131554Stjr	/*
408131554Stjr	 * Fill in "ps_strings" struct for ps, w, etc.
409131554Stjr	 */
410131554Stjr	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
411131554Stjr	suword(&arginfo->ps_nargvstr, argc);
412131554Stjr
413131554Stjr	/*
414131554Stjr	 * Fill in argument portion of vector table.
415131554Stjr	 */
416131554Stjr	for (; argc > 0; --argc) {
417131554Stjr		suword(vectp++, (long)(intptr_t)destp);
418131554Stjr		while (*stringp++ != 0)
419131554Stjr			destp++;
420131554Stjr		destp++;
421131554Stjr	}
422131554Stjr
423131554Stjr	/* a null vector table pointer separates the argp's from the envp's */
424131554Stjr	suword(vectp++, 0);
425131554Stjr
426131554Stjr	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
427131554Stjr	suword(&arginfo->ps_nenvstr, envc);
428131554Stjr
429131554Stjr	/*
430131554Stjr	 * Fill in environment portion of vector table.
431131554Stjr	 */
432131554Stjr	for (; envc > 0; --envc) {
433131554Stjr		suword(vectp++, (long)(intptr_t)destp);
434131554Stjr		while (*stringp++ != 0)
435131554Stjr			destp++;
436131554Stjr		destp++;
437131554Stjr	}
438131554Stjr
439131554Stjr	/* end of vector table is a null pointer */
440131554Stjr	suword(vectp, 0);
441131554Stjr
442131554Stjr	return (stack_base);
443131554Stjr}
444131554Stjr
445131554Stjrstatic void
446131554Stjrlinux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
447131554Stjr{
448131554Stjr	struct thread *td = curthread;
449131554Stjr	struct proc *p = td->td_proc;
450131554Stjr	struct sigacts *psp;
451131554Stjr	struct trapframe *regs;
452131554Stjr	struct l_rt_sigframe *fp, frame;
453131554Stjr	int sig, code;
454131554Stjr	int oonstack;
455131554Stjr
456131554Stjr	sig = ksi->ksi_signo;
457131554Stjr	code = ksi->ksi_code;
458131554Stjr	PROC_LOCK_ASSERT(p, MA_OWNED);
459131554Stjr	psp = p->p_sigacts;
460131554Stjr	mtx_assert(&psp->ps_mtx, MA_OWNED);
461131554Stjr	regs = td->td_frame;
462131554Stjr	oonstack = sigonstack(regs->tf_esp);
463131554Stjr
464131554Stjr#ifdef DEBUG
465131554Stjr	if (ldebug(rt_sendsig))
466131554Stjr		printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
467131554Stjr		    catcher, sig, (void*)mask, code);
468131554Stjr#endif
469131554Stjr	/*
470131554Stjr	 * Allocate space for the signal handler context.
471131554Stjr	 */
472131554Stjr	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
473131554Stjr	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
474131554Stjr		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
475131554Stjr		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
476131554Stjr	} else
477131554Stjr		fp = (struct l_rt_sigframe *)regs->tf_esp - 1;
478131554Stjr	mtx_unlock(&psp->ps_mtx);
479131554Stjr
480131554Stjr	/*
481131554Stjr	 * Build the argument list for the signal handler.
482131554Stjr	 */
483131554Stjr	if (p->p_sysent->sv_sigtbl)
484131554Stjr		if (sig <= p->p_sysent->sv_sigsize)
485131554Stjr			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
486131554Stjr
487131554Stjr	bzero(&frame, sizeof(frame));
488131554Stjr
489131554Stjr	frame.sf_handler = catcher;
490131554Stjr	frame.sf_sig = sig;
491131554Stjr	frame.sf_siginfo = &fp->sf_si;
492131554Stjr	frame.sf_ucontext = &fp->sf_sc;
493131554Stjr
494131554Stjr	/* Fill in POSIX parts */
495131554Stjr	ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
496131554Stjr
497131554Stjr	/*
498131554Stjr	 * Build the signal context to be used by sigreturn.
499131554Stjr	 */
500131554Stjr	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
501131554Stjr	frame.sf_sc.uc_link = NULL;		/* XXX ??? */
502131554Stjr
503131554Stjr	frame.sf_sc.uc_stack.ss_sp = td->td_sigstk.ss_sp;
504131554Stjr	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
505131554Stjr	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
506131554Stjr	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
507131554Stjr	PROC_UNLOCK(p);
508131554Stjr
509131554Stjr	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
510131554Stjr
511131554Stjr	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
512131554Stjr	frame.sf_sc.uc_mcontext.sc_gs     = rgs();
513131554Stjr	frame.sf_sc.uc_mcontext.sc_fs     = regs->tf_fs;
514131554Stjr	frame.sf_sc.uc_mcontext.sc_es     = regs->tf_es;
515131554Stjr	frame.sf_sc.uc_mcontext.sc_ds     = regs->tf_ds;
516131554Stjr	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_edi;
517131554Stjr	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_esi;
518131554Stjr	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_ebp;
519131554Stjr	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_ebx;
520131554Stjr	frame.sf_sc.uc_mcontext.sc_esp    = regs->tf_esp;
521131554Stjr	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_edx;
522131554Stjr	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_ecx;
523131554Stjr	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_eax;
524131554Stjr	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_eip;
525131554Stjr	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
526131554Stjr	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags;
527131554Stjr	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp;
528131554Stjr	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
529131554Stjr	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
530131554Stjr	frame.sf_sc.uc_mcontext.sc_cr2    = (register_t)ksi->ksi_addr;
531131554Stjr	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
532131554Stjr
533131554Stjr#ifdef DEBUG
534131554Stjr	if (ldebug(rt_sendsig))
535131554Stjr		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
536131554Stjr		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
537131554Stjr		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
538131554Stjr#endif
539131554Stjr
540131554Stjr	if (copyout(&frame, fp, sizeof(frame)) != 0) {
541131554Stjr		/*
542131554Stjr		 * Process has trashed its stack; give it an illegal
543131554Stjr		 * instruction to halt it in its tracks.
544131554Stjr		 */
545131554Stjr#ifdef DEBUG
546131554Stjr		if (ldebug(rt_sendsig))
547131554Stjr			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
548131554Stjr			    fp, oonstack);
549131554Stjr#endif
550131554Stjr		PROC_LOCK(p);
551131554Stjr		sigexit(td, SIGILL);
552131554Stjr	}
553131554Stjr
554131554Stjr	/*
555131554Stjr	 * Build context to run handler in.
556131554Stjr	 */
557131554Stjr	regs->tf_esp = (int)fp;
558131554Stjr	regs->tf_eip = linux_rt_sigcode;
559131554Stjr	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
560131554Stjr	regs->tf_cs = _ucodesel;
561131554Stjr	regs->tf_ds = _udatasel;
562131554Stjr	regs->tf_es = _udatasel;
563131554Stjr	regs->tf_fs = _udatasel;
564131554Stjr	regs->tf_ss = _udatasel;
565131554Stjr	PROC_LOCK(p);
566131554Stjr	mtx_lock(&psp->ps_mtx);
567131554Stjr}
568131554Stjr
569131554Stjr
570131554Stjr/*
571131554Stjr * Send an interrupt to process.
572131554Stjr *
573131554Stjr * Stack is set up to allow sigcode stored
574131554Stjr * in u. to call routine, followed by kcall
575131554Stjr * to sigreturn routine below.  After sigreturn
576131554Stjr * resets the signal mask, the stack, and the
577131554Stjr * frame pointer, it returns to the user
578131554Stjr * specified pc, psl.
579131554Stjr */
580131554Stjrstatic void
581131554Stjrlinux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
582131554Stjr{
583131554Stjr	struct thread *td = curthread;
584131554Stjr	struct proc *p = td->td_proc;
585131554Stjr	struct sigacts *psp;
586131554Stjr	struct trapframe *regs;
587131554Stjr	struct l_sigframe *fp, frame;
588131554Stjr	l_sigset_t lmask;
589131554Stjr	int sig, code;
590131554Stjr	int oonstack, i;
591131554Stjr
592131554Stjr	PROC_LOCK_ASSERT(p, MA_OWNED);
593131554Stjr	psp = p->p_sigacts;
594131554Stjr	sig = ksi->ksi_signo;
595131554Stjr	code = ksi->ksi_code;
596131554Stjr	mtx_assert(&psp->ps_mtx, MA_OWNED);
597131554Stjr	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
598131554Stjr		/* Signal handler installed with SA_SIGINFO. */
599131554Stjr		linux_rt_sendsig(catcher, ksi, mask);
600131554Stjr		return;
601131554Stjr	}
602131554Stjr	regs = td->td_frame;
603131554Stjr	oonstack = sigonstack(regs->tf_esp);
604131554Stjr
605131554Stjr#ifdef DEBUG
606131554Stjr	if (ldebug(sendsig))
607131554Stjr		printf(ARGS(sendsig, "%p, %d, %p, %u"),
608131554Stjr		    catcher, sig, (void*)mask, code);
609131554Stjr#endif
610131554Stjr
611131554Stjr	/*
612131554Stjr	 * Allocate space for the signal handler context.
613131554Stjr	 */
614	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
615	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
616		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
617		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
618	} else
619		fp = (struct l_sigframe *)regs->tf_esp - 1;
620	mtx_unlock(&psp->ps_mtx);
621	PROC_UNLOCK(p);
622
623	/*
624	 * Build the argument list for the signal handler.
625	 */
626	if (p->p_sysent->sv_sigtbl)
627		if (sig <= p->p_sysent->sv_sigsize)
628			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
629
630	bzero(&frame, sizeof(frame));
631
632	frame.sf_handler = catcher;
633	frame.sf_sig = sig;
634
635	bsd_to_linux_sigset(mask, &lmask);
636
637	/*
638	 * Build the signal context to be used by sigreturn.
639	 */
640	frame.sf_sc.sc_mask   = lmask.__bits[0];
641	frame.sf_sc.sc_gs     = rgs();
642	frame.sf_sc.sc_fs     = regs->tf_fs;
643	frame.sf_sc.sc_es     = regs->tf_es;
644	frame.sf_sc.sc_ds     = regs->tf_ds;
645	frame.sf_sc.sc_edi    = regs->tf_edi;
646	frame.sf_sc.sc_esi    = regs->tf_esi;
647	frame.sf_sc.sc_ebp    = regs->tf_ebp;
648	frame.sf_sc.sc_ebx    = regs->tf_ebx;
649	frame.sf_sc.sc_esp    = regs->tf_esp;
650	frame.sf_sc.sc_edx    = regs->tf_edx;
651	frame.sf_sc.sc_ecx    = regs->tf_ecx;
652	frame.sf_sc.sc_eax    = regs->tf_eax;
653	frame.sf_sc.sc_eip    = regs->tf_eip;
654	frame.sf_sc.sc_cs     = regs->tf_cs;
655	frame.sf_sc.sc_eflags = regs->tf_eflags;
656	frame.sf_sc.sc_esp_at_signal = regs->tf_esp;
657	frame.sf_sc.sc_ss     = regs->tf_ss;
658	frame.sf_sc.sc_err    = regs->tf_err;
659	frame.sf_sc.sc_cr2    = (register_t)ksi->ksi_addr;
660	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(ksi->ksi_trapno);
661
662	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
663		frame.sf_extramask[i] = lmask.__bits[i+1];
664
665	if (copyout(&frame, fp, sizeof(frame)) != 0) {
666		/*
667		 * Process has trashed its stack; give it an illegal
668		 * instruction to halt it in its tracks.
669		 */
670		PROC_LOCK(p);
671		sigexit(td, SIGILL);
672	}
673
674	/*
675	 * Build context to run handler in.
676	 */
677	regs->tf_esp = (int)fp;
678	regs->tf_eip = linux_sigcode;
679	regs->tf_eflags &= ~(PSL_T | PSL_VM | PSL_D);
680	regs->tf_cs = _ucodesel;
681	regs->tf_ds = _udatasel;
682	regs->tf_es = _udatasel;
683	regs->tf_fs = _udatasel;
684	regs->tf_ss = _udatasel;
685	PROC_LOCK(p);
686	mtx_lock(&psp->ps_mtx);
687}
688
689/*
690 * System call to cleanup state after a signal
691 * has been taken.  Reset signal mask and
692 * stack state from context left by sendsig (above).
693 * Return to previous pc and psl as specified by
694 * context left by sendsig. Check carefully to
695 * make sure that the user has not modified the
696 * psl to gain improper privileges or to cause
697 * a machine fault.
698 */
699int
700linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
701{
702	struct l_sigframe frame;
703	struct trapframe *regs;
704	l_sigset_t lmask;
705	sigset_t bmask;
706	int eflags, i;
707	ksiginfo_t ksi;
708
709	regs = td->td_frame;
710
711#ifdef DEBUG
712	if (ldebug(sigreturn))
713		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
714#endif
715	/*
716	 * The trampoline code hands us the sigframe.
717	 * It is unsafe to keep track of it ourselves, in the event that a
718	 * program jumps out of a signal handler.
719	 */
720	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
721		return (EFAULT);
722
723	/*
724	 * Check for security violations.
725	 */
726#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
727	eflags = frame.sf_sc.sc_eflags;
728	if (!EFLAGS_SECURE(eflags, regs->tf_eflags))
729		return (EINVAL);
730
731	/*
732	 * Don't allow users to load a valid privileged %cs.  Let the
733	 * hardware check for invalid selectors, excess privilege in
734	 * other selectors, invalid %eip's and invalid %esp's.
735	 */
736#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
737	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
738		ksiginfo_init_trap(&ksi);
739		ksi.ksi_signo = SIGBUS;
740		ksi.ksi_code = BUS_OBJERR;
741		ksi.ksi_trapno = T_PROTFLT;
742		ksi.ksi_addr = (void *)regs->tf_eip;
743		trapsignal(td, &ksi);
744		return (EINVAL);
745	}
746
747	lmask.__bits[0] = frame.sf_sc.sc_mask;
748	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
749		lmask.__bits[i+1] = frame.sf_extramask[i];
750	linux_to_bsd_sigset(&lmask, &bmask);
751	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
752
753	/*
754	 * Restore signal context.
755	 */
756	/* %gs was restored by the trampoline. */
757	regs->tf_fs     = frame.sf_sc.sc_fs;
758	regs->tf_es     = frame.sf_sc.sc_es;
759	regs->tf_ds     = frame.sf_sc.sc_ds;
760	regs->tf_edi    = frame.sf_sc.sc_edi;
761	regs->tf_esi    = frame.sf_sc.sc_esi;
762	regs->tf_ebp    = frame.sf_sc.sc_ebp;
763	regs->tf_ebx    = frame.sf_sc.sc_ebx;
764	regs->tf_edx    = frame.sf_sc.sc_edx;
765	regs->tf_ecx    = frame.sf_sc.sc_ecx;
766	regs->tf_eax    = frame.sf_sc.sc_eax;
767	regs->tf_eip    = frame.sf_sc.sc_eip;
768	regs->tf_cs     = frame.sf_sc.sc_cs;
769	regs->tf_eflags = eflags;
770	regs->tf_esp    = frame.sf_sc.sc_esp_at_signal;
771	regs->tf_ss     = frame.sf_sc.sc_ss;
772
773	return (EJUSTRETURN);
774}
775
776/*
777 * System call to cleanup state after a signal
778 * has been taken.  Reset signal mask and
779 * stack state from context left by rt_sendsig (above).
780 * Return to previous pc and psl as specified by
781 * context left by sendsig. Check carefully to
782 * make sure that the user has not modified the
783 * psl to gain improper privileges or to cause
784 * a machine fault.
785 */
786int
787linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
788{
789	struct l_ucontext uc;
790	struct l_sigcontext *context;
791	sigset_t bmask;
792	l_stack_t *lss;
793	stack_t ss;
794	struct trapframe *regs;
795	int eflags;
796	ksiginfo_t ksi;
797
798	regs = td->td_frame;
799
800#ifdef DEBUG
801	if (ldebug(rt_sigreturn))
802		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
803#endif
804	/*
805	 * The trampoline code hands us the ucontext.
806	 * It is unsafe to keep track of it ourselves, in the event that a
807	 * program jumps out of a signal handler.
808	 */
809	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
810		return (EFAULT);
811
812	context = &uc.uc_mcontext;
813
814	/*
815	 * Check for security violations.
816	 */
817#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
818	eflags = context->sc_eflags;
819	if (!EFLAGS_SECURE(eflags, regs->tf_eflags))
820		return (EINVAL);
821
822	/*
823	 * Don't allow users to load a valid privileged %cs.  Let the
824	 * hardware check for invalid selectors, excess privilege in
825	 * other selectors, invalid %eip's and invalid %esp's.
826	 */
827#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
828	if (!CS_SECURE(context->sc_cs)) {
829		ksiginfo_init_trap(&ksi);
830		ksi.ksi_signo = SIGBUS;
831		ksi.ksi_code = BUS_OBJERR;
832		ksi.ksi_trapno = T_PROTFLT;
833		ksi.ksi_addr = (void *)regs->tf_eip;
834		trapsignal(td, &ksi);
835		return (EINVAL);
836	}
837
838	linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
839	kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
840
841	/*
842	 * Restore signal context
843	 */
844	/* %gs was restored by the trampoline. */
845	regs->tf_fs     = context->sc_fs;
846	regs->tf_es     = context->sc_es;
847	regs->tf_ds     = context->sc_ds;
848	regs->tf_edi    = context->sc_edi;
849	regs->tf_esi    = context->sc_esi;
850	regs->tf_ebp    = context->sc_ebp;
851	regs->tf_ebx    = context->sc_ebx;
852	regs->tf_edx    = context->sc_edx;
853	regs->tf_ecx    = context->sc_ecx;
854	regs->tf_eax    = context->sc_eax;
855	regs->tf_eip    = context->sc_eip;
856	regs->tf_cs     = context->sc_cs;
857	regs->tf_eflags = eflags;
858	regs->tf_esp    = context->sc_esp_at_signal;
859	regs->tf_ss     = context->sc_ss;
860
861	/*
862	 * call sigaltstack & ignore results..
863	 */
864	lss = &uc.uc_stack;
865	ss.ss_sp = lss->ss_sp;
866	ss.ss_size = lss->ss_size;
867	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
868
869#ifdef DEBUG
870	if (ldebug(rt_sigreturn))
871		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"),
872		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
873#endif
874	(void)kern_sigaltstack(td, &ss, NULL);
875
876	return (EJUSTRETURN);
877}
878
879static int
880linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
881{
882	struct proc *p;
883	struct trapframe *frame;
884
885	p = td->td_proc;
886	frame = td->td_frame;
887
888	sa->code = frame->tf_eax;
889	sa->args[0] = frame->tf_ebx;
890	sa->args[1] = frame->tf_ecx;
891	sa->args[2] = frame->tf_edx;
892	sa->args[3] = frame->tf_esi;
893	sa->args[4] = frame->tf_edi;
894	sa->args[5] = frame->tf_ebp;	/* Unconfirmed */
895
896	if (sa->code >= p->p_sysent->sv_size)
897		sa->callp = &p->p_sysent->sv_table[0];
898 	else
899 		sa->callp = &p->p_sysent->sv_table[sa->code];
900	sa->narg = sa->callp->sy_narg;
901
902	td->td_retval[0] = 0;
903	td->td_retval[1] = frame->tf_edx;
904
905	return (0);
906}
907
908/*
909 * If a linux binary is exec'ing something, try this image activator
910 * first.  We override standard shell script execution in order to
911 * be able to modify the interpreter path.  We only do this if a linux
912 * binary is doing the exec, so we do not create an EXEC module for it.
913 */
914static int	exec_linux_imgact_try(struct image_params *iparams);
915
916static int
917exec_linux_imgact_try(struct image_params *imgp)
918{
919    const char *head = (const char *)imgp->image_header;
920    char *rpath;
921    int error = -1;
922
923    /*
924     * The interpreter for shell scripts run from a linux binary needs
925     * to be located in /compat/linux if possible in order to recursively
926     * maintain linux path emulation.
927     */
928    if (((const short *)head)[0] == SHELLMAGIC) {
929	    /*
930	     * Run our normal shell image activator.  If it succeeds attempt
931	     * to use the alternate path for the interpreter.  If an alternate
932	     * path is found, use our stringspace to store it.
933	     */
934	    if ((error = exec_shell_imgact(imgp)) == 0) {
935		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
936			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0, AT_FDCWD);
937		    if (rpath != NULL)
938			    imgp->args->fname_buf =
939				imgp->interpreter_name = rpath;
940	    }
941    }
942    return (error);
943}
944
945/*
946 * exec_setregs may initialize some registers differently than Linux
947 * does, thus potentially confusing Linux binaries. If necessary, we
948 * override the exec_setregs default(s) here.
949 */
950static void
951exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
952{
953	struct pcb *pcb = td->td_pcb;
954
955	exec_setregs(td, imgp, stack);
956
957	/* Linux sets %gs to 0, we default to _udatasel */
958	pcb->pcb_gs = 0;
959	load_gs(0);
960
961	pcb->pcb_initial_npxcw = __LINUX_NPXCW__;
962}
963
964static void
965linux_get_machine(const char **dst)
966{
967
968	switch (cpu_class) {
969	case CPUCLASS_686:
970		*dst = "i686";
971		break;
972	case CPUCLASS_586:
973		*dst = "i586";
974		break;
975	case CPUCLASS_486:
976		*dst = "i486";
977		break;
978	default:
979		*dst = "i386";
980	}
981}
982
983struct sysentvec linux_sysvec = {
984	.sv_size	= LINUX_SYS_MAXSYSCALL,
985	.sv_table	= linux_sysent,
986	.sv_mask	= 0,
987	.sv_sigsize	= LINUX_SIGTBLSZ,
988	.sv_sigtbl	= bsd_to_linux_signal,
989	.sv_errsize	= ELAST + 1,
990	.sv_errtbl	= bsd_to_linux_errno,
991	.sv_transtrap	= translate_traps,
992	.sv_fixup	= linux_fixup,
993	.sv_sendsig	= linux_sendsig,
994	.sv_sigcode	= &_binary_linux_locore_o_start,
995	.sv_szsigcode	= &linux_szsigcode,
996	.sv_prepsyscall	= NULL,
997	.sv_name	= "Linux a.out",
998	.sv_coredump	= NULL,
999	.sv_imgact_try	= exec_linux_imgact_try,
1000	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1001	.sv_pagesize	= PAGE_SIZE,
1002	.sv_minuser	= VM_MIN_ADDRESS,
1003	.sv_maxuser	= VM_MAXUSER_ADDRESS,
1004	.sv_usrstack	= LINUX_USRSTACK,
1005	.sv_psstrings	= PS_STRINGS,
1006	.sv_stackprot	= VM_PROT_ALL,
1007	.sv_copyout_strings = exec_copyout_strings,
1008	.sv_setregs	= exec_linux_setregs,
1009	.sv_fixlimit	= NULL,
1010	.sv_maxssiz	= NULL,
1011	.sv_flags	= SV_ABI_LINUX | SV_AOUT | SV_IA32 | SV_ILP32,
1012	.sv_set_syscall_retval = cpu_set_syscall_retval,
1013	.sv_fetch_syscall_args = linux_fetch_syscall_args,
1014	.sv_syscallnames = NULL,
1015	.sv_shared_page_base = LINUX_SHAREDPAGE,
1016	.sv_shared_page_len = PAGE_SIZE,
1017	.sv_schedtail	= linux_schedtail,
1018	.sv_thread_detach = linux_thread_detach,
1019};
1020INIT_SYSENTVEC(aout_sysvec, &linux_sysvec);
1021
1022struct sysentvec elf_linux_sysvec = {
1023	.sv_size	= LINUX_SYS_MAXSYSCALL,
1024	.sv_table	= linux_sysent,
1025	.sv_mask	= 0,
1026	.sv_sigsize	= LINUX_SIGTBLSZ,
1027	.sv_sigtbl	= bsd_to_linux_signal,
1028	.sv_errsize	= ELAST + 1,
1029	.sv_errtbl	= bsd_to_linux_errno,
1030	.sv_transtrap	= translate_traps,
1031	.sv_fixup	= elf_linux_fixup,
1032	.sv_sendsig	= linux_sendsig,
1033	.sv_sigcode	= &_binary_linux_locore_o_start,
1034	.sv_szsigcode	= &linux_szsigcode,
1035	.sv_prepsyscall	= NULL,
1036	.sv_name	= "Linux ELF",
1037	.sv_coredump	= elf32_coredump,
1038	.sv_imgact_try	= exec_linux_imgact_try,
1039	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
1040	.sv_pagesize	= PAGE_SIZE,
1041	.sv_minuser	= VM_MIN_ADDRESS,
1042	.sv_maxuser	= VM_MAXUSER_ADDRESS,
1043	.sv_usrstack	= LINUX_USRSTACK,
1044	.sv_psstrings	= LINUX_PS_STRINGS,
1045	.sv_stackprot	= VM_PROT_ALL,
1046	.sv_copyout_strings = linux_copyout_strings,
1047	.sv_setregs	= exec_linux_setregs,
1048	.sv_fixlimit	= NULL,
1049	.sv_maxssiz	= NULL,
1050	.sv_flags	= SV_ABI_LINUX | SV_IA32 | SV_ILP32 | SV_SHP,
1051	.sv_set_syscall_retval = cpu_set_syscall_retval,
1052	.sv_fetch_syscall_args = linux_fetch_syscall_args,
1053	.sv_syscallnames = NULL,
1054	.sv_shared_page_base = LINUX_SHAREDPAGE,
1055	.sv_shared_page_len = PAGE_SIZE,
1056	.sv_schedtail	= linux_schedtail,
1057	.sv_thread_detach = linux_thread_detach,
1058};
1059
1060static void
1061linux_vdso_install(void *param)
1062{
1063
1064	linux_szsigcode = (&_binary_linux_locore_o_end -
1065	    &_binary_linux_locore_o_start);
1066
1067	if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1068		panic("Linux invalid vdso size\n");
1069
1070	__elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1071
1072	linux_shared_page_obj = __elfN(linux_shared_page_init)
1073	    (&linux_shared_page_mapping);
1074
1075	__elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX_SHAREDPAGE);
1076
1077	bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1078	    linux_szsigcode);
1079	elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1080}
1081SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1082    (sysinit_cfunc_t)linux_vdso_install, NULL);
1083
1084static void
1085linux_vdso_deinstall(void *param)
1086{
1087
1088	__elfN(linux_shared_page_fini)(linux_shared_page_obj);
1089};
1090SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1091    (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
1092
1093static char GNU_ABI_VENDOR[] = "GNU";
1094static int GNULINUX_ABI_DESC = 0;
1095
1096static boolean_t
1097linux_trans_osrel(const Elf_Note *note, int32_t *osrel)
1098{
1099	const Elf32_Word *desc;
1100	uintptr_t p;
1101
1102	p = (uintptr_t)(note + 1);
1103	p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1104
1105	desc = (const Elf32_Word *)p;
1106	if (desc[0] != GNULINUX_ABI_DESC)
1107		return (FALSE);
1108
1109	/*
1110	 * For linux we encode osrel as follows (see linux_mib.c):
1111	 * VVVMMMIII (version, major, minor), see linux_mib.c.
1112	 */
1113	*osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
1114
1115	return (TRUE);
1116}
1117
1118static Elf_Brandnote linux_brandnote = {
1119	.hdr.n_namesz	= sizeof(GNU_ABI_VENDOR),
1120	.hdr.n_descsz	= 16,	/* XXX at least 16 */
1121	.hdr.n_type	= 1,
1122	.vendor		= GNU_ABI_VENDOR,
1123	.flags		= BN_TRANSLATE_OSREL,
1124	.trans_osrel	= linux_trans_osrel
1125};
1126
1127static Elf32_Brandinfo linux_brand = {
1128	.brand		= ELFOSABI_LINUX,
1129	.machine	= EM_386,
1130	.compat_3_brand	= "Linux",
1131	.emul_path	= "/compat/linux",
1132	.interp_path	= "/lib/ld-linux.so.1",
1133	.sysvec		= &elf_linux_sysvec,
1134	.interp_newpath	= NULL,
1135	.brand_note	= &linux_brandnote,
1136	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1137};
1138
1139static Elf32_Brandinfo linux_glibc2brand = {
1140	.brand		= ELFOSABI_LINUX,
1141	.machine	= EM_386,
1142	.compat_3_brand	= "Linux",
1143	.emul_path	= "/compat/linux",
1144	.interp_path	= "/lib/ld-linux.so.2",
1145	.sysvec		= &elf_linux_sysvec,
1146	.interp_newpath	= NULL,
1147	.brand_note	= &linux_brandnote,
1148	.flags		= BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1149};
1150
1151Elf32_Brandinfo *linux_brandlist[] = {
1152	&linux_brand,
1153	&linux_glibc2brand,
1154	NULL
1155};
1156
1157static int
1158linux_elf_modevent(module_t mod, int type, void *data)
1159{
1160	Elf32_Brandinfo **brandinfo;
1161	int error;
1162	struct linux_ioctl_handler **lihp;
1163
1164	error = 0;
1165
1166	switch(type) {
1167	case MOD_LOAD:
1168		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1169		     ++brandinfo)
1170			if (elf32_insert_brand_entry(*brandinfo) < 0)
1171				error = EINVAL;
1172		if (error == 0) {
1173			SET_FOREACH(lihp, linux_ioctl_handler_set)
1174				linux_ioctl_register_handler(*lihp);
1175			LIST_INIT(&futex_list);
1176			mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1177			linux_exit_tag = EVENTHANDLER_REGISTER(process_exit, linux_proc_exit,
1178			      NULL, 1000);
1179			linux_exec_tag = EVENTHANDLER_REGISTER(process_exec, linux_proc_exec,
1180			      NULL, 1000);
1181			linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
1182			    linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
1183			linux_get_machine(&linux_kplatform);
1184			linux_szplatform = roundup(strlen(linux_kplatform) + 1,
1185			    sizeof(char *));
1186			linux_osd_jail_register();
1187			stclohz = (stathz ? stathz : hz);
1188			if (bootverbose)
1189				printf("Linux ELF exec handler installed\n");
1190		} else
1191			printf("cannot insert Linux ELF brand handler\n");
1192		break;
1193	case MOD_UNLOAD:
1194		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1195		     ++brandinfo)
1196			if (elf32_brand_inuse(*brandinfo))
1197				error = EBUSY;
1198		if (error == 0) {
1199			for (brandinfo = &linux_brandlist[0];
1200			     *brandinfo != NULL; ++brandinfo)
1201				if (elf32_remove_brand_entry(*brandinfo) < 0)
1202					error = EINVAL;
1203		}
1204		if (error == 0) {
1205			SET_FOREACH(lihp, linux_ioctl_handler_set)
1206				linux_ioctl_unregister_handler(*lihp);
1207			mtx_destroy(&futex_mtx);
1208			EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1209			EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1210			EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag);
1211			linux_osd_jail_deregister();
1212			if (bootverbose)
1213				printf("Linux ELF exec handler removed\n");
1214		} else
1215			printf("Could not deinstall ELF interpreter entry\n");
1216		break;
1217	default:
1218		return (EOPNOTSUPP);
1219	}
1220	return (error);
1221}
1222
1223static moduledata_t linux_elf_mod = {
1224	"linuxelf",
1225	linux_elf_modevent,
1226	0
1227};
1228
1229DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
1230