linux32_machdep.c revision 293482
1250551Sjeff/*-
2250551Sjeff * Copyright (c) 2004 Tim J. Robbins
3250551Sjeff * Copyright (c) 2002 Doug Rabson
4250551Sjeff * Copyright (c) 2000 Marcel Moolenaar
5250551Sjeff * All rights reserved.
6250551Sjeff *
7250551Sjeff * Redistribution and use in source and binary forms, with or without
8250551Sjeff * modification, are permitted provided that the following conditions
9250551Sjeff * are met:
10250551Sjeff * 1. Redistributions of source code must retain the above copyright
11250551Sjeff *    notice, this list of conditions and the following disclaimer
12250551Sjeff *    in this position and unchanged.
13250551Sjeff * 2. Redistributions in binary form must reproduce the above copyright
14250551Sjeff *    notice, this list of conditions and the following disclaimer in the
15250551Sjeff *    documentation and/or other materials provided with the distribution.
16250551Sjeff * 3. The name of the author may not be used to endorse or promote products
17250551Sjeff *    derived from this software without specific prior written permission.
18250551Sjeff *
19250551Sjeff * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20250551Sjeff * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21250551Sjeff * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22250551Sjeff * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23250551Sjeff * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24250551Sjeff * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25250551Sjeff * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26250551Sjeff * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27250551Sjeff * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28250551Sjeff * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29250551Sjeff */
30250551Sjeff
31250551Sjeff#include <sys/cdefs.h>
32250551Sjeff__FBSDID("$FreeBSD: stable/10/sys/amd64/linux32/linux32_machdep.c 293482 2016-01-09 14:40:38Z dchagin $");
33250551Sjeff
34250551Sjeff#include <sys/param.h>
35250551Sjeff#include <sys/kernel.h>
36250551Sjeff#include <sys/systm.h>
37250551Sjeff#include <sys/capsicum.h>
38250551Sjeff#include <sys/file.h>
39250551Sjeff#include <sys/fcntl.h>
40250551Sjeff#include <sys/clock.h>
41250578Sjeff#include <sys/imgact.h>
42250578Sjeff#include <sys/limits.h>
43250578Sjeff#include <sys/lock.h>
44250578Sjeff#include <sys/malloc.h>
45250578Sjeff#include <sys/mman.h>
46250551Sjeff#include <sys/mutex.h>
47250551Sjeff#include <sys/priv.h>
48250551Sjeff#include <sys/proc.h>
49250551Sjeff#include <sys/resource.h>
50250551Sjeff#include <sys/resourcevar.h>
51250551Sjeff#include <sys/sched.h>
52250551Sjeff#include <sys/syscallsubr.h>
53250551Sjeff#include <sys/sysproto.h>
54250551Sjeff#include <sys/unistd.h>
55250551Sjeff#include <sys/wait.h>
56250551Sjeff
57250551Sjeff#include <machine/frame.h>
58250551Sjeff#include <machine/pcb.h>
59250551Sjeff#include <machine/psl.h>
60250551Sjeff#include <machine/segments.h>
61250551Sjeff#include <machine/specialreg.h>
62250551Sjeff
63250551Sjeff#include <vm/vm.h>
64250551Sjeff#include <vm/pmap.h>
65250551Sjeff#include <vm/vm_map.h>
66250551Sjeff
67250551Sjeff#include <compat/freebsd32/freebsd32_util.h>
68250551Sjeff#include <amd64/linux32/linux.h>
69250551Sjeff#include <amd64/linux32/linux32_proto.h>
70250551Sjeff#include <compat/linux/linux_ipc.h>
71250551Sjeff#include <compat/linux/linux_misc.h>
72250551Sjeff#include <compat/linux/linux_signal.h>
73250551Sjeff#include <compat/linux/linux_util.h>
74250551Sjeff#include <compat/linux/linux_emul.h>
75250551Sjeff
76250551Sjeffstruct l_old_select_argv {
77250551Sjeff	l_int		nfds;
78250551Sjeff	l_uintptr_t	readfds;
79250551Sjeff	l_uintptr_t	writefds;
80250551Sjeff	l_uintptr_t	exceptfds;
81250551Sjeff	l_uintptr_t	timeout;
82250551Sjeff} __packed;
83250551Sjeff
84250551Sjeffint
85250551Sjefflinux_to_bsd_sigaltstack(int lsa)
86260266Sdim{
87250551Sjeff	int bsa = 0;
88250551Sjeff
89250551Sjeff	if (lsa & LINUX_SS_DISABLE)
90250551Sjeff		bsa |= SS_DISABLE;
91250551Sjeff	if (lsa & LINUX_SS_ONSTACK)
92250551Sjeff		bsa |= SS_ONSTACK;
93260266Sdim	return (bsa);
94250551Sjeff}
95250551Sjeff
96250551Sjeffstatic int	linux_mmap_common(struct thread *td, l_uintptr_t addr,
97250551Sjeff		    l_size_t len, l_int prot, l_int flags, l_int fd,
98250551Sjeff		    l_loff_t pos);
99250551Sjeff
100250551Sjeffint
101250551Sjeffbsd_to_linux_sigaltstack(int bsa)
102250551Sjeff{
103250551Sjeff	int lsa = 0;
104250551Sjeff
105250551Sjeff	if (bsa & SS_DISABLE)
106250551Sjeff		lsa |= LINUX_SS_DISABLE;
107250551Sjeff	if (bsa & SS_ONSTACK)
108250551Sjeff		lsa |= LINUX_SS_ONSTACK;
109250551Sjeff	return (lsa);
110250551Sjeff}
111250551Sjeff
112250551Sjeffstatic void
113250551Sjeffbsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru)
114250551Sjeff{
115250551Sjeff
116250551Sjeff	lru->ru_utime.tv_sec = ru->ru_utime.tv_sec;
117250551Sjeff	lru->ru_utime.tv_usec = ru->ru_utime.tv_usec;
118250551Sjeff	lru->ru_stime.tv_sec = ru->ru_stime.tv_sec;
119250551Sjeff	lru->ru_stime.tv_usec = ru->ru_stime.tv_usec;
120250551Sjeff	lru->ru_maxrss = ru->ru_maxrss;
121250551Sjeff	lru->ru_ixrss = ru->ru_ixrss;
122250551Sjeff	lru->ru_idrss = ru->ru_idrss;
123250551Sjeff	lru->ru_isrss = ru->ru_isrss;
124	lru->ru_minflt = ru->ru_minflt;
125	lru->ru_majflt = ru->ru_majflt;
126	lru->ru_nswap = ru->ru_nswap;
127	lru->ru_inblock = ru->ru_inblock;
128	lru->ru_oublock = ru->ru_oublock;
129	lru->ru_msgsnd = ru->ru_msgsnd;
130	lru->ru_msgrcv = ru->ru_msgrcv;
131	lru->ru_nsignals = ru->ru_nsignals;
132	lru->ru_nvcsw = ru->ru_nvcsw;
133	lru->ru_nivcsw = ru->ru_nivcsw;
134}
135
136int
137linux_execve(struct thread *td, struct linux_execve_args *args)
138{
139	struct image_args eargs;
140	struct vmspace *oldvmspace;
141	char *path;
142	int error;
143
144	LCONVPATHEXIST(td, args->path, &path);
145
146#ifdef DEBUG
147	if (ldebug(execve))
148		printf(ARGS(execve, "%s"), path);
149#endif
150
151	error = pre_execve(td, &oldvmspace);
152	if (error != 0) {
153		free(path, M_TEMP);
154		return (error);
155	}
156	error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE,
157	    args->argp, args->envp);
158	free(path, M_TEMP);
159	if (error == 0)
160		error = kern_execve(td, &eargs, NULL);
161	if (error == 0) {
162		/* Linux process can execute FreeBSD one, do not attempt
163		 * to create emuldata for such process using
164		 * linux_proc_init, this leads to a panic on KASSERT
165		 * because such process has p->p_emuldata == NULL.
166		 */
167		if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX)
168			error = linux_proc_init(td, 0, 0);
169	}
170	post_execve(td, error, oldvmspace);
171	return (error);
172}
173
174CTASSERT(sizeof(struct l_iovec32) == 8);
175
176static int
177linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop)
178{
179	struct l_iovec32 iov32;
180	struct iovec *iov;
181	struct uio *uio;
182	uint32_t iovlen;
183	int error, i;
184
185	*uiop = NULL;
186	if (iovcnt > UIO_MAXIOV)
187		return (EINVAL);
188	iovlen = iovcnt * sizeof(struct iovec);
189	uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
190	iov = (struct iovec *)(uio + 1);
191	for (i = 0; i < iovcnt; i++) {
192		error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32));
193		if (error) {
194			free(uio, M_IOV);
195			return (error);
196		}
197		iov[i].iov_base = PTRIN(iov32.iov_base);
198		iov[i].iov_len = iov32.iov_len;
199	}
200	uio->uio_iov = iov;
201	uio->uio_iovcnt = iovcnt;
202	uio->uio_segflg = UIO_USERSPACE;
203	uio->uio_offset = -1;
204	uio->uio_resid = 0;
205	for (i = 0; i < iovcnt; i++) {
206		if (iov->iov_len > INT_MAX - uio->uio_resid) {
207			free(uio, M_IOV);
208			return (EINVAL);
209		}
210		uio->uio_resid += iov->iov_len;
211		iov++;
212	}
213	*uiop = uio;
214	return (0);
215}
216
217int
218linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp,
219    int error)
220{
221	struct l_iovec32 iov32;
222	struct iovec *iov;
223	uint32_t iovlen;
224	int i;
225
226	*iovp = NULL;
227	if (iovcnt > UIO_MAXIOV)
228		return (error);
229	iovlen = iovcnt * sizeof(struct iovec);
230	iov = malloc(iovlen, M_IOV, M_WAITOK);
231	for (i = 0; i < iovcnt; i++) {
232		error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32));
233		if (error) {
234			free(iov, M_IOV);
235			return (error);
236		}
237		iov[i].iov_base = PTRIN(iov32.iov_base);
238		iov[i].iov_len = iov32.iov_len;
239	}
240	*iovp = iov;
241	return(0);
242
243}
244
245int
246linux_readv(struct thread *td, struct linux_readv_args *uap)
247{
248	struct uio *auio;
249	int error;
250
251	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
252	if (error)
253		return (error);
254	error = kern_readv(td, uap->fd, auio);
255	free(auio, M_IOV);
256	return (error);
257}
258
259int
260linux_writev(struct thread *td, struct linux_writev_args *uap)
261{
262	struct uio *auio;
263	int error;
264
265	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
266	if (error)
267		return (error);
268	error = kern_writev(td, uap->fd, auio);
269	free(auio, M_IOV);
270	return (error);
271}
272
273struct l_ipc_kludge {
274	l_uintptr_t msgp;
275	l_long msgtyp;
276} __packed;
277
278int
279linux_ipc(struct thread *td, struct linux_ipc_args *args)
280{
281
282	switch (args->what & 0xFFFF) {
283	case LINUX_SEMOP: {
284		struct linux_semop_args a;
285
286		a.semid = args->arg1;
287		a.tsops = args->ptr;
288		a.nsops = args->arg2;
289		return (linux_semop(td, &a));
290	}
291	case LINUX_SEMGET: {
292		struct linux_semget_args a;
293
294		a.key = args->arg1;
295		a.nsems = args->arg2;
296		a.semflg = args->arg3;
297		return (linux_semget(td, &a));
298	}
299	case LINUX_SEMCTL: {
300		struct linux_semctl_args a;
301		int error;
302
303		a.semid = args->arg1;
304		a.semnum = args->arg2;
305		a.cmd = args->arg3;
306		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
307		if (error)
308			return (error);
309		return (linux_semctl(td, &a));
310	}
311	case LINUX_MSGSND: {
312		struct linux_msgsnd_args a;
313
314		a.msqid = args->arg1;
315		a.msgp = args->ptr;
316		a.msgsz = args->arg2;
317		a.msgflg = args->arg3;
318		return (linux_msgsnd(td, &a));
319	}
320	case LINUX_MSGRCV: {
321		struct linux_msgrcv_args a;
322
323		a.msqid = args->arg1;
324		a.msgsz = args->arg2;
325		a.msgflg = args->arg3;
326		if ((args->what >> 16) == 0) {
327			struct l_ipc_kludge tmp;
328			int error;
329
330			if (args->ptr == 0)
331				return (EINVAL);
332			error = copyin(args->ptr, &tmp, sizeof(tmp));
333			if (error)
334				return (error);
335			a.msgp = PTRIN(tmp.msgp);
336			a.msgtyp = tmp.msgtyp;
337		} else {
338			a.msgp = args->ptr;
339			a.msgtyp = args->arg5;
340		}
341		return (linux_msgrcv(td, &a));
342	}
343	case LINUX_MSGGET: {
344		struct linux_msgget_args a;
345
346		a.key = args->arg1;
347		a.msgflg = args->arg2;
348		return (linux_msgget(td, &a));
349	}
350	case LINUX_MSGCTL: {
351		struct linux_msgctl_args a;
352
353		a.msqid = args->arg1;
354		a.cmd = args->arg2;
355		a.buf = args->ptr;
356		return (linux_msgctl(td, &a));
357	}
358	case LINUX_SHMAT: {
359		struct linux_shmat_args a;
360
361		a.shmid = args->arg1;
362		a.shmaddr = args->ptr;
363		a.shmflg = args->arg2;
364		a.raddr = PTRIN((l_uint)args->arg3);
365		return (linux_shmat(td, &a));
366	}
367	case LINUX_SHMDT: {
368		struct linux_shmdt_args a;
369
370		a.shmaddr = args->ptr;
371		return (linux_shmdt(td, &a));
372	}
373	case LINUX_SHMGET: {
374		struct linux_shmget_args a;
375
376		a.key = args->arg1;
377		a.size = args->arg2;
378		a.shmflg = args->arg3;
379		return (linux_shmget(td, &a));
380	}
381	case LINUX_SHMCTL: {
382		struct linux_shmctl_args a;
383
384		a.shmid = args->arg1;
385		a.cmd = args->arg2;
386		a.buf = args->ptr;
387		return (linux_shmctl(td, &a));
388	}
389	default:
390		break;
391	}
392
393	return (EINVAL);
394}
395
396int
397linux_old_select(struct thread *td, struct linux_old_select_args *args)
398{
399	struct l_old_select_argv linux_args;
400	struct linux_select_args newsel;
401	int error;
402
403#ifdef DEBUG
404	if (ldebug(old_select))
405		printf(ARGS(old_select, "%p"), args->ptr);
406#endif
407
408	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
409	if (error)
410		return (error);
411
412	newsel.nfds = linux_args.nfds;
413	newsel.readfds = PTRIN(linux_args.readfds);
414	newsel.writefds = PTRIN(linux_args.writefds);
415	newsel.exceptfds = PTRIN(linux_args.exceptfds);
416	newsel.timeout = PTRIN(linux_args.timeout);
417	return (linux_select(td, &newsel));
418}
419
420int
421linux_set_cloned_tls(struct thread *td, void *desc)
422{
423	struct user_segment_descriptor sd;
424	struct l_user_desc info;
425	struct pcb *pcb;
426	int error;
427	int a[2];
428
429	error = copyin(desc, &info, sizeof(struct l_user_desc));
430	if (error) {
431		printf(LMSG("copyin failed!"));
432	} else {
433		/* We might copy out the entry_number as GUGS32_SEL. */
434		info.entry_number = GUGS32_SEL;
435		error = copyout(&info, desc, sizeof(struct l_user_desc));
436		if (error)
437			printf(LMSG("copyout failed!"));
438
439		a[0] = LINUX_LDT_entry_a(&info);
440		a[1] = LINUX_LDT_entry_b(&info);
441
442		memcpy(&sd, &a, sizeof(a));
443#ifdef DEBUG
444		if (ldebug(clone))
445			printf("Segment created in clone with "
446			    "CLONE_SETTLS: lobase: %x, hibase: %x, "
447			    "lolimit: %x, hilimit: %x, type: %i, "
448			    "dpl: %i, p: %i, xx: %i, long: %i, "
449			    "def32: %i, gran: %i\n", sd.sd_lobase,
450			    sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit,
451			    sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx,
452			    sd.sd_long, sd.sd_def32, sd.sd_gran);
453#endif
454		pcb = td->td_pcb;
455		pcb->pcb_gsbase = (register_t)info.base_addr;
456		td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL);
457		set_pcb_flags(pcb, PCB_32BIT);
458	}
459
460	return (error);
461}
462
463int
464linux_set_upcall_kse(struct thread *td, register_t stack)
465{
466
467	td->td_frame->tf_rsp = stack;
468
469	return (0);
470}
471
/*
 * Sizing constants for LINUX_MAP_GROWSDOWN mappings in linux_mmap_common():
 * requests no larger than (STACK_SIZE - GUARD_SIZE) are widened to exactly
 * that size.
 */
#define STACK_SIZE  (2 * 1024 * 1024)
#define GUARD_SIZE  (4 * PAGE_SIZE)
474
475int
476linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
477{
478
479#ifdef DEBUG
480	if (ldebug(mmap2))
481		printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"),
482		    args->addr, args->len, args->prot,
483		    args->flags, args->fd, args->pgoff);
484#endif
485
486	return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot,
487		args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
488		PAGE_SIZE));
489}
490
491int
492linux_mmap(struct thread *td, struct linux_mmap_args *args)
493{
494	int error;
495	struct l_mmap_argv linux_args;
496
497	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
498	if (error)
499		return (error);
500
501#ifdef DEBUG
502	if (ldebug(mmap))
503		printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"),
504		    linux_args.addr, linux_args.len, linux_args.prot,
505		    linux_args.flags, linux_args.fd, linux_args.pgoff);
506#endif
507
508	return (linux_mmap_common(td, linux_args.addr, linux_args.len,
509	    linux_args.prot, linux_args.flags, linux_args.fd,
510	    (uint32_t)linux_args.pgoff));
511}
512
513static int
514linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
515    l_int flags, l_int fd, l_loff_t pos)
516{
517	struct proc *p = td->td_proc;
518	struct mmap_args /* {
519		caddr_t addr;
520		size_t len;
521		int prot;
522		int flags;
523		int fd;
524		long pad;
525		off_t pos;
526	} */ bsd_args;
527	int error;
528	struct file *fp;
529	cap_rights_t rights;
530
531	error = 0;
532	bsd_args.flags = 0;
533	fp = NULL;
534
535	/*
536	 * Linux mmap(2):
537	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
538	 */
539	if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
540		return (EINVAL);
541
542	if (flags & LINUX_MAP_SHARED)
543		bsd_args.flags |= MAP_SHARED;
544	if (flags & LINUX_MAP_PRIVATE)
545		bsd_args.flags |= MAP_PRIVATE;
546	if (flags & LINUX_MAP_FIXED)
547		bsd_args.flags |= MAP_FIXED;
548	if (flags & LINUX_MAP_ANON) {
549		/* Enforce pos to be on page boundary, then ignore. */
550		if ((pos & PAGE_MASK) != 0)
551			return (EINVAL);
552		pos = 0;
553		bsd_args.flags |= MAP_ANON;
554	} else
555		bsd_args.flags |= MAP_NOSYNC;
556	if (flags & LINUX_MAP_GROWSDOWN)
557		bsd_args.flags |= MAP_STACK;
558
559	/*
560	 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
561	 * on Linux/i386. We do this to ensure maximum compatibility.
562	 * Linux/ia64 does the same in i386 emulation mode.
563	 */
564	bsd_args.prot = prot;
565	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
566		bsd_args.prot |= PROT_READ | PROT_EXEC;
567
568	/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
569	bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
570	if (bsd_args.fd != -1) {
571		/*
572		 * Linux follows Solaris mmap(2) description:
573		 * The file descriptor fildes is opened with
574		 * read permission, regardless of the
575		 * protection options specified.
576		 */
577
578		error = fget(td, bsd_args.fd,
579		    cap_rights_init(&rights, CAP_MMAP), &fp);
580		if (error != 0)
581			return (error);
582		if (fp->f_type != DTYPE_VNODE) {
583			fdrop(fp, td);
584			return (EINVAL);
585		}
586
587		/* Linux mmap() just fails for O_WRONLY files */
588		if (!(fp->f_flag & FREAD)) {
589			fdrop(fp, td);
590			return (EACCES);
591		}
592
593		fdrop(fp, td);
594	}
595
596	if (flags & LINUX_MAP_GROWSDOWN) {
597		/*
598		 * The Linux MAP_GROWSDOWN option does not limit auto
599		 * growth of the region.  Linux mmap with this option
600		 * takes as addr the inital BOS, and as len, the initial
601		 * region size.  It can then grow down from addr without
602		 * limit.  However, Linux threads has an implicit internal
603		 * limit to stack size of STACK_SIZE.  Its just not
604		 * enforced explicitly in Linux.  But, here we impose
605		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
606		 * region, since we can do this with our mmap.
607		 *
608		 * Our mmap with MAP_STACK takes addr as the maximum
609		 * downsize limit on BOS, and as len the max size of
610		 * the region.  It then maps the top SGROWSIZ bytes,
611		 * and auto grows the region down, up to the limit
612		 * in addr.
613		 *
614		 * If we don't use the MAP_STACK option, the effect
615		 * of this code is to allocate a stack region of a
616		 * fixed size of (STACK_SIZE - GUARD_SIZE).
617		 */
618
619		if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
620			/*
621			 * Some Linux apps will attempt to mmap
622			 * thread stacks near the top of their
623			 * address space.  If their TOS is greater
624			 * than vm_maxsaddr, vm_map_growstack()
625			 * will confuse the thread stack with the
626			 * process stack and deliver a SEGV if they
627			 * attempt to grow the thread stack past their
628			 * current stacksize rlimit.  To avoid this,
629			 * adjust vm_maxsaddr upwards to reflect
630			 * the current stacksize rlimit rather
631			 * than the maximum possible stacksize.
632			 * It would be better to adjust the
633			 * mmap'ed region, but some apps do not check
634			 * mmap's return value.
635			 */
636			PROC_LOCK(p);
637			p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK -
638			    lim_cur(p, RLIMIT_STACK);
639			PROC_UNLOCK(p);
640		}
641
642		/*
643		 * This gives us our maximum stack size and a new BOS.
644		 * If we're using VM_STACK, then mmap will just map
645		 * the top SGROWSIZ bytes, and let the stack grow down
646		 * to the limit at BOS.  If we're not using VM_STACK
647		 * we map the full stack, since we don't have a way
648		 * to autogrow it.
649		 */
650		if (len > STACK_SIZE - GUARD_SIZE) {
651			bsd_args.addr = (caddr_t)PTRIN(addr);
652			bsd_args.len = len;
653		} else {
654			bsd_args.addr = (caddr_t)PTRIN(addr) -
655			    (STACK_SIZE - GUARD_SIZE - len);
656			bsd_args.len = STACK_SIZE - GUARD_SIZE;
657		}
658	} else {
659		bsd_args.addr = (caddr_t)PTRIN(addr);
660		bsd_args.len  = len;
661	}
662	bsd_args.pos = pos;
663
664#ifdef DEBUG
665	if (ldebug(mmap))
666		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
667		    __func__,
668		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
669		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
670#endif
671	error = sys_mmap(td, &bsd_args);
672#ifdef DEBUG
673	if (ldebug(mmap))
674		printf("-> %s() return: 0x%x (0x%08x)\n",
675			__func__, error, (u_int)td->td_retval[0]);
676#endif
677	return (error);
678}
679
680int
681linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
682{
683	struct mprotect_args bsd_args;
684
685	bsd_args.addr = uap->addr;
686	bsd_args.len = uap->len;
687	bsd_args.prot = uap->prot;
688	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
689		bsd_args.prot |= PROT_READ | PROT_EXEC;
690	return (sys_mprotect(td, &bsd_args));
691}
692
693int
694linux_iopl(struct thread *td, struct linux_iopl_args *args)
695{
696	int error;
697
698	if (args->level < 0 || args->level > 3)
699		return (EINVAL);
700	if ((error = priv_check(td, PRIV_IO)) != 0)
701		return (error);
702	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
703		return (error);
704	td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
705	    (args->level * (PSL_IOPL / 3));
706
707	return (0);
708}
709
710int
711linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
712{
713	l_osigaction_t osa;
714	l_sigaction_t act, oact;
715	int error;
716
717#ifdef DEBUG
718	if (ldebug(sigaction))
719		printf(ARGS(sigaction, "%d, %p, %p"),
720		    args->sig, (void *)args->nsa, (void *)args->osa);
721#endif
722
723	if (args->nsa != NULL) {
724		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
725		if (error)
726			return (error);
727		act.lsa_handler = osa.lsa_handler;
728		act.lsa_flags = osa.lsa_flags;
729		act.lsa_restorer = osa.lsa_restorer;
730		LINUX_SIGEMPTYSET(act.lsa_mask);
731		act.lsa_mask.__bits[0] = osa.lsa_mask;
732	}
733
734	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
735	    args->osa ? &oact : NULL);
736
737	if (args->osa != NULL && !error) {
738		osa.lsa_handler = oact.lsa_handler;
739		osa.lsa_flags = oact.lsa_flags;
740		osa.lsa_restorer = oact.lsa_restorer;
741		osa.lsa_mask = oact.lsa_mask.__bits[0];
742		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
743	}
744
745	return (error);
746}
747
748/*
749 * Linux has two extra args, restart and oldmask.  We don't use these,
750 * but it seems that "restart" is actually a context pointer that
751 * enables the signal to happen with a different register set.
752 */
753int
754linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
755{
756	sigset_t sigmask;
757	l_sigset_t mask;
758
759#ifdef DEBUG
760	if (ldebug(sigsuspend))
761		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
762#endif
763
764	LINUX_SIGEMPTYSET(mask);
765	mask.__bits[0] = args->mask;
766	linux_to_bsd_sigset(&mask, &sigmask);
767	return (kern_sigsuspend(td, sigmask));
768}
769
770int
771linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
772{
773	l_sigset_t lmask;
774	sigset_t sigmask;
775	int error;
776
777#ifdef DEBUG
778	if (ldebug(rt_sigsuspend))
779		printf(ARGS(rt_sigsuspend, "%p, %d"),
780		    (void *)uap->newset, uap->sigsetsize);
781#endif
782
783	if (uap->sigsetsize != sizeof(l_sigset_t))
784		return (EINVAL);
785
786	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
787	if (error)
788		return (error);
789
790	linux_to_bsd_sigset(&lmask, &sigmask);
791	return (kern_sigsuspend(td, sigmask));
792}
793
794int
795linux_pause(struct thread *td, struct linux_pause_args *args)
796{
797	struct proc *p = td->td_proc;
798	sigset_t sigmask;
799
800#ifdef DEBUG
801	if (ldebug(pause))
802		printf(ARGS(pause, ""));
803#endif
804
805	PROC_LOCK(p);
806	sigmask = td->td_sigmask;
807	PROC_UNLOCK(p);
808	return (kern_sigsuspend(td, sigmask));
809}
810
811int
812linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
813{
814	stack_t ss, oss;
815	l_stack_t lss;
816	int error;
817
818#ifdef DEBUG
819	if (ldebug(sigaltstack))
820		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
821#endif
822
823	if (uap->uss != NULL) {
824		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
825		if (error)
826			return (error);
827
828		ss.ss_sp = PTRIN(lss.ss_sp);
829		ss.ss_size = lss.ss_size;
830		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
831	}
832	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
833	    (uap->uoss != NULL) ? &oss : NULL);
834	if (!error && uap->uoss != NULL) {
835		lss.ss_sp = PTROUT(oss.ss_sp);
836		lss.ss_size = oss.ss_size;
837		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
838		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
839	}
840
841	return (error);
842}
843
844int
845linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
846{
847	struct ftruncate_args sa;
848
849#ifdef DEBUG
850	if (ldebug(ftruncate64))
851		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
852		    (intmax_t)args->length);
853#endif
854
855	sa.fd = args->fd;
856	sa.length = args->length;
857	return sys_ftruncate(td, &sa);
858}
859
860int
861linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
862{
863	struct timeval atv;
864	l_timeval atv32;
865	struct timezone rtz;
866	int error = 0;
867
868	if (uap->tp) {
869		microtime(&atv);
870		atv32.tv_sec = atv.tv_sec;
871		atv32.tv_usec = atv.tv_usec;
872		error = copyout(&atv32, uap->tp, sizeof(atv32));
873	}
874	if (error == 0 && uap->tzp != NULL) {
875		rtz.tz_minuteswest = tz_minuteswest;
876		rtz.tz_dsttime = tz_dsttime;
877		error = copyout(&rtz, uap->tzp, sizeof(rtz));
878	}
879	return (error);
880}
881
882int
883linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap)
884{
885	l_timeval atv32;
886	struct timeval atv, *tvp;
887	struct timezone atz, *tzp;
888	int error;
889
890	if (uap->tp) {
891		error = copyin(uap->tp, &atv32, sizeof(atv32));
892		if (error)
893			return (error);
894		atv.tv_sec = atv32.tv_sec;
895		atv.tv_usec = atv32.tv_usec;
896		tvp = &atv;
897	} else
898		tvp = NULL;
899	if (uap->tzp) {
900		error = copyin(uap->tzp, &atz, sizeof(atz));
901		if (error)
902			return (error);
903		tzp = &atz;
904	} else
905		tzp = NULL;
906	return (kern_settimeofday(td, tvp, tzp));
907}
908
909int
910linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
911{
912	struct l_rusage s32;
913	struct rusage s;
914	int error;
915
916	error = kern_getrusage(td, uap->who, &s);
917	if (error != 0)
918		return (error);
919	if (uap->rusage != NULL) {
920		bsd_to_linux_rusage(&s, &s32);
921		error = copyout(&s32, uap->rusage, sizeof(s32));
922	}
923	return (error);
924}
925
926int
927linux_set_thread_area(struct thread *td,
928    struct linux_set_thread_area_args *args)
929{
930	struct l_user_desc info;
931	struct user_segment_descriptor sd;
932	struct pcb *pcb;
933	int a[2];
934	int error;
935
936	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
937	if (error)
938		return (error);
939
940#ifdef DEBUG
941	if (ldebug(set_thread_area))
942		printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, "
943		    "%i, %i, %i"), info.entry_number, info.base_addr,
944		    info.limit, info.seg_32bit, info.contents,
945		    info.read_exec_only, info.limit_in_pages,
946		    info.seg_not_present, info.useable);
947#endif
948
949	/*
950	 * Semantics of Linux version: every thread in the system has array
951	 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown.
952	 * This syscall loads one of the selected TLS decriptors with a value
953	 * and also loads GDT descriptors 6, 7 and 8 with the content of
954	 * the per-thread descriptors.
955	 *
956	 * Semantics of FreeBSD version: I think we can ignore that Linux has
957	 * three per-thread descriptors and use just the first one.
958	 * The tls_array[] is used only in [gs]et_thread_area() syscalls and
959	 * for loading the GDT descriptors. We use just one GDT descriptor
960	 * for TLS, so we will load just one.
961	 *
962	 * XXX: This doesn't work when a user space process tries to use more
963	 * than one TLS segment. Comment in the Linux source says wine might
964	 * do this.
965	 */
966
967	/*
968	 * GLIBC reads current %gs and call set_thread_area() with it.
969	 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because
970	 * we use these segments.
971	 */
972	switch (info.entry_number) {
973	case GUGS32_SEL:
974	case GUDATA_SEL:
975	case 6:
976	case -1:
977		info.entry_number = GUGS32_SEL;
978		break;
979	default:
980		return (EINVAL);
981	}
982
983	/*
984	 * We have to copy out the GDT entry we use.
985	 *
986	 * XXX: What if a user space program does not check the return value
987	 * and tries to use 6, 7 or 8?
988	 */
989	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
990	if (error)
991		return (error);
992
993	if (LINUX_LDT_empty(&info)) {
994		a[0] = 0;
995		a[1] = 0;
996	} else {
997		a[0] = LINUX_LDT_entry_a(&info);
998		a[1] = LINUX_LDT_entry_b(&info);
999	}
1000
1001	memcpy(&sd, &a, sizeof(a));
1002#ifdef DEBUG
1003	if (ldebug(set_thread_area))
1004		printf("Segment created in set_thread_area: "
1005		    "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, "
1006		    "type: %i, dpl: %i, p: %i, xx: %i, long: %i, "
1007		    "def32: %i, gran: %i\n",
1008		    sd.sd_lobase,
1009		    sd.sd_hibase,
1010		    sd.sd_lolimit,
1011		    sd.sd_hilimit,
1012		    sd.sd_type,
1013		    sd.sd_dpl,
1014		    sd.sd_p,
1015		    sd.sd_xx,
1016		    sd.sd_long,
1017		    sd.sd_def32,
1018		    sd.sd_gran);
1019#endif
1020
1021	pcb = td->td_pcb;
1022	pcb->pcb_gsbase = (register_t)info.base_addr;
1023	set_pcb_flags(pcb, PCB_32BIT);
1024	update_gdt_gsbase(td, info.base_addr);
1025
1026	return (0);
1027}
1028
1029int
1030linux_wait4(struct thread *td, struct linux_wait4_args *args)
1031{
1032	int error, options;
1033	struct rusage ru, *rup;
1034	struct l_rusage lru;
1035
1036#ifdef DEBUG
1037	if (ldebug(wait4))
1038		printf(ARGS(wait4, "%d, %p, %d, %p"),
1039		    args->pid, (void *)args->status, args->options,
1040		    (void *)args->rusage);
1041#endif
1042
1043	options = (args->options & (WNOHANG | WUNTRACED));
1044	/* WLINUXCLONE should be equal to __WCLONE, but we make sure */
1045	if (args->options & __WCLONE)
1046		options |= WLINUXCLONE;
1047
1048	if (args->rusage != NULL)
1049		rup = &ru;
1050	else
1051		rup = NULL;
1052	error = linux_common_wait(td, args->pid, args->status, options, rup);
1053	if (error)
1054		return (error);
1055	if (args->rusage != NULL) {
1056		bsd_to_linux_rusage(rup, &lru);
1057		error = copyout(&lru, args->rusage, sizeof(lru));
1058	}
1059
1060	return (error);
1061}
1062