linux32_machdep.c revision 293500
1/*-
2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2002 Doug Rabson
4 * Copyright (c) 2000 Marcel Moolenaar
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer
12 *    in this position and unchanged.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 3. The name of the author may not be used to endorse or promote products
17 *    derived from this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32__FBSDID("$FreeBSD: stable/10/sys/amd64/linux32/linux32_machdep.c 293500 2016-01-09 15:23:54Z dchagin $");
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/systm.h>
37#include <sys/capsicum.h>
38#include <sys/file.h>
39#include <sys/fcntl.h>
40#include <sys/clock.h>
41#include <sys/imgact.h>
42#include <sys/limits.h>
43#include <sys/lock.h>
44#include <sys/malloc.h>
45#include <sys/mman.h>
46#include <sys/mutex.h>
47#include <sys/priv.h>
48#include <sys/proc.h>
49#include <sys/resource.h>
50#include <sys/resourcevar.h>
51#include <sys/syscallsubr.h>
52#include <sys/sysproto.h>
53#include <sys/unistd.h>
54#include <sys/wait.h>
55
56#include <machine/frame.h>
57#include <machine/pcb.h>
58#include <machine/psl.h>
59#include <machine/segments.h>
60#include <machine/specialreg.h>
61
62#include <vm/vm.h>
63#include <vm/pmap.h>
64#include <vm/vm_map.h>
65
66#include <compat/freebsd32/freebsd32_util.h>
67#include <amd64/linux32/linux.h>
68#include <amd64/linux32/linux32_proto.h>
69#include <compat/linux/linux_ipc.h>
70#include <compat/linux/linux_misc.h>
71#include <compat/linux/linux_signal.h>
72#include <compat/linux/linux_util.h>
73#include <compat/linux/linux_emul.h>
74
75static void	bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru);
76
/*
 * Argument block for the old-style select entry point.  linux_old_select()
 * copies one of these in from the single user pointer it receives and then
 * forwards the individual members to linux_select().  The structure is
 * packed so that it matches the user-space layout byte for byte.
 */
struct l_old_select_argv {
	l_int		nfds;		/* forwarded as linux_select()'s nfds */
	l_uintptr_t	readfds;	/* user address, widened with PTRIN() */
	l_uintptr_t	writefds;	/* user address, widened with PTRIN() */
	l_uintptr_t	exceptfds;	/* user address, widened with PTRIN() */
	l_uintptr_t	timeout;	/* user address, widened with PTRIN() */
} __packed;
84
85int
86linux_to_bsd_sigaltstack(int lsa)
87{
88	int bsa = 0;
89
90	if (lsa & LINUX_SS_DISABLE)
91		bsa |= SS_DISABLE;
92	if (lsa & LINUX_SS_ONSTACK)
93		bsa |= SS_ONSTACK;
94	return (bsa);
95}
96
97static int	linux_mmap_common(struct thread *td, l_uintptr_t addr,
98		    l_size_t len, l_int prot, l_int flags, l_int fd,
99		    l_loff_t pos);
100
101int
102bsd_to_linux_sigaltstack(int bsa)
103{
104	int lsa = 0;
105
106	if (bsa & SS_DISABLE)
107		lsa |= LINUX_SS_DISABLE;
108	if (bsa & SS_ONSTACK)
109		lsa |= LINUX_SS_ONSTACK;
110	return (lsa);
111}
112
113static void
114bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru)
115{
116
117	lru->ru_utime.tv_sec = ru->ru_utime.tv_sec;
118	lru->ru_utime.tv_usec = ru->ru_utime.tv_usec;
119	lru->ru_stime.tv_sec = ru->ru_stime.tv_sec;
120	lru->ru_stime.tv_usec = ru->ru_stime.tv_usec;
121	lru->ru_maxrss = ru->ru_maxrss;
122	lru->ru_ixrss = ru->ru_ixrss;
123	lru->ru_idrss = ru->ru_idrss;
124	lru->ru_isrss = ru->ru_isrss;
125	lru->ru_minflt = ru->ru_minflt;
126	lru->ru_majflt = ru->ru_majflt;
127	lru->ru_nswap = ru->ru_nswap;
128	lru->ru_inblock = ru->ru_inblock;
129	lru->ru_oublock = ru->ru_oublock;
130	lru->ru_msgsnd = ru->ru_msgsnd;
131	lru->ru_msgrcv = ru->ru_msgrcv;
132	lru->ru_nsignals = ru->ru_nsignals;
133	lru->ru_nvcsw = ru->ru_nvcsw;
134	lru->ru_nivcsw = ru->ru_nivcsw;
135}
136
137int
138linux_copyout_rusage(struct rusage *ru, void *uaddr)
139{
140	struct l_rusage lru;
141
142	bsd_to_linux_rusage(ru, &lru);
143
144	return (copyout(&lru, uaddr, sizeof(struct l_rusage)));
145}
146
147int
148linux_execve(struct thread *td, struct linux_execve_args *args)
149{
150	struct image_args eargs;
151	struct vmspace *oldvmspace;
152	char *path;
153	int error;
154
155	LCONVPATHEXIST(td, args->path, &path);
156
157#ifdef DEBUG
158	if (ldebug(execve))
159		printf(ARGS(execve, "%s"), path);
160#endif
161
162	error = pre_execve(td, &oldvmspace);
163	if (error != 0) {
164		free(path, M_TEMP);
165		return (error);
166	}
167	error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE,
168	    args->argp, args->envp);
169	free(path, M_TEMP);
170	if (error == 0)
171		error = kern_execve(td, &eargs, NULL);
172	if (error == 0)
173		error = linux_common_execve(td, &eargs);
174	post_execve(td, error, oldvmspace);
175	return (error);
176}
177
178CTASSERT(sizeof(struct l_iovec32) == 8);
179
180static int
181linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop)
182{
183	struct l_iovec32 iov32;
184	struct iovec *iov;
185	struct uio *uio;
186	uint32_t iovlen;
187	int error, i;
188
189	*uiop = NULL;
190	if (iovcnt > UIO_MAXIOV)
191		return (EINVAL);
192	iovlen = iovcnt * sizeof(struct iovec);
193	uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
194	iov = (struct iovec *)(uio + 1);
195	for (i = 0; i < iovcnt; i++) {
196		error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32));
197		if (error) {
198			free(uio, M_IOV);
199			return (error);
200		}
201		iov[i].iov_base = PTRIN(iov32.iov_base);
202		iov[i].iov_len = iov32.iov_len;
203	}
204	uio->uio_iov = iov;
205	uio->uio_iovcnt = iovcnt;
206	uio->uio_segflg = UIO_USERSPACE;
207	uio->uio_offset = -1;
208	uio->uio_resid = 0;
209	for (i = 0; i < iovcnt; i++) {
210		if (iov->iov_len > INT_MAX - uio->uio_resid) {
211			free(uio, M_IOV);
212			return (EINVAL);
213		}
214		uio->uio_resid += iov->iov_len;
215		iov++;
216	}
217	*uiop = uio;
218	return (0);
219}
220
221int
222linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp,
223    int error)
224{
225	struct l_iovec32 iov32;
226	struct iovec *iov;
227	uint32_t iovlen;
228	int i;
229
230	*iovp = NULL;
231	if (iovcnt > UIO_MAXIOV)
232		return (error);
233	iovlen = iovcnt * sizeof(struct iovec);
234	iov = malloc(iovlen, M_IOV, M_WAITOK);
235	for (i = 0; i < iovcnt; i++) {
236		error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32));
237		if (error) {
238			free(iov, M_IOV);
239			return (error);
240		}
241		iov[i].iov_base = PTRIN(iov32.iov_base);
242		iov[i].iov_len = iov32.iov_len;
243	}
244	*iovp = iov;
245	return(0);
246
247}
248
249int
250linux_readv(struct thread *td, struct linux_readv_args *uap)
251{
252	struct uio *auio;
253	int error;
254
255	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
256	if (error)
257		return (error);
258	error = kern_readv(td, uap->fd, auio);
259	free(auio, M_IOV);
260	return (error);
261}
262
263int
264linux_writev(struct thread *td, struct linux_writev_args *uap)
265{
266	struct uio *auio;
267	int error;
268
269	error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
270	if (error)
271		return (error);
272	error = kern_writev(td, uap->fd, auio);
273	free(auio, M_IOV);
274	return (error);
275}
276
/*
 * Indirect argument block for LINUX_MSGRCV.  When the upper 16 bits of the
 * ipc "what" argument are zero, linux_ipc() copies one of these in from
 * the user pointer and takes the message buffer address and message type
 * from it.  Packed to match the user-space layout.
 */
struct l_ipc_kludge {
	l_uintptr_t msgp;	/* user address of the message buffer */
	l_long msgtyp;		/* message type forwarded to linux_msgrcv() */
} __packed;
281
282int
283linux_ipc(struct thread *td, struct linux_ipc_args *args)
284{
285
286	switch (args->what & 0xFFFF) {
287	case LINUX_SEMOP: {
288		struct linux_semop_args a;
289
290		a.semid = args->arg1;
291		a.tsops = args->ptr;
292		a.nsops = args->arg2;
293		return (linux_semop(td, &a));
294	}
295	case LINUX_SEMGET: {
296		struct linux_semget_args a;
297
298		a.key = args->arg1;
299		a.nsems = args->arg2;
300		a.semflg = args->arg3;
301		return (linux_semget(td, &a));
302	}
303	case LINUX_SEMCTL: {
304		struct linux_semctl_args a;
305		int error;
306
307		a.semid = args->arg1;
308		a.semnum = args->arg2;
309		a.cmd = args->arg3;
310		error = copyin(args->ptr, &a.arg, sizeof(a.arg));
311		if (error)
312			return (error);
313		return (linux_semctl(td, &a));
314	}
315	case LINUX_MSGSND: {
316		struct linux_msgsnd_args a;
317
318		a.msqid = args->arg1;
319		a.msgp = args->ptr;
320		a.msgsz = args->arg2;
321		a.msgflg = args->arg3;
322		return (linux_msgsnd(td, &a));
323	}
324	case LINUX_MSGRCV: {
325		struct linux_msgrcv_args a;
326
327		a.msqid = args->arg1;
328		a.msgsz = args->arg2;
329		a.msgflg = args->arg3;
330		if ((args->what >> 16) == 0) {
331			struct l_ipc_kludge tmp;
332			int error;
333
334			if (args->ptr == 0)
335				return (EINVAL);
336			error = copyin(args->ptr, &tmp, sizeof(tmp));
337			if (error)
338				return (error);
339			a.msgp = PTRIN(tmp.msgp);
340			a.msgtyp = tmp.msgtyp;
341		} else {
342			a.msgp = args->ptr;
343			a.msgtyp = args->arg5;
344		}
345		return (linux_msgrcv(td, &a));
346	}
347	case LINUX_MSGGET: {
348		struct linux_msgget_args a;
349
350		a.key = args->arg1;
351		a.msgflg = args->arg2;
352		return (linux_msgget(td, &a));
353	}
354	case LINUX_MSGCTL: {
355		struct linux_msgctl_args a;
356
357		a.msqid = args->arg1;
358		a.cmd = args->arg2;
359		a.buf = args->ptr;
360		return (linux_msgctl(td, &a));
361	}
362	case LINUX_SHMAT: {
363		struct linux_shmat_args a;
364
365		a.shmid = args->arg1;
366		a.shmaddr = args->ptr;
367		a.shmflg = args->arg2;
368		a.raddr = PTRIN((l_uint)args->arg3);
369		return (linux_shmat(td, &a));
370	}
371	case LINUX_SHMDT: {
372		struct linux_shmdt_args a;
373
374		a.shmaddr = args->ptr;
375		return (linux_shmdt(td, &a));
376	}
377	case LINUX_SHMGET: {
378		struct linux_shmget_args a;
379
380		a.key = args->arg1;
381		a.size = args->arg2;
382		a.shmflg = args->arg3;
383		return (linux_shmget(td, &a));
384	}
385	case LINUX_SHMCTL: {
386		struct linux_shmctl_args a;
387
388		a.shmid = args->arg1;
389		a.cmd = args->arg2;
390		a.buf = args->ptr;
391		return (linux_shmctl(td, &a));
392	}
393	default:
394		break;
395	}
396
397	return (EINVAL);
398}
399
400int
401linux_old_select(struct thread *td, struct linux_old_select_args *args)
402{
403	struct l_old_select_argv linux_args;
404	struct linux_select_args newsel;
405	int error;
406
407#ifdef DEBUG
408	if (ldebug(old_select))
409		printf(ARGS(old_select, "%p"), args->ptr);
410#endif
411
412	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
413	if (error)
414		return (error);
415
416	newsel.nfds = linux_args.nfds;
417	newsel.readfds = PTRIN(linux_args.readfds);
418	newsel.writefds = PTRIN(linux_args.writefds);
419	newsel.exceptfds = PTRIN(linux_args.exceptfds);
420	newsel.timeout = PTRIN(linux_args.timeout);
421	return (linux_select(td, &newsel));
422}
423
424int
425linux_set_cloned_tls(struct thread *td, void *desc)
426{
427	struct user_segment_descriptor sd;
428	struct l_user_desc info;
429	struct pcb *pcb;
430	int error;
431	int a[2];
432
433	error = copyin(desc, &info, sizeof(struct l_user_desc));
434	if (error) {
435		printf(LMSG("copyin failed!"));
436	} else {
437		/* We might copy out the entry_number as GUGS32_SEL. */
438		info.entry_number = GUGS32_SEL;
439		error = copyout(&info, desc, sizeof(struct l_user_desc));
440		if (error)
441			printf(LMSG("copyout failed!"));
442
443		a[0] = LINUX_LDT_entry_a(&info);
444		a[1] = LINUX_LDT_entry_b(&info);
445
446		memcpy(&sd, &a, sizeof(a));
447#ifdef DEBUG
448		if (ldebug(clone))
449			printf("Segment created in clone with "
450			    "CLONE_SETTLS: lobase: %x, hibase: %x, "
451			    "lolimit: %x, hilimit: %x, type: %i, "
452			    "dpl: %i, p: %i, xx: %i, long: %i, "
453			    "def32: %i, gran: %i\n", sd.sd_lobase,
454			    sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit,
455			    sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx,
456			    sd.sd_long, sd.sd_def32, sd.sd_gran);
457#endif
458		pcb = td->td_pcb;
459		pcb->pcb_gsbase = (register_t)info.base_addr;
460		td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL);
461		set_pcb_flags(pcb, PCB_32BIT);
462	}
463
464	return (error);
465}
466
467int
468linux_set_upcall_kse(struct thread *td, register_t stack)
469{
470
471	if (stack)
472		td->td_frame->tf_rsp = stack;
473
474	/*
475	 * The newly created Linux thread returns
476	 * to the user space by the same path that a parent do.
477	 */
478	td->td_frame->tf_rax = 0;
479	return (0);
480}
481
482#define STACK_SIZE  (2 * 1024 * 1024)
483#define GUARD_SIZE  (4 * PAGE_SIZE)
484
485int
486linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
487{
488
489#ifdef DEBUG
490	if (ldebug(mmap2))
491		printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"),
492		    args->addr, args->len, args->prot,
493		    args->flags, args->fd, args->pgoff);
494#endif
495
496	return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot,
497		args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
498		PAGE_SIZE));
499}
500
501int
502linux_mmap(struct thread *td, struct linux_mmap_args *args)
503{
504	int error;
505	struct l_mmap_argv linux_args;
506
507	error = copyin(args->ptr, &linux_args, sizeof(linux_args));
508	if (error)
509		return (error);
510
511#ifdef DEBUG
512	if (ldebug(mmap))
513		printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"),
514		    linux_args.addr, linux_args.len, linux_args.prot,
515		    linux_args.flags, linux_args.fd, linux_args.pgoff);
516#endif
517
518	return (linux_mmap_common(td, linux_args.addr, linux_args.len,
519	    linux_args.prot, linux_args.flags, linux_args.fd,
520	    (uint32_t)linux_args.pgoff));
521}
522
523static int
524linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
525    l_int flags, l_int fd, l_loff_t pos)
526{
527	struct proc *p = td->td_proc;
528	struct mmap_args /* {
529		caddr_t addr;
530		size_t len;
531		int prot;
532		int flags;
533		int fd;
534		long pad;
535		off_t pos;
536	} */ bsd_args;
537	int error;
538	struct file *fp;
539	cap_rights_t rights;
540
541	error = 0;
542	bsd_args.flags = 0;
543	fp = NULL;
544
545	/*
546	 * Linux mmap(2):
547	 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE
548	 */
549	if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
550		return (EINVAL);
551
552	if (flags & LINUX_MAP_SHARED)
553		bsd_args.flags |= MAP_SHARED;
554	if (flags & LINUX_MAP_PRIVATE)
555		bsd_args.flags |= MAP_PRIVATE;
556	if (flags & LINUX_MAP_FIXED)
557		bsd_args.flags |= MAP_FIXED;
558	if (flags & LINUX_MAP_ANON) {
559		/* Enforce pos to be on page boundary, then ignore. */
560		if ((pos & PAGE_MASK) != 0)
561			return (EINVAL);
562		pos = 0;
563		bsd_args.flags |= MAP_ANON;
564	} else
565		bsd_args.flags |= MAP_NOSYNC;
566	if (flags & LINUX_MAP_GROWSDOWN)
567		bsd_args.flags |= MAP_STACK;
568
569	/*
570	 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
571	 * on Linux/i386. We do this to ensure maximum compatibility.
572	 * Linux/ia64 does the same in i386 emulation mode.
573	 */
574	bsd_args.prot = prot;
575	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
576		bsd_args.prot |= PROT_READ | PROT_EXEC;
577
578	/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
579	bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
580	if (bsd_args.fd != -1) {
581		/*
582		 * Linux follows Solaris mmap(2) description:
583		 * The file descriptor fildes is opened with
584		 * read permission, regardless of the
585		 * protection options specified.
586		 */
587
588		error = fget(td, bsd_args.fd,
589		    cap_rights_init(&rights, CAP_MMAP), &fp);
590		if (error != 0)
591			return (error);
592		if (fp->f_type != DTYPE_VNODE) {
593			fdrop(fp, td);
594			return (EINVAL);
595		}
596
597		/* Linux mmap() just fails for O_WRONLY files */
598		if (!(fp->f_flag & FREAD)) {
599			fdrop(fp, td);
600			return (EACCES);
601		}
602
603		fdrop(fp, td);
604	}
605
606	if (flags & LINUX_MAP_GROWSDOWN) {
607		/*
608		 * The Linux MAP_GROWSDOWN option does not limit auto
609		 * growth of the region.  Linux mmap with this option
610		 * takes as addr the inital BOS, and as len, the initial
611		 * region size.  It can then grow down from addr without
612		 * limit.  However, Linux threads has an implicit internal
613		 * limit to stack size of STACK_SIZE.  Its just not
614		 * enforced explicitly in Linux.  But, here we impose
615		 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack
616		 * region, since we can do this with our mmap.
617		 *
618		 * Our mmap with MAP_STACK takes addr as the maximum
619		 * downsize limit on BOS, and as len the max size of
620		 * the region.  It then maps the top SGROWSIZ bytes,
621		 * and auto grows the region down, up to the limit
622		 * in addr.
623		 *
624		 * If we don't use the MAP_STACK option, the effect
625		 * of this code is to allocate a stack region of a
626		 * fixed size of (STACK_SIZE - GUARD_SIZE).
627		 */
628
629		if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
630			/*
631			 * Some Linux apps will attempt to mmap
632			 * thread stacks near the top of their
633			 * address space.  If their TOS is greater
634			 * than vm_maxsaddr, vm_map_growstack()
635			 * will confuse the thread stack with the
636			 * process stack and deliver a SEGV if they
637			 * attempt to grow the thread stack past their
638			 * current stacksize rlimit.  To avoid this,
639			 * adjust vm_maxsaddr upwards to reflect
640			 * the current stacksize rlimit rather
641			 * than the maximum possible stacksize.
642			 * It would be better to adjust the
643			 * mmap'ed region, but some apps do not check
644			 * mmap's return value.
645			 */
646			PROC_LOCK(p);
647			p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK -
648			    lim_cur(p, RLIMIT_STACK);
649			PROC_UNLOCK(p);
650		}
651
652		/*
653		 * This gives us our maximum stack size and a new BOS.
654		 * If we're using VM_STACK, then mmap will just map
655		 * the top SGROWSIZ bytes, and let the stack grow down
656		 * to the limit at BOS.  If we're not using VM_STACK
657		 * we map the full stack, since we don't have a way
658		 * to autogrow it.
659		 */
660		if (len > STACK_SIZE - GUARD_SIZE) {
661			bsd_args.addr = (caddr_t)PTRIN(addr);
662			bsd_args.len = len;
663		} else {
664			bsd_args.addr = (caddr_t)PTRIN(addr) -
665			    (STACK_SIZE - GUARD_SIZE - len);
666			bsd_args.len = STACK_SIZE - GUARD_SIZE;
667		}
668	} else {
669		bsd_args.addr = (caddr_t)PTRIN(addr);
670		bsd_args.len  = len;
671	}
672	bsd_args.pos = pos;
673
674#ifdef DEBUG
675	if (ldebug(mmap))
676		printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
677		    __func__,
678		    (void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
679		    bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
680#endif
681	error = sys_mmap(td, &bsd_args);
682#ifdef DEBUG
683	if (ldebug(mmap))
684		printf("-> %s() return: 0x%x (0x%08x)\n",
685			__func__, error, (u_int)td->td_retval[0]);
686#endif
687	return (error);
688}
689
690int
691linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
692{
693	struct mprotect_args bsd_args;
694
695	bsd_args.addr = uap->addr;
696	bsd_args.len = uap->len;
697	bsd_args.prot = uap->prot;
698	if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
699		bsd_args.prot |= PROT_READ | PROT_EXEC;
700	return (sys_mprotect(td, &bsd_args));
701}
702
703int
704linux_iopl(struct thread *td, struct linux_iopl_args *args)
705{
706	int error;
707
708	if (args->level < 0 || args->level > 3)
709		return (EINVAL);
710	if ((error = priv_check(td, PRIV_IO)) != 0)
711		return (error);
712	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
713		return (error);
714	td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
715	    (args->level * (PSL_IOPL / 3));
716
717	return (0);
718}
719
720int
721linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
722{
723	l_osigaction_t osa;
724	l_sigaction_t act, oact;
725	int error;
726
727#ifdef DEBUG
728	if (ldebug(sigaction))
729		printf(ARGS(sigaction, "%d, %p, %p"),
730		    args->sig, (void *)args->nsa, (void *)args->osa);
731#endif
732
733	if (args->nsa != NULL) {
734		error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
735		if (error)
736			return (error);
737		act.lsa_handler = osa.lsa_handler;
738		act.lsa_flags = osa.lsa_flags;
739		act.lsa_restorer = osa.lsa_restorer;
740		LINUX_SIGEMPTYSET(act.lsa_mask);
741		act.lsa_mask.__bits[0] = osa.lsa_mask;
742	}
743
744	error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
745	    args->osa ? &oact : NULL);
746
747	if (args->osa != NULL && !error) {
748		osa.lsa_handler = oact.lsa_handler;
749		osa.lsa_flags = oact.lsa_flags;
750		osa.lsa_restorer = oact.lsa_restorer;
751		osa.lsa_mask = oact.lsa_mask.__bits[0];
752		error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
753	}
754
755	return (error);
756}
757
758/*
759 * Linux has two extra args, restart and oldmask.  We don't use these,
760 * but it seems that "restart" is actually a context pointer that
761 * enables the signal to happen with a different register set.
762 */
763int
764linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
765{
766	sigset_t sigmask;
767	l_sigset_t mask;
768
769#ifdef DEBUG
770	if (ldebug(sigsuspend))
771		printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
772#endif
773
774	LINUX_SIGEMPTYSET(mask);
775	mask.__bits[0] = args->mask;
776	linux_to_bsd_sigset(&mask, &sigmask);
777	return (kern_sigsuspend(td, sigmask));
778}
779
780int
781linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
782{
783	l_sigset_t lmask;
784	sigset_t sigmask;
785	int error;
786
787#ifdef DEBUG
788	if (ldebug(rt_sigsuspend))
789		printf(ARGS(rt_sigsuspend, "%p, %d"),
790		    (void *)uap->newset, uap->sigsetsize);
791#endif
792
793	if (uap->sigsetsize != sizeof(l_sigset_t))
794		return (EINVAL);
795
796	error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
797	if (error)
798		return (error);
799
800	linux_to_bsd_sigset(&lmask, &sigmask);
801	return (kern_sigsuspend(td, sigmask));
802}
803
804int
805linux_pause(struct thread *td, struct linux_pause_args *args)
806{
807	struct proc *p = td->td_proc;
808	sigset_t sigmask;
809
810#ifdef DEBUG
811	if (ldebug(pause))
812		printf(ARGS(pause, ""));
813#endif
814
815	PROC_LOCK(p);
816	sigmask = td->td_sigmask;
817	PROC_UNLOCK(p);
818	return (kern_sigsuspend(td, sigmask));
819}
820
821int
822linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
823{
824	stack_t ss, oss;
825	l_stack_t lss;
826	int error;
827
828#ifdef DEBUG
829	if (ldebug(sigaltstack))
830		printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
831#endif
832
833	if (uap->uss != NULL) {
834		error = copyin(uap->uss, &lss, sizeof(l_stack_t));
835		if (error)
836			return (error);
837
838		ss.ss_sp = PTRIN(lss.ss_sp);
839		ss.ss_size = lss.ss_size;
840		ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
841	}
842	error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
843	    (uap->uoss != NULL) ? &oss : NULL);
844	if (!error && uap->uoss != NULL) {
845		lss.ss_sp = PTROUT(oss.ss_sp);
846		lss.ss_size = oss.ss_size;
847		lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
848		error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
849	}
850
851	return (error);
852}
853
854int
855linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
856{
857	struct ftruncate_args sa;
858
859#ifdef DEBUG
860	if (ldebug(ftruncate64))
861		printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
862		    (intmax_t)args->length);
863#endif
864
865	sa.fd = args->fd;
866	sa.length = args->length;
867	return sys_ftruncate(td, &sa);
868}
869
870int
871linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
872{
873	struct timeval atv;
874	l_timeval atv32;
875	struct timezone rtz;
876	int error = 0;
877
878	if (uap->tp) {
879		microtime(&atv);
880		atv32.tv_sec = atv.tv_sec;
881		atv32.tv_usec = atv.tv_usec;
882		error = copyout(&atv32, uap->tp, sizeof(atv32));
883	}
884	if (error == 0 && uap->tzp != NULL) {
885		rtz.tz_minuteswest = tz_minuteswest;
886		rtz.tz_dsttime = tz_dsttime;
887		error = copyout(&rtz, uap->tzp, sizeof(rtz));
888	}
889	return (error);
890}
891
892int
893linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap)
894{
895	l_timeval atv32;
896	struct timeval atv, *tvp;
897	struct timezone atz, *tzp;
898	int error;
899
900	if (uap->tp) {
901		error = copyin(uap->tp, &atv32, sizeof(atv32));
902		if (error)
903			return (error);
904		atv.tv_sec = atv32.tv_sec;
905		atv.tv_usec = atv32.tv_usec;
906		tvp = &atv;
907	} else
908		tvp = NULL;
909	if (uap->tzp) {
910		error = copyin(uap->tzp, &atz, sizeof(atz));
911		if (error)
912			return (error);
913		tzp = &atz;
914	} else
915		tzp = NULL;
916	return (kern_settimeofday(td, tvp, tzp));
917}
918
919int
920linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
921{
922	struct rusage s;
923	int error;
924
925	error = kern_getrusage(td, uap->who, &s);
926	if (error != 0)
927		return (error);
928	if (uap->rusage != NULL)
929		error = linux_copyout_rusage(&s, uap->rusage);
930	return (error);
931}
932
933int
934linux_set_thread_area(struct thread *td,
935    struct linux_set_thread_area_args *args)
936{
937	struct l_user_desc info;
938	struct user_segment_descriptor sd;
939	struct pcb *pcb;
940	int a[2];
941	int error;
942
943	error = copyin(args->desc, &info, sizeof(struct l_user_desc));
944	if (error)
945		return (error);
946
947#ifdef DEBUG
948	if (ldebug(set_thread_area))
949		printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, "
950		    "%i, %i, %i"), info.entry_number, info.base_addr,
951		    info.limit, info.seg_32bit, info.contents,
952		    info.read_exec_only, info.limit_in_pages,
953		    info.seg_not_present, info.useable);
954#endif
955
956	/*
957	 * Semantics of Linux version: every thread in the system has array
958	 * of three TLS descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown.
959	 * This syscall loads one of the selected TLS decriptors with a value
960	 * and also loads GDT descriptors 6, 7 and 8 with the content of
961	 * the per-thread descriptors.
962	 *
963	 * Semantics of FreeBSD version: I think we can ignore that Linux has
964	 * three per-thread descriptors and use just the first one.
965	 * The tls_array[] is used only in [gs]et_thread_area() syscalls and
966	 * for loading the GDT descriptors. We use just one GDT descriptor
967	 * for TLS, so we will load just one.
968	 *
969	 * XXX: This doesn't work when a user space process tries to use more
970	 * than one TLS segment. Comment in the Linux source says wine might
971	 * do this.
972	 */
973
974	/*
975	 * GLIBC reads current %gs and call set_thread_area() with it.
976	 * We should let GUDATA_SEL and GUGS32_SEL proceed as well because
977	 * we use these segments.
978	 */
979	switch (info.entry_number) {
980	case GUGS32_SEL:
981	case GUDATA_SEL:
982	case 6:
983	case -1:
984		info.entry_number = GUGS32_SEL;
985		break;
986	default:
987		return (EINVAL);
988	}
989
990	/*
991	 * We have to copy out the GDT entry we use.
992	 *
993	 * XXX: What if a user space program does not check the return value
994	 * and tries to use 6, 7 or 8?
995	 */
996	error = copyout(&info, args->desc, sizeof(struct l_user_desc));
997	if (error)
998		return (error);
999
1000	if (LINUX_LDT_empty(&info)) {
1001		a[0] = 0;
1002		a[1] = 0;
1003	} else {
1004		a[0] = LINUX_LDT_entry_a(&info);
1005		a[1] = LINUX_LDT_entry_b(&info);
1006	}
1007
1008	memcpy(&sd, &a, sizeof(a));
1009#ifdef DEBUG
1010	if (ldebug(set_thread_area))
1011		printf("Segment created in set_thread_area: "
1012		    "lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, "
1013		    "type: %i, dpl: %i, p: %i, xx: %i, long: %i, "
1014		    "def32: %i, gran: %i\n",
1015		    sd.sd_lobase,
1016		    sd.sd_hibase,
1017		    sd.sd_lolimit,
1018		    sd.sd_hilimit,
1019		    sd.sd_type,
1020		    sd.sd_dpl,
1021		    sd.sd_p,
1022		    sd.sd_xx,
1023		    sd.sd_long,
1024		    sd.sd_def32,
1025		    sd.sd_gran);
1026#endif
1027
1028	pcb = td->td_pcb;
1029	pcb->pcb_gsbase = (register_t)info.base_addr;
1030	set_pcb_flags(pcb, PCB_32BIT);
1031	update_gdt_gsbase(td, info.base_addr);
1032
1033	return (0);
1034}
1035