/* linux_machdep.c revision 293572 */
1/*- 2 * Copyright (c) 2013 Dmitry Chagin 3 * Copyright (c) 2004 Tim J. Robbins 4 * Copyright (c) 2002 Doug Rabson 5 * Copyright (c) 2000 Marcel Moolenaar 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer 13 * in this position and unchanged. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. The name of the author may not be used to endorse or promote products 18 * derived from this software without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 22 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 23 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 29 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 */ 31 32#include <sys/cdefs.h> 33__FBSDID("$FreeBSD: stable/10/sys/amd64/linux/linux_machdep.c 293572 2016-01-09 17:22:51Z dchagin $"); 34 35#include <sys/param.h> 36#include <sys/kernel.h> 37#include <sys/systm.h> 38#include <sys/capability.h> 39#include <sys/dirent.h> 40#include <sys/file.h> 41#include <sys/fcntl.h> 42#include <sys/filedesc.h> 43#include <sys/clock.h> 44#include <sys/imgact.h> 45#include <sys/ktr.h> 46#include <sys/limits.h> 47#include <sys/lock.h> 48#include <sys/malloc.h> 49#include <sys/mman.h> 50#include <sys/mutex.h> 51#include <sys/priv.h> 52#include <sys/proc.h> 53#include <sys/resource.h> 54#include <sys/resourcevar.h> 55#include <sys/sched.h> 56#include <sys/syscallsubr.h> 57#include <sys/sysproto.h> 58#include <sys/vnode.h> 59#include <sys/unistd.h> 60#include <sys/wait.h> 61 62#include <security/mac/mac_framework.h> 63 64#include <ufs/ufs/extattr.h> 65#include <ufs/ufs/quota.h> 66#include <ufs/ufs/ufsmount.h> 67 68#include <machine/frame.h> 69#include <machine/md_var.h> 70#include <machine/pcb.h> 71#include <machine/psl.h> 72#include <machine/segments.h> 73#include <machine/specialreg.h> 74 75#include <vm/vm.h> 76#include <vm/pmap.h> 77#include <vm/vm_extern.h> 78#include <vm/vm_kern.h> 79#include <vm/vm_map.h> 80 81#include <amd64/linux/linux.h> 82#include <amd64/linux/linux_proto.h> 83#include <compat/linux/linux_ipc.h> 84#include <compat/linux/linux_file.h> 85#include <compat/linux/linux_misc.h> 86#include <compat/linux/linux_signal.h> 87#include <compat/linux/linux_util.h> 88#include <compat/linux/linux_emul.h> 89 90 91int 92linux_execve(struct thread *td, struct linux_execve_args *args) 93{ 94 struct image_args eargs; 95 char *path; 96 int error; 97 98 LCONVPATHEXIST(td, args->path, &path); 99 100 LINUX_CTR(execve); 101 102 error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, args->argp, 103 args->envp); 104 free(path, M_TEMP); 105 if (error == 0) 106 error = linux_common_execve(td, &eargs); 107 return (error); 108} 109 110int 
111linux_set_upcall_kse(struct thread *td, register_t stack) 112{ 113 114 if (stack) 115 td->td_frame->tf_rsp = stack; 116 117 /* 118 * The newly created Linux thread returns 119 * to the user space by the same path that a parent do. 120 */ 121 td->td_frame->tf_rax = 0; 122 return (0); 123} 124 125#define STACK_SIZE (2 * 1024 * 1024) 126#define GUARD_SIZE (4 * PAGE_SIZE) 127 128int 129linux_mmap2(struct thread *td, struct linux_mmap2_args *args) 130{ 131 struct proc *p = td->td_proc; 132 struct mmap_args /* { 133 caddr_t addr; 134 size_t len; 135 int prot; 136 int flags; 137 int fd; 138 long pad; 139 off_t pos; 140 } */ bsd_args; 141 int error; 142 struct file *fp; 143 cap_rights_t rights; 144 145 LINUX_CTR6(mmap2, "0x%lx, %ld, %ld, 0x%08lx, %ld, 0x%lx", 146 args->addr, args->len, args->prot, 147 args->flags, args->fd, args->pgoff); 148 149 error = 0; 150 bsd_args.flags = 0; 151 fp = NULL; 152 153 /* 154 * Linux mmap(2): 155 * You must specify exactly one of MAP_SHARED and MAP_PRIVATE 156 */ 157 if (! ((args->flags & LINUX_MAP_SHARED) ^ 158 (args->flags & LINUX_MAP_PRIVATE))) 159 return (EINVAL); 160 161 if (args->flags & LINUX_MAP_SHARED) 162 bsd_args.flags |= MAP_SHARED; 163 if (args->flags & LINUX_MAP_PRIVATE) 164 bsd_args.flags |= MAP_PRIVATE; 165 if (args->flags & LINUX_MAP_FIXED) 166 bsd_args.flags |= MAP_FIXED; 167 if (args->flags & LINUX_MAP_ANON) 168 bsd_args.flags |= MAP_ANON; 169 else 170 bsd_args.flags |= MAP_NOSYNC; 171 if (args->flags & LINUX_MAP_GROWSDOWN) 172 bsd_args.flags |= MAP_STACK; 173 174 /* 175 * PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC 176 * on Linux/i386. We do this to ensure maximum compatibility. 177 * Linux/ia64 does the same in i386 emulation mode. 178 */ 179 bsd_args.prot = args->prot; 180 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 181 bsd_args.prot |= PROT_READ | PROT_EXEC; 182 183 /* Linux does not check file descriptor when MAP_ANONYMOUS is set. 
*/ 184 bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : args->fd; 185 if (bsd_args.fd != -1) { 186 /* 187 * Linux follows Solaris mmap(2) description: 188 * The file descriptor fildes is opened with 189 * read permission, regardless of the 190 * protection options specified. 191 */ 192 193 error = fget(td, bsd_args.fd, 194 cap_rights_init(&rights, CAP_MMAP), &fp); 195 if (error != 0 ) 196 return (error); 197 if (fp->f_type != DTYPE_VNODE) { 198 fdrop(fp, td); 199 return (EINVAL); 200 } 201 202 /* Linux mmap() just fails for O_WRONLY files */ 203 if (!(fp->f_flag & FREAD)) { 204 fdrop(fp, td); 205 return (EACCES); 206 } 207 208 fdrop(fp, td); 209 } 210 211 if (args->flags & LINUX_MAP_GROWSDOWN) { 212 /* 213 * The Linux MAP_GROWSDOWN option does not limit auto 214 * growth of the region. Linux mmap with this option 215 * takes as addr the inital BOS, and as len, the initial 216 * region size. It can then grow down from addr without 217 * limit. However, Linux threads has an implicit internal 218 * limit to stack size of STACK_SIZE. Its just not 219 * enforced explicitly in Linux. But, here we impose 220 * a limit of (STACK_SIZE - GUARD_SIZE) on the stack 221 * region, since we can do this with our mmap. 222 * 223 * Our mmap with MAP_STACK takes addr as the maximum 224 * downsize limit on BOS, and as len the max size of 225 * the region. It then maps the top SGROWSIZ bytes, 226 * and auto grows the region down, up to the limit 227 * in addr. 228 * 229 * If we don't use the MAP_STACK option, the effect 230 * of this code is to allocate a stack region of a 231 * fixed size of (STACK_SIZE - GUARD_SIZE). 232 */ 233 234 if ((caddr_t)PTRIN(args->addr) + args->len > 235 p->p_vmspace->vm_maxsaddr) { 236 /* 237 * Some Linux apps will attempt to mmap 238 * thread stacks near the top of their 239 * address space. 
If their TOS is greater 240 * than vm_maxsaddr, vm_map_growstack() 241 * will confuse the thread stack with the 242 * process stack and deliver a SEGV if they 243 * attempt to grow the thread stack past their 244 * current stacksize rlimit. To avoid this, 245 * adjust vm_maxsaddr upwards to reflect 246 * the current stacksize rlimit rather 247 * than the maximum possible stacksize. 248 * It would be better to adjust the 249 * mmap'ed region, but some apps do not check 250 * mmap's return value. 251 */ 252 PROC_LOCK(p); 253 p->p_vmspace->vm_maxsaddr = (char *)USRSTACK - 254 lim_cur(p, RLIMIT_STACK); 255 PROC_UNLOCK(p); 256 } 257 258 /* 259 * This gives us our maximum stack size and a new BOS. 260 * If we're using VM_STACK, then mmap will just map 261 * the top SGROWSIZ bytes, and let the stack grow down 262 * to the limit at BOS. If we're not using VM_STACK 263 * we map the full stack, since we don't have a way 264 * to autogrow it. 265 */ 266 if (args->len > STACK_SIZE - GUARD_SIZE) { 267 bsd_args.addr = (caddr_t)PTRIN(args->addr); 268 bsd_args.len = args->len; 269 } else { 270 bsd_args.addr = (caddr_t)PTRIN(args->addr) - 271 (STACK_SIZE - GUARD_SIZE - args->len); 272 bsd_args.len = STACK_SIZE - GUARD_SIZE; 273 } 274 } else { 275 bsd_args.addr = (caddr_t)PTRIN(args->addr); 276 bsd_args.len = args->len; 277 } 278 bsd_args.pos = (off_t)args->pgoff; 279 280 error = sys_mmap(td, &bsd_args); 281 282 LINUX_CTR2(mmap2, "return: %d (%p)", 283 error, td->td_retval[0]); 284 return (error); 285} 286 287int 288linux_mprotect(struct thread *td, struct linux_mprotect_args *uap) 289{ 290 struct mprotect_args bsd_args; 291 292 LINUX_CTR(mprotect); 293 294 bsd_args.addr = uap->addr; 295 bsd_args.len = uap->len; 296 bsd_args.prot = uap->prot; 297 if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC)) 298 bsd_args.prot |= PROT_READ | PROT_EXEC; 299 return (sys_mprotect(td, &bsd_args)); 300} 301 302int 303linux_iopl(struct thread *td, struct linux_iopl_args *args) 304{ 305 int 
error; 306 307 LINUX_CTR(iopl); 308 309 if (args->level > 3) 310 return (EINVAL); 311 if ((error = priv_check(td, PRIV_IO)) != 0) 312 return (error); 313 if ((error = securelevel_gt(td->td_ucred, 0)) != 0) 314 return (error); 315 td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) | 316 (args->level * (PSL_IOPL / 3)); 317 318 return (0); 319} 320 321int 322linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap) 323{ 324 l_sigset_t lmask; 325 sigset_t sigmask; 326 int error; 327 328 LINUX_CTR2(rt_sigsuspend, "%p, %ld", 329 uap->newset, uap->sigsetsize); 330 331 if (uap->sigsetsize != sizeof(l_sigset_t)) 332 return (EINVAL); 333 334 error = copyin(uap->newset, &lmask, sizeof(l_sigset_t)); 335 if (error) 336 return (error); 337 338 linux_to_bsd_sigset(&lmask, &sigmask); 339 return (kern_sigsuspend(td, sigmask)); 340} 341 342int 343linux_pause(struct thread *td, struct linux_pause_args *args) 344{ 345 struct proc *p = td->td_proc; 346 sigset_t sigmask; 347 348 LINUX_CTR(pause); 349 350 PROC_LOCK(p); 351 sigmask = td->td_sigmask; 352 PROC_UNLOCK(p); 353 return (kern_sigsuspend(td, sigmask)); 354} 355 356int 357linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap) 358{ 359 stack_t ss, oss; 360 l_stack_t lss; 361 int error; 362 363 LINUX_CTR2(sigaltstack, "%p, %p", uap->uss, uap->uoss); 364 365 if (uap->uss != NULL) { 366 error = copyin(uap->uss, &lss, sizeof(l_stack_t)); 367 if (error) 368 return (error); 369 370 ss.ss_sp = PTRIN(lss.ss_sp); 371 ss.ss_size = lss.ss_size; 372 ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags); 373 } 374 error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL, 375 (uap->uoss != NULL) ? 
&oss : NULL); 376 if (!error && uap->uoss != NULL) { 377 lss.ss_sp = PTROUT(oss.ss_sp); 378 lss.ss_size = oss.ss_size; 379 lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags); 380 error = copyout(&lss, uap->uoss, sizeof(l_stack_t)); 381 } 382 383 return (error); 384} 385 386/* XXX do all */ 387int 388linux_arch_prctl(struct thread *td, struct linux_arch_prctl_args *args) 389{ 390 int error; 391 struct pcb *pcb; 392 393 LINUX_CTR2(arch_prctl, "0x%x, %p", args->code, args->addr); 394 395 error = ENOTSUP; 396 pcb = td->td_pcb; 397 398 switch (args->code) { 399 case LINUX_ARCH_GET_GS: 400 error = copyout(&pcb->pcb_gsbase, (unsigned long *)args->addr, 401 sizeof(args->addr)); 402 break; 403 case LINUX_ARCH_SET_GS: 404 if (args->addr >= VM_MAXUSER_ADDRESS) 405 return(EPERM); 406 break; 407 case LINUX_ARCH_GET_FS: 408 error = copyout(&pcb->pcb_fsbase, (unsigned long *)args->addr, 409 sizeof(args->addr)); 410 break; 411 case LINUX_ARCH_SET_FS: 412 error = linux_set_cloned_tls(td, (void *)args->addr); 413 break; 414 default: 415 error = EINVAL; 416 } 417 return (error); 418} 419 420int 421linux_set_cloned_tls(struct thread *td, void *desc) 422{ 423 struct pcb *pcb; 424 425 if ((uint64_t)desc >= VM_MAXUSER_ADDRESS) 426 return (EPERM); 427 428 pcb = td->td_pcb; 429 pcb->pcb_fsbase = (register_t)desc; 430 td->td_frame->tf_fs = _ufssel; 431 432 return (0); 433} 434 435void 436linux_to_bsd_sigset(l_sigset_t *lss, sigset_t *bss) 437{ 438 int b, l; 439 440 SIGEMPTYSET(*bss); 441 for (l = 1; l <= LINUX_NSIG; l++) { 442 if (LINUX_SIGISMEMBER(*lss, l)) { 443 if (l <= LINUX_SIGTBLSZ) 444 b = linux_to_bsd_signal[_SIG_IDX(l)]; 445 else 446 b = l; 447 if (b) 448 SIGADDSET(*bss, b); 449 } 450 } 451} 452 453void 454bsd_to_linux_sigset(sigset_t *bss, l_sigset_t *lss) 455{ 456 int b, l; 457 458 LINUX_SIGEMPTYSET(*lss); 459 for (b = 1; b <= LINUX_NSIG; b++) { 460 if (SIGISMEMBER(*bss, b)) { 461 if (b <= LINUX_SIGTBLSZ) 462 l = bsd_to_linux_signal[_SIG_IDX(b)]; 463 else 464 l = b; 465 if (l) 
466 LINUX_SIGADDSET(*lss, l); 467 } 468 } 469} 470