1/*- 2 * SPDX-License-Identifier: BSD-4-Clause 3 * 4 * Copyright (c) 2003 Peter Wemm. 5 * Copyright (c) 1992 Terrence R. Lambert. 6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * William Jolitz. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by the University of 23 * California, Berkeley and its contributors. 24 * 4. Neither the name of the University nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 38 * SUCH DAMAGE. 39 * 40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 41 */ 42 43#include <sys/cdefs.h> 44__FBSDID("$FreeBSD$"); 45 46#include "opt_atpic.h" 47#include "opt_cpu.h" 48#include "opt_ddb.h" 49#include "opt_inet.h" 50#include "opt_isa.h" 51#include "opt_kstack_pages.h" 52#include "opt_maxmem.h" 53#include "opt_mp_watchdog.h" 54#include "opt_pci.h" 55#include "opt_platform.h" 56#include "opt_sched.h" 57 58#include <sys/param.h> 59#include <sys/proc.h> 60#include <sys/systm.h> 61#include <sys/bio.h> 62#include <sys/buf.h> 63#include <sys/bus.h> 64#include <sys/callout.h> 65#include <sys/cons.h> 66#include <sys/cpu.h> 67#include <sys/csan.h> 68#include <sys/efi.h> 69#include <sys/eventhandler.h> 70#include <sys/exec.h> 71#include <sys/imgact.h> 72#include <sys/kdb.h> 73#include <sys/kernel.h> 74#include <sys/ktr.h> 75#include <sys/linker.h> 76#include <sys/lock.h> 77#include <sys/malloc.h> 78#include <sys/memrange.h> 79#include <sys/msgbuf.h> 80#include <sys/mutex.h> 81#include <sys/pcpu.h> 82#include <sys/ptrace.h> 83#include <sys/reboot.h> 84#include <sys/rwlock.h> 85#include <sys/sched.h> 86#include <sys/signalvar.h> 87#ifdef SMP 88#include <sys/smp.h> 89#endif 90#include <sys/syscallsubr.h> 91#include <sys/sysctl.h> 92#include <sys/sysent.h> 93#include <sys/sysproto.h> 94#include <sys/ucontext.h> 95#include <sys/vmmeter.h> 96 97#include <vm/vm.h> 98#include <vm/vm_param.h> 99#include <vm/vm_extern.h> 100#include <vm/vm_kern.h> 101#include 
<vm/vm_page.h> 102#include <vm/vm_map.h> 103#include <vm/vm_object.h> 104#include <vm/vm_pager.h> 105#include <vm/vm_phys.h> 106#include <vm/vm_dumpset.h> 107 108#ifdef DDB 109#ifndef KDB 110#error KDB must be enabled in order for DDB to work! 111#endif 112#include <ddb/ddb.h> 113#include <ddb/db_sym.h> 114#endif 115 116#include <net/netisr.h> 117 118#include <machine/clock.h> 119#include <machine/cpu.h> 120#include <machine/cputypes.h> 121#include <machine/frame.h> 122#include <machine/intr_machdep.h> 123#include <x86/mca.h> 124#include <machine/md_var.h> 125#include <machine/metadata.h> 126#include <machine/mp_watchdog.h> 127#include <machine/pc/bios.h> 128#include <machine/pcb.h> 129#include <machine/proc.h> 130#include <machine/reg.h> 131#include <machine/sigframe.h> 132#include <machine/specialreg.h> 133#include <machine/trap.h> 134#include <machine/tss.h> 135#include <x86/ucode.h> 136#include <x86/ifunc.h> 137#ifdef SMP 138#include <machine/smp.h> 139#endif 140#ifdef FDT 141#include <x86/fdt.h> 142#endif 143 144#ifdef DEV_ATPIC 145#include <x86/isa/icu.h> 146#else 147#include <x86/apicvar.h> 148#endif 149 150#include <isa/isareg.h> 151#include <isa/rtc.h> 152#include <x86/init.h> 153 154/* Sanity check for __curthread() */ 155CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); 156 157/* 158 * The PTI trampoline stack needs enough space for a hardware trapframe and a 159 * couple of scratch registers, as well as the trapframe left behind after an 160 * iret fault. 
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

/* 64-bit kernel bootstrap entry, called from locore with loader metadata. */
extern u_int64_t hammer_time(u_int64_t, u_int64_t);

/* A %cs selector is acceptable from userland only if it is user-privileged. */
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
/* Only user-changeable rflags bits may differ between old and new values. */
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/*
 * Default init_ops implementation: native (non-virtualized) boot.  A
 * hypervisor port may substitute its own hooks before these are called.
 */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init =			msi_init,
#endif
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
201 */ 202vm_paddr_t efi_systbl_phys; 203 204/* Intel ICH registers */ 205#define ICH_PMBASE 0x400 206#define ICH_SMI_EN ICH_PMBASE + 0x30 207 208int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel; 209 210int cold = 1; 211 212long Maxmem = 0; 213long realmem = 0; 214 215struct kva_md_info kmi; 216 217static struct trapframe proc0_tf; 218struct region_descriptor r_idt; 219 220struct pcpu *__pcpu; 221struct pcpu temp_bsp_pcpu; 222 223struct mtx icu_lock; 224 225struct mem_range_softc mem_range_softc; 226 227struct mtx dt_lock; /* lock for GDT and LDT */ 228 229void (*vmm_resume_p)(void); 230 231static void 232cpu_startup(dummy) 233 void *dummy; 234{ 235 uintmax_t memsize; 236 char *sysenv; 237 238 /* 239 * On MacBooks, we need to disallow the legacy USB circuit to 240 * generate an SMI# because this can cause several problems, 241 * namely: incorrect CPU frequency detection and failure to 242 * start the APs. 243 * We do this by disabling a bit in the SMI_EN (SMI Control and 244 * Enable register) of the Intel ICH LPC Interface Bridge. 245 */ 246 sysenv = kern_getenv("smbios.system.product"); 247 if (sysenv != NULL) { 248 if (strncmp(sysenv, "MacBook1,1", 10) == 0 || 249 strncmp(sysenv, "MacBook3,1", 10) == 0 || 250 strncmp(sysenv, "MacBook4,1", 10) == 0 || 251 strncmp(sysenv, "MacBookPro1,1", 13) == 0 || 252 strncmp(sysenv, "MacBookPro1,2", 13) == 0 || 253 strncmp(sysenv, "MacBookPro3,1", 13) == 0 || 254 strncmp(sysenv, "MacBookPro4,1", 13) == 0 || 255 strncmp(sysenv, "Macmini1,1", 10) == 0) { 256 if (bootverbose) 257 printf("Disabling LEGACY_USB_EN bit on " 258 "Intel ICH.\n"); 259 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); 260 } 261 freeenv(sysenv); 262 } 263 264 /* 265 * Good {morning,afternoon,evening,night}. 266 */ 267 startrtclock(); 268 printcpuinfo(); 269 270 /* 271 * Display physical memory if SMBIOS reports reasonable amount. 
272 */ 273 memsize = 0; 274 sysenv = kern_getenv("smbios.memory.enabled"); 275 if (sysenv != NULL) { 276 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; 277 freeenv(sysenv); 278 } 279 if (memsize < ptoa((uintmax_t)vm_free_count())) 280 memsize = ptoa((uintmax_t)Maxmem); 281 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); 282 realmem = atop(memsize); 283 284 /* 285 * Display any holes after the first chunk of extended memory. 286 */ 287 if (bootverbose) { 288 int indx; 289 290 printf("Physical memory chunk(s):\n"); 291 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { 292 vm_paddr_t size; 293 294 size = phys_avail[indx + 1] - phys_avail[indx]; 295 printf( 296 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", 297 (uintmax_t)phys_avail[indx], 298 (uintmax_t)phys_avail[indx + 1] - 1, 299 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); 300 } 301 } 302 303 vm_ksubmap_init(&kmi); 304 305 printf("avail memory = %ju (%ju MB)\n", 306 ptoa((uintmax_t)vm_free_count()), 307 ptoa((uintmax_t)vm_free_count()) / 1048576); 308#ifdef DEV_PCI 309 if (bootverbose && intel_graphics_stolen_base != 0) 310 printf("intel stolen mem: base %#jx size %ju MB\n", 311 (uintmax_t)intel_graphics_stolen_base, 312 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024); 313#endif 314 315 /* 316 * Set up buffers, so they can be used to read disk labels. 317 */ 318 bufinit(); 319 vm_pager_bufferinit(); 320 321 cpu_setregs(); 322} 323 324static void 325late_ifunc_resolve(void *dummy __unused) 326{ 327 link_elf_late_ireloc(); 328} 329SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL); 330 331/* 332 * Send an interrupt to process. 333 * 334 * Stack is set up to allow sigcode stored 335 * at top to call routine, followed by call 336 * to sigreturn routine below. After sigreturn 337 * resets the signal mask, the stack, and the 338 * frame pointer, it returns to the user 339 * specified pc, psl. 
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	/*
	 * If XSAVE extended state is in use, stage it in a kernel stack
	 * buffer so it can be copied out alongside the sigframe below.
	 */
	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		/* 128 bytes skips the amd64 ABI red zone below %rsp. */
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		/* XSAVE area must be 64-byte aligned. */
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	/* Drop locks before copyout(), which may fault and sleep. */
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		/* Unwritable stack: the process cannot take signals; kill it. */
		sigexit(td, SIGILL);
	}

	/* Redirect the trapframe so the handler runs on return to user mode. */
	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
467 * 468 * MPSAFE 469 */ 470int 471sys_sigreturn(td, uap) 472 struct thread *td; 473 struct sigreturn_args /* { 474 const struct __ucontext *sigcntxp; 475 } */ *uap; 476{ 477 ucontext_t uc; 478 struct pcb *pcb; 479 struct proc *p; 480 struct trapframe *regs; 481 ucontext_t *ucp; 482 char *xfpustate; 483 size_t xfpustate_len; 484 long rflags; 485 int cs, error, ret; 486 ksiginfo_t ksi; 487 488 pcb = td->td_pcb; 489 p = td->td_proc; 490 491 error = copyin(uap->sigcntxp, &uc, sizeof(uc)); 492 if (error != 0) { 493 uprintf("pid %d (%s): sigreturn copyin failed\n", 494 p->p_pid, td->td_name); 495 return (error); 496 } 497 ucp = &uc; 498 if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { 499 uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, 500 td->td_name, ucp->uc_mcontext.mc_flags); 501 return (EINVAL); 502 } 503 regs = td->td_frame; 504 rflags = ucp->uc_mcontext.mc_rflags; 505 /* 506 * Don't allow users to change privileged or reserved flags. 507 */ 508 if (!EFL_SECURE(rflags, regs->tf_rflags)) { 509 uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid, 510 td->td_name, rflags); 511 return (EINVAL); 512 } 513 514 /* 515 * Don't allow users to load a valid privileged %cs. Let the 516 * hardware check for invalid selectors, excess privilege in 517 * other selectors, invalid %eip's and invalid %esp's. 
518 */ 519 cs = ucp->uc_mcontext.mc_cs; 520 if (!CS_SECURE(cs)) { 521 uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid, 522 td->td_name, cs); 523 ksiginfo_init_trap(&ksi); 524 ksi.ksi_signo = SIGBUS; 525 ksi.ksi_code = BUS_OBJERR; 526 ksi.ksi_trapno = T_PROTFLT; 527 ksi.ksi_addr = (void *)regs->tf_rip; 528 trapsignal(td, &ksi); 529 return (EINVAL); 530 } 531 532 if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { 533 xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; 534 if (xfpustate_len > cpu_max_ext_state_size - 535 sizeof(struct savefpu)) { 536 uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", 537 p->p_pid, td->td_name, xfpustate_len); 538 return (EINVAL); 539 } 540 xfpustate = __builtin_alloca(xfpustate_len); 541 error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, 542 xfpustate, xfpustate_len); 543 if (error != 0) { 544 uprintf( 545 "pid %d (%s): sigreturn copying xfpustate failed\n", 546 p->p_pid, td->td_name); 547 return (error); 548 } 549 } else { 550 xfpustate = NULL; 551 xfpustate_len = 0; 552 } 553 ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); 554 if (ret != 0) { 555 uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n", 556 p->p_pid, td->td_name, ret); 557 return (ret); 558 } 559 bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); 560 update_pcb_bases(pcb); 561 pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase; 562 pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase; 563 564#if defined(COMPAT_43) 565 if (ucp->uc_mcontext.mc_onstack & 1) 566 td->td_sigstk.ss_flags |= SS_ONSTACK; 567 else 568 td->td_sigstk.ss_flags &= ~SS_ONSTACK; 569#endif 570 571 kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); 572 return (EJUSTRETURN); 573} 574 575#ifdef COMPAT_FREEBSD4 576int 577freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) 578{ 579 580 return sys_sigreturn(td, (struct sigreturn_args *)uap); 581} 582#endif 583 584/* 585 * Reset the hardware debug registers if they were in use. 
586 * They won't have any meaning for the newly exec'd process. 587 */ 588void 589x86_clear_dbregs(struct pcb *pcb) 590{ 591 if ((pcb->pcb_flags & PCB_DBREGS) == 0) 592 return; 593 594 pcb->pcb_dr0 = 0; 595 pcb->pcb_dr1 = 0; 596 pcb->pcb_dr2 = 0; 597 pcb->pcb_dr3 = 0; 598 pcb->pcb_dr6 = 0; 599 pcb->pcb_dr7 = 0; 600 601 if (pcb == curpcb) { 602 /* 603 * Clear the debug registers on the running CPU, 604 * otherwise they will end up affecting the next 605 * process we switch to. 606 */ 607 reset_dbregs(); 608 } 609 clear_pcb_flags(pcb, PCB_DBREGS); 610} 611 612/* 613 * Reset registers to default values on exec. 614 */ 615void 616exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack) 617{ 618 struct trapframe *regs; 619 struct pcb *pcb; 620 register_t saved_rflags; 621 622 regs = td->td_frame; 623 pcb = td->td_pcb; 624 625 if (td->td_proc->p_md.md_ldt != NULL) 626 user_ldt_free(td); 627 628 update_pcb_bases(pcb); 629 pcb->pcb_fsbase = 0; 630 pcb->pcb_gsbase = 0; 631 clear_pcb_flags(pcb, PCB_32BIT); 632 pcb->pcb_initial_fpucw = __INITIAL_FPUCW__; 633 634 saved_rflags = regs->tf_rflags & PSL_T; 635 bzero((char *)regs, sizeof(struct trapframe)); 636 regs->tf_rip = imgp->entry_addr; 637 regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; 638 regs->tf_rdi = stack; /* argv */ 639 regs->tf_rflags = PSL_USER | saved_rflags; 640 regs->tf_ss = _udatasel; 641 regs->tf_cs = _ucodesel; 642 regs->tf_ds = _udatasel; 643 regs->tf_es = _udatasel; 644 regs->tf_fs = _ufssel; 645 regs->tf_gs = _ugssel; 646 regs->tf_flags = TF_HASSEGS; 647 648 x86_clear_dbregs(pcb); 649 650 /* 651 * Drop the FP state if we hold it, so that the process gets a 652 * clean FP state if it uses the FPU again. 653 */ 654 fpstate_drop(td); 655} 656 657void 658cpu_setregs(void) 659{ 660 register_t cr0; 661 662 cr0 = rcr0(); 663 /* 664 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the 665 * BSP. See the comments there about why we set them. 
 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/* Dedicated IST stacks for faults that must not reuse the kernel stack. */
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

/*
 * Install an interrupt/trap gate in the IDT.
 *
 * idx: vector number; func: handler entry point; typ: gate type
 * (SDT_SYSIGT etc.); dpl: privilege required to invoke via int;
 * ist: interrupt stack table index (0 = no stack switch).
 */
void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers.
*/ 875DB_SHOW_COMMAND(sysregs, db_show_sysregs) 876{ 877 struct { 878 uint16_t limit; 879 uint64_t base; 880 } __packed idtr, gdtr; 881 uint16_t ldt, tr; 882 883 __asm __volatile("sidt %0" : "=m" (idtr)); 884 db_printf("idtr\t0x%016lx/%04x\n", 885 (u_long)idtr.base, (u_int)idtr.limit); 886 __asm __volatile("sgdt %0" : "=m" (gdtr)); 887 db_printf("gdtr\t0x%016lx/%04x\n", 888 (u_long)gdtr.base, (u_int)gdtr.limit); 889 __asm __volatile("sldt %0" : "=r" (ldt)); 890 db_printf("ldtr\t0x%04x\n", ldt); 891 __asm __volatile("str %0" : "=r" (tr)); 892 db_printf("tr\t0x%04x\n", tr); 893 db_printf("cr0\t0x%016lx\n", rcr0()); 894 db_printf("cr2\t0x%016lx\n", rcr2()); 895 db_printf("cr3\t0x%016lx\n", rcr3()); 896 db_printf("cr4\t0x%016lx\n", rcr4()); 897 if (rcr4() & CR4_XSAVE) 898 db_printf("xcr0\t0x%016lx\n", rxcr(0)); 899 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER)); 900 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) 901 db_printf("FEATURES_CTL\t%016lx\n", 902 rdmsr(MSR_IA32_FEATURE_CONTROL)); 903 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR)); 904 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT)); 905 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE)); 906} 907 908DB_SHOW_COMMAND(dbregs, db_show_dbregs) 909{ 910 911 db_printf("dr0\t0x%016lx\n", rdr0()); 912 db_printf("dr1\t0x%016lx\n", rdr1()); 913 db_printf("dr2\t0x%016lx\n", rdr2()); 914 db_printf("dr3\t0x%016lx\n", rdr3()); 915 db_printf("dr6\t0x%016lx\n", rdr6()); 916 db_printf("dr7\t0x%016lx\n", rdr7()); 917} 918#endif 919 920void 921sdtossd(sd, ssd) 922 struct user_segment_descriptor *sd; 923 struct soft_segment_descriptor *ssd; 924{ 925 926 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; 927 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; 928 ssd->ssd_type = sd->sd_type; 929 ssd->ssd_dpl = sd->sd_dpl; 930 ssd->ssd_p = sd->sd_p; 931 ssd->ssd_long = sd->sd_long; 932 ssd->ssd_def32 = sd->sd_def32; 933 ssd->ssd_gran = sd->sd_gran; 934} 935 936void 937ssdtosd(ssd, sd) 938 struct 
soft_segment_descriptor *ssd; 939 struct user_segment_descriptor *sd; 940{ 941 942 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 943 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; 944 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 945 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 946 sd->sd_type = ssd->ssd_type; 947 sd->sd_dpl = ssd->ssd_dpl; 948 sd->sd_p = ssd->ssd_p; 949 sd->sd_long = ssd->ssd_long; 950 sd->sd_def32 = ssd->ssd_def32; 951 sd->sd_gran = ssd->ssd_gran; 952} 953 954void 955ssdtosyssd(ssd, sd) 956 struct soft_segment_descriptor *ssd; 957 struct system_segment_descriptor *sd; 958{ 959 960 sd->sd_lobase = (ssd->ssd_base) & 0xffffff; 961 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; 962 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; 963 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; 964 sd->sd_type = ssd->ssd_type; 965 sd->sd_dpl = ssd->ssd_dpl; 966 sd->sd_p = ssd->ssd_p; 967 sd->sd_gran = ssd->ssd_gran; 968} 969 970u_int basemem; 971 972static int 973add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, 974 int *physmap_idxp) 975{ 976 int i, insert_idx, physmap_idx; 977 978 physmap_idx = *physmap_idxp; 979 980 if (length == 0) 981 return (1); 982 983 /* 984 * Find insertion point while checking for overlap. Start off by 985 * assuming the new entry will be added to the end. 986 * 987 * NB: physmap_idx points to the next free slot. 988 */ 989 insert_idx = physmap_idx; 990 for (i = 0; i <= physmap_idx; i += 2) { 991 if (base < physmap[i + 1]) { 992 if (base + length <= physmap[i]) { 993 insert_idx = i; 994 break; 995 } 996 if (boothowto & RB_VERBOSE) 997 printf( 998 "Overlapping memory regions, ignoring second region\n"); 999 return (1); 1000 } 1001 } 1002 1003 /* See if we can prepend to the next entry. */ 1004 if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { 1005 physmap[insert_idx] = base; 1006 return (1); 1007 } 1008 1009 /* See if we can append to the previous entry. 
 */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

/*
 * Walk the BIOS INT 15h E820 map supplied by the loader and add every
 * SMAP_TYPE_MEMORY range to the physmap array.
 */
void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		/* add_physmap_entry() returns 0 only when physmap is full. */
		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

/*
 * Walk the UEFI memory map supplied by the loader, optionally printing
 * each descriptor, and add every usable range to the physmap array.
 */
static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	/* Indexed by EFI memory descriptor type. */
	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

/*
 * Locate the firmware memory map in the loader metadata and dispatch to
 * the EFI or BIOS parser; records which boot method was used in the
 * machdep.bootmethod sysctl.
 */
static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	/* Prefer the EFI map when both are present. */
	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;	/* index of the last base/bound pair */

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	/* MAXMEM is specified in kilobytes; Maxmem counts 4K pages. */
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one higher than end,
			 * making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

/*
 * Digest the preload metadata handed over by the boot loader: relocate
 * it, locate the kernel's module entry (kmdp), pick up boothowto and the
 * static kernel environment, feed ddb its symbol table, and record the
 * EFI system table address.  Returns kmdp.
 */
static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

/*
 * Initialize the kernel debugger and, when the boot flags request it
 * (RB_KDB), enter it immediately.
 */
static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ?
	    (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

/*
 * First-stage BSP per-CPU setup: publish the pcpu area and wire up the
 * pointers (TSS, LDT, 32-bit %fs/%gs descriptors) that live in the
 * freshly built GDT.
 */
void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

/*
 * Second-stage BSP per-CPU setup, run once thread0's kernel stack base
 * (rsp0) is known.  The PTI trampoline stack top is forced to 16-byte
 * alignment.
 */
void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

/*
 * Point the TSS interrupt-stack-table entries at the dedicated
 * per-event stacks (double fault, NMI, MC#, DB#).  A struct nmi_pcpu
 * holding the pcpu pointer is placed just below the top of each stack.
 */
void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

/*
 * Machine-dependent bootstrap for amd64, called with the loader module
 * pointer and the first free physical address.  Identifies the CPU,
 * builds the GDT/IDT/TSS, sizes memory, and returns the kernel stack
 * base for locore to switch onto.
 */
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0);		/* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */

	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	/* Load the freshly populated IDT. */
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long) idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	/*
	 * Honor both the legacy and the machdep.mitigations.* spellings
	 * of the speculative-execution mitigation tunables.
	 */
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Reinitialize thread0's stack base now that the xsave area size is
	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
	 * save area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	set_top_of_stack_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

/*
 * Per-CPU area initialization hook; the ACPI id is filled in later,
 * start it at an invalid sentinel.
 */
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

/*
 * Export the raw BIOS SMAP table (with optional extended attributes)
 * supplied by the loader via the machdep.smap sysctl.
 */
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	/* The loader stores the table size, in bytes, just before the table. */
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

/*
 * Export the raw EFI memory map (as passed by the loader) via the
 * machdep.efi_map sysctl; yields nothing if the system did not boot
 * through UEFI.
 */
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	/* The loader stores the map size, in bytes, just before the header. */
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

/*
 * MD spinlock entry: on the outermost acquisition save the interrupt
 * state, disable interrupts and pin the thread in a critical section;
 * nested acquisitions only bump the per-thread count.
 */
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

/*
 * Undo spinlock_enter(): on the outermost release leave the critical
 * section and restore the interrupt state saved at entry.
 */
void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

/*
 * Point the thread's trapframe %rip at addr and request the full IRET
 * return path (PCB_FULL_IRET) on the way back out of the kernel.
 */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/*
 * Arm hardware single-stepping by setting PSL_T (the x86 trap flag) in
 * the thread's trapframe; TDB_STEP records that we set it.
 */
int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}

/* Disarm single-stepping: clear PSL_T and the TDB_STEP bookkeeping bit. */
int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}

/* Copy the thread's current trapframe into a struct reg. */
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

/* Externalize an arbitrary trapframe into a struct reg. */
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	/* Segment registers are only present when the frame carries them. */
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	regs->r_err = 0;
	regs->r_trapno = 0;
	return (0);
}

/*
 * Install a struct reg into the thread's trapframe, rejecting rflags
 * and %cs values that fail the EFL_SECURE()/CS_SECURE() checks.
 */
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	/* Only accept mxcsr mask bits the CPU actually supports. */
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i],
sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); 2222 2223 /* SSE registers */ 2224 for (i = 0; i < 16; ++i) 2225 bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); 2226} 2227 2228/* externalize from td->pcb */ 2229int 2230fill_fpregs(struct thread *td, struct fpreg *fpregs) 2231{ 2232 2233 KASSERT(td == curthread || TD_IS_SUSPENDED(td) || 2234 P_SHOULDSTOP(td->td_proc), 2235 ("not suspended thread %p", td)); 2236 fpugetregs(td); 2237 fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs); 2238 return (0); 2239} 2240 2241/* internalize to td->pcb */ 2242int 2243set_fpregs(struct thread *td, struct fpreg *fpregs) 2244{ 2245 2246 critical_enter(); 2247 set_fpregs_xmm(fpregs, get_pcb_user_save_td(td)); 2248 fpuuserinited(td); 2249 critical_exit(); 2250 return (0); 2251} 2252 2253/* 2254 * Get machine context. 2255 */ 2256int 2257get_mcontext(struct thread *td, mcontext_t *mcp, int flags) 2258{ 2259 struct pcb *pcb; 2260 struct trapframe *tp; 2261 2262 pcb = td->td_pcb; 2263 tp = td->td_frame; 2264 PROC_LOCK(curthread->td_proc); 2265 mcp->mc_onstack = sigonstack(tp->tf_rsp); 2266 PROC_UNLOCK(curthread->td_proc); 2267 mcp->mc_r15 = tp->tf_r15; 2268 mcp->mc_r14 = tp->tf_r14; 2269 mcp->mc_r13 = tp->tf_r13; 2270 mcp->mc_r12 = tp->tf_r12; 2271 mcp->mc_r11 = tp->tf_r11; 2272 mcp->mc_r10 = tp->tf_r10; 2273 mcp->mc_r9 = tp->tf_r9; 2274 mcp->mc_r8 = tp->tf_r8; 2275 mcp->mc_rdi = tp->tf_rdi; 2276 mcp->mc_rsi = tp->tf_rsi; 2277 mcp->mc_rbp = tp->tf_rbp; 2278 mcp->mc_rbx = tp->tf_rbx; 2279 mcp->mc_rcx = tp->tf_rcx; 2280 mcp->mc_rflags = tp->tf_rflags; 2281 if (flags & GET_MC_CLEAR_RET) { 2282 mcp->mc_rax = 0; 2283 mcp->mc_rdx = 0; 2284 mcp->mc_rflags &= ~PSL_C; 2285 } else { 2286 mcp->mc_rax = tp->tf_rax; 2287 mcp->mc_rdx = tp->tf_rdx; 2288 } 2289 mcp->mc_rip = tp->tf_rip; 2290 mcp->mc_cs = tp->tf_cs; 2291 mcp->mc_rsp = tp->tf_rsp; 2292 mcp->mc_ss = tp->tf_ss; 2293 mcp->mc_ds = tp->tf_ds; 2294 mcp->mc_es = tp->tf_es; 2295 mcp->mc_fs = tp->tf_fs; 2296 mcp->mc_gs = tp->tf_gs; 2297 
mcp->mc_flags = tp->tf_flags; 2298 mcp->mc_len = sizeof(*mcp); 2299 get_fpcontext(td, mcp, NULL, 0); 2300 update_pcb_bases(pcb); 2301 mcp->mc_fsbase = pcb->pcb_fsbase; 2302 mcp->mc_gsbase = pcb->pcb_gsbase; 2303 mcp->mc_xfpustate = 0; 2304 mcp->mc_xfpustate_len = 0; 2305 bzero(mcp->mc_spare, sizeof(mcp->mc_spare)); 2306 return (0); 2307} 2308 2309/* 2310 * Set machine context. 2311 * 2312 * However, we don't set any but the user modifiable flags, and we won't 2313 * touch the cs selector. 2314 */ 2315int 2316set_mcontext(struct thread *td, mcontext_t *mcp) 2317{ 2318 struct pcb *pcb; 2319 struct trapframe *tp; 2320 char *xfpustate; 2321 long rflags; 2322 int ret; 2323 2324 pcb = td->td_pcb; 2325 tp = td->td_frame; 2326 if (mcp->mc_len != sizeof(*mcp) || 2327 (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) 2328 return (EINVAL); 2329 rflags = (mcp->mc_rflags & PSL_USERCHANGE) | 2330 (tp->tf_rflags & ~PSL_USERCHANGE); 2331 if (mcp->mc_flags & _MC_HASFPXSTATE) { 2332 if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - 2333 sizeof(struct savefpu)) 2334 return (EINVAL); 2335 xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); 2336 ret = copyin((void *)mcp->mc_xfpustate, xfpustate, 2337 mcp->mc_xfpustate_len); 2338 if (ret != 0) 2339 return (ret); 2340 } else 2341 xfpustate = NULL; 2342 ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); 2343 if (ret != 0) 2344 return (ret); 2345 tp->tf_r15 = mcp->mc_r15; 2346 tp->tf_r14 = mcp->mc_r14; 2347 tp->tf_r13 = mcp->mc_r13; 2348 tp->tf_r12 = mcp->mc_r12; 2349 tp->tf_r11 = mcp->mc_r11; 2350 tp->tf_r10 = mcp->mc_r10; 2351 tp->tf_r9 = mcp->mc_r9; 2352 tp->tf_r8 = mcp->mc_r8; 2353 tp->tf_rdi = mcp->mc_rdi; 2354 tp->tf_rsi = mcp->mc_rsi; 2355 tp->tf_rbp = mcp->mc_rbp; 2356 tp->tf_rbx = mcp->mc_rbx; 2357 tp->tf_rdx = mcp->mc_rdx; 2358 tp->tf_rcx = mcp->mc_rcx; 2359 tp->tf_rax = mcp->mc_rax; 2360 tp->tf_rip = mcp->mc_rip; 2361 tp->tf_rflags = rflags; 2362 tp->tf_rsp = mcp->mc_rsp; 2363 tp->tf_ss = mcp->mc_ss; 2364 tp->tf_flags = 
mcp->mc_flags; 2365 if (tp->tf_flags & TF_HASSEGS) { 2366 tp->tf_ds = mcp->mc_ds; 2367 tp->tf_es = mcp->mc_es; 2368 tp->tf_fs = mcp->mc_fs; 2369 tp->tf_gs = mcp->mc_gs; 2370 } 2371 set_pcb_flags(pcb, PCB_FULL_IRET); 2372 if (mcp->mc_flags & _MC_HASBASES) { 2373 pcb->pcb_fsbase = mcp->mc_fsbase; 2374 pcb->pcb_gsbase = mcp->mc_gsbase; 2375 } 2376 return (0); 2377} 2378 2379static void 2380get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, 2381 size_t xfpusave_len) 2382{ 2383 size_t max_len, len; 2384 2385 mcp->mc_ownedfp = fpugetregs(td); 2386 bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], 2387 sizeof(mcp->mc_fpstate)); 2388 mcp->mc_fpformat = fpuformat(); 2389 if (!use_xsave || xfpusave_len == 0) 2390 return; 2391 max_len = cpu_max_ext_state_size - sizeof(struct savefpu); 2392 len = xfpusave_len; 2393 if (len > max_len) { 2394 len = max_len; 2395 bzero(xfpusave + max_len, len - max_len); 2396 } 2397 mcp->mc_flags |= _MC_HASFPXSTATE; 2398 mcp->mc_xfpustate_len = len; 2399 bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); 2400} 2401 2402static int 2403set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, 2404 size_t xfpustate_len) 2405{ 2406 int error; 2407 2408 if (mcp->mc_fpformat == _MC_FPFMT_NODEV) 2409 return (0); 2410 else if (mcp->mc_fpformat != _MC_FPFMT_XMM) 2411 return (EINVAL); 2412 else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { 2413 /* We don't care what state is left in the FPU or PCB. 
*/ 2414 fpstate_drop(td); 2415 error = 0; 2416 } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || 2417 mcp->mc_ownedfp == _MC_FPOWNED_PCB) { 2418 error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate, 2419 xfpustate, xfpustate_len); 2420 } else 2421 return (EINVAL); 2422 return (error); 2423} 2424 2425void 2426fpstate_drop(struct thread *td) 2427{ 2428 2429 KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); 2430 critical_enter(); 2431 if (PCPU_GET(fpcurthread) == td) 2432 fpudrop(); 2433 /* 2434 * XXX force a full drop of the fpu. The above only drops it if we 2435 * owned it. 2436 * 2437 * XXX I don't much like fpugetuserregs()'s semantics of doing a full 2438 * drop. Dropping only to the pcb matches fnsave's behaviour. 2439 * We only need to drop to !PCB_INITDONE in sendsig(). But 2440 * sendsig() is the only caller of fpugetuserregs()... perhaps we just 2441 * have too many layers. 2442 */ 2443 clear_pcb_flags(curthread->td_pcb, 2444 PCB_FPUINITDONE | PCB_USERFPUINITDONE); 2445 critical_exit(); 2446} 2447 2448int 2449fill_dbregs(struct thread *td, struct dbreg *dbregs) 2450{ 2451 struct pcb *pcb; 2452 2453 if (td == NULL) { 2454 dbregs->dr[0] = rdr0(); 2455 dbregs->dr[1] = rdr1(); 2456 dbregs->dr[2] = rdr2(); 2457 dbregs->dr[3] = rdr3(); 2458 dbregs->dr[6] = rdr6(); 2459 dbregs->dr[7] = rdr7(); 2460 } else { 2461 pcb = td->td_pcb; 2462 dbregs->dr[0] = pcb->pcb_dr0; 2463 dbregs->dr[1] = pcb->pcb_dr1; 2464 dbregs->dr[2] = pcb->pcb_dr2; 2465 dbregs->dr[3] = pcb->pcb_dr3; 2466 dbregs->dr[6] = pcb->pcb_dr6; 2467 dbregs->dr[7] = pcb->pcb_dr7; 2468 } 2469 dbregs->dr[4] = 0; 2470 dbregs->dr[5] = 0; 2471 dbregs->dr[8] = 0; 2472 dbregs->dr[9] = 0; 2473 dbregs->dr[10] = 0; 2474 dbregs->dr[11] = 0; 2475 dbregs->dr[12] = 0; 2476 dbregs->dr[13] = 0; 2477 dbregs->dr[14] = 0; 2478 dbregs->dr[15] = 0; 2479 return (0); 2480} 2481 2482int 2483set_dbregs(struct thread *td, struct dbreg *dbregs) 2484{ 2485 struct pcb *pcb; 2486 int i; 2487 2488 if (td == NULL) 
{ 2489 load_dr0(dbregs->dr[0]); 2490 load_dr1(dbregs->dr[1]); 2491 load_dr2(dbregs->dr[2]); 2492 load_dr3(dbregs->dr[3]); 2493 load_dr6(dbregs->dr[6]); 2494 load_dr7(dbregs->dr[7]); 2495 } else { 2496 /* 2497 * Don't let an illegal value for dr7 get set. Specifically, 2498 * check for undefined settings. Setting these bit patterns 2499 * result in undefined behaviour and can lead to an unexpected 2500 * TRCTRAP or a general protection fault right here. 2501 * Upper bits of dr6 and dr7 must not be set 2502 */ 2503 for (i = 0; i < 4; i++) { 2504 if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) 2505 return (EINVAL); 2506 if (td->td_frame->tf_cs == _ucode32sel && 2507 DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) 2508 return (EINVAL); 2509 } 2510 if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || 2511 (dbregs->dr[7] & 0xffffffff00000000ul) != 0) 2512 return (EINVAL); 2513 2514 pcb = td->td_pcb; 2515 2516 /* 2517 * Don't let a process set a breakpoint that is not within the 2518 * process's address space. If a process could do this, it 2519 * could halt the system by setting a breakpoint in the kernel 2520 * (if ddb was enabled). Thus, we need to check to make sure 2521 * that no breakpoints are being enabled for addresses outside 2522 * process's address space. 2523 * 2524 * XXX - what about when the watched area of the user's 2525 * address space is written into from within the kernel 2526 * ... wouldn't that still cause a breakpoint to be generated 2527 * from within kernel mode? 
2528 */ 2529 2530 if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { 2531 /* dr0 is enabled */ 2532 if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) 2533 return (EINVAL); 2534 } 2535 if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { 2536 /* dr1 is enabled */ 2537 if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) 2538 return (EINVAL); 2539 } 2540 if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { 2541 /* dr2 is enabled */ 2542 if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) 2543 return (EINVAL); 2544 } 2545 if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { 2546 /* dr3 is enabled */ 2547 if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) 2548 return (EINVAL); 2549 } 2550 2551 pcb->pcb_dr0 = dbregs->dr[0]; 2552 pcb->pcb_dr1 = dbregs->dr[1]; 2553 pcb->pcb_dr2 = dbregs->dr[2]; 2554 pcb->pcb_dr3 = dbregs->dr[3]; 2555 pcb->pcb_dr6 = dbregs->dr[6]; 2556 pcb->pcb_dr7 = dbregs->dr[7]; 2557 2558 set_pcb_flags(pcb, PCB_DBREGS); 2559 } 2560 2561 return (0); 2562} 2563 2564void 2565reset_dbregs(void) 2566{ 2567 2568 load_dr7(0); /* Turn off the control bits first */ 2569 load_dr0(0); 2570 load_dr1(0); 2571 load_dr2(0); 2572 load_dr3(0); 2573 load_dr6(0); 2574} 2575 2576/* 2577 * Return > 0 if a hardware breakpoint has been hit, and the 2578 * breakpoint was in user space. Return 0, otherwise. 
2579 */ 2580int 2581user_dbreg_trap(register_t dr6) 2582{ 2583 u_int64_t dr7; 2584 u_int64_t bp; /* breakpoint bits extracted from dr6 */ 2585 int nbp; /* number of breakpoints that triggered */ 2586 caddr_t addr[4]; /* breakpoint addresses */ 2587 int i; 2588 2589 bp = dr6 & DBREG_DR6_BMASK; 2590 if (bp == 0) { 2591 /* 2592 * None of the breakpoint bits are set meaning this 2593 * trap was not caused by any of the debug registers 2594 */ 2595 return 0; 2596 } 2597 2598 dr7 = rdr7(); 2599 if ((dr7 & 0x000000ff) == 0) { 2600 /* 2601 * all GE and LE bits in the dr7 register are zero, 2602 * thus the trap couldn't have been caused by the 2603 * hardware debug registers 2604 */ 2605 return 0; 2606 } 2607 2608 nbp = 0; 2609 2610 /* 2611 * at least one of the breakpoints were hit, check to see 2612 * which ones and if any of them are user space addresses 2613 */ 2614 2615 if (bp & 0x01) { 2616 addr[nbp++] = (caddr_t)rdr0(); 2617 } 2618 if (bp & 0x02) { 2619 addr[nbp++] = (caddr_t)rdr1(); 2620 } 2621 if (bp & 0x04) { 2622 addr[nbp++] = (caddr_t)rdr2(); 2623 } 2624 if (bp & 0x08) { 2625 addr[nbp++] = (caddr_t)rdr3(); 2626 } 2627 2628 for (i = 0; i < nbp; i++) { 2629 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { 2630 /* 2631 * addr[i] is in user space 2632 */ 2633 return nbp; 2634 } 2635 } 2636 2637 /* 2638 * None of the breakpoints are in user space. 2639 */ 2640 return 0; 2641} 2642 2643/* 2644 * The pcb_flags is only modified by current thread, or by other threads 2645 * when current thread is stopped. However, current thread may change it 2646 * from the interrupt context in cpu_switch(), or in the trap handler. 2647 * When we read-modify-write pcb_flags from C sources, compiler may generate 2648 * code that is not atomic regarding the interrupt handler. If a trap or 2649 * interrupt happens and any flag is modified from the handler, it can be 2650 * clobbered with the cached value later. 
Therefore, we implement setting 2651 * and clearing flags with single-instruction functions, which do not race 2652 * with possible modification of the flags from the trap or interrupt context, 2653 * because traps and interrupts are executed only on instruction boundary. 2654 */ 2655void 2656set_pcb_flags_raw(struct pcb *pcb, const u_int flags) 2657{ 2658 2659 __asm __volatile("orl %1,%0" 2660 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags) 2661 : "cc", "memory"); 2662 2663} 2664 2665/* 2666 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs 2667 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into 2668 * pcb if user space modified the bases. We must save on the context 2669 * switch or if the return to usermode happens through the doreti. 2670 * 2671 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET, 2672 * which have a consequence that the base MSRs must be saved each time 2673 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with 2674 * context switches. 2675 */ 2676static void 2677set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags) 2678{ 2679 register_t r; 2680 2681 if (curpcb == pcb && 2682 (flags & PCB_FULL_IRET) != 0 && 2683 (pcb->pcb_flags & PCB_FULL_IRET) == 0) { 2684 r = intr_disable(); 2685 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) { 2686 if (rfs() == _ufssel) 2687 pcb->pcb_fsbase = rdfsbase(); 2688 if (rgs() == _ugssel) 2689 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE); 2690 } 2691 set_pcb_flags_raw(pcb, flags); 2692 intr_restore(r); 2693 } else { 2694 set_pcb_flags_raw(pcb, flags); 2695 } 2696} 2697 2698DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int)) 2699{ 2700 2701 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ? 
2702 set_pcb_flags_fsgsbase : set_pcb_flags_raw); 2703} 2704 2705void 2706clear_pcb_flags(struct pcb *pcb, const u_int flags) 2707{ 2708 2709 __asm __volatile("andl %1,%0" 2710 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags) 2711 : "cc", "memory"); 2712} 2713 2714#ifdef KDB 2715 2716/* 2717 * Provide inb() and outb() as functions. They are normally only available as 2718 * inline functions, thus cannot be called from the debugger. 2719 */ 2720 2721/* silence compiler warnings */ 2722u_char inb_(u_short); 2723void outb_(u_short, u_char); 2724 2725u_char 2726inb_(u_short port) 2727{ 2728 return inb(port); 2729} 2730 2731void 2732outb_(u_short port, u_char data) 2733{ 2734 outb(port, data); 2735} 2736 2737#endif /* KDB */ 2738 2739#undef memset 2740#undef memmove 2741#undef memcpy 2742 2743void *memset_std(void *buf, int c, size_t len); 2744void *memset_erms(void *buf, int c, size_t len); 2745void *memmove_std(void * _Nonnull dst, const void * _Nonnull src, 2746 size_t len); 2747void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src, 2748 size_t len); 2749void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src, 2750 size_t len); 2751void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src, 2752 size_t len); 2753 2754#ifdef KCSAN 2755/* 2756 * These fail to build as ifuncs when used with KCSAN. 2757 */ 2758void * 2759memset(void *buf, int c, size_t len) 2760{ 2761 2762 return (memset_std(buf, c, len)); 2763} 2764 2765void * 2766memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len) 2767{ 2768 2769 return (memmove_std(dst, src, len)); 2770} 2771 2772void * 2773memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len) 2774{ 2775 2776 return (memcpy_std(dst, src, len)); 2777} 2778#else 2779DEFINE_IFUNC(, void *, memset, (void *, int, size_t)) 2780{ 2781 2782 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 
2783 memset_erms : memset_std); 2784} 2785 2786DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull, 2787 size_t)) 2788{ 2789 2790 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 2791 memmove_erms : memmove_std); 2792} 2793 2794DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t)) 2795{ 2796 2797 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 2798 memcpy_erms : memcpy_std); 2799} 2800#endif 2801 2802void pagezero_std(void *addr); 2803void pagezero_erms(void *addr); 2804DEFINE_IFUNC(, void , pagezero, (void *)) 2805{ 2806 2807 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ? 2808 pagezero_erms : pagezero_std); 2809} 2810