/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
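/*
 * These guard sys_sigreturn() below: CS_SECURE accepts only code selectors
 * running at user privilege (RPL == SEL_UPL), and EFL_SECURE accepts a new
 * %rflags value only if it differs from the old one in user-changeable
 * bits, so a process cannot grant itself privileged or reserved flag bits.
 */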

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init =			msi_init,
#endif
};
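/*
 * Alternative environments (for example Xen PVH) may install their own
 * hooks here in place of the native ones; this table is just the
 * bare-metal default.
 */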

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define	ICH_PMBASE	0x400
#define	ICH_SMI_EN	(ICH_PMBASE + 0x30)

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to prevent the legacy USB circuit from
	 * generating an SMI#, because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports a reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

/*
 * Send a signal to the process.
 *
 * The user stack is set up so that the sigcode stored
 * at its top calls the handler, followed by a call
 * to the sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user-specified
 * pc and psl.
 */
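/*
 * A sketch of the resulting user stack, highest address first (assuming
 * extended FPU state needs to be saved; see the xfpusave logic below):
 *
 *	[ 128-byte ABI red zone, skipped       ]
 *	[ extended FPU state, 64-byte aligned  ]
 *	[ struct sigframe, 16-byte aligned     ] <- new %rsp
 */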
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_rsp - 128;	/* skip the ABI red zone */
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to clean up state after a signal
 * has been taken.  Reset the signal mask and
 * stack state from the context left by sendsig (above).
 * Return to the previous pc and psl as specified by
 * the context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
#endif

/*
 * Reset the hardware debug registers if they were in use.
 * They won't have any meaning for the newly exec'd process.
 */
void
x86_clear_dbregs(struct pcb *pcb)
{
	if ((pcb->pcb_flags & PCB_DBREGS) == 0)
		return;

	pcb->pcb_dr0 = 0;
	pcb->pcb_dr1 = 0;
	pcb->pcb_dr2 = 0;
	pcb->pcb_dr3 = 0;
	pcb->pcb_dr6 = 0;
	pcb->pcb_dr7 = 0;

	if (pcb == curpcb) {
		/*
		 * Clear the debug registers on the running CPU,
		 * otherwise they will end up affecting the next
		 * process we switch to.
		 */
		reset_dbregs();
	}
	clear_pcb_flags(pcb, PCB_DBREGS);
}

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	x86_clear_dbregs(pcb);

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
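/* Invoked from the ddb(4) prompt as "show idt". */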
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{
	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * i.e., a u_int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
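/*
 * For illustration only (hypothetical values): after parse_memmap the
 * physmap array holds [base, end) pairs such as
 *
 *	physmap[0] = 0x0000000000001000  physmap[1] = 0x000000000009f000
 *	physmap[2] = 0x0000000000100000  physmap[3] = 0x00000000bfe00000
 *
 * and physmap_idx is left pointing at the base of the last pair.
 */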
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
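	/* For example, hw.memtest.tests=1 in loader.conf(5) enables it. */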
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and the APs for the system start
	 * in real mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,
			 * non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that the recorded end points one page past
			 * the last good page, making the range
			 * >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up Maxmem
			 * so that we keep going. The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
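/*
 * In outline (architectural behaviour, not specific to this file):
 * EFER.SCE enables SYSCALL/SYSRET, MSR_LSTAR and MSR_CSTAR hold the 64-bit
 * and compat-mode entry points, MSR_STAR supplies the kernel (bits 47:32)
 * and user (bits 63:48) segment selector bases, and MSR_SF_MASK lists the
 * %rflags bits that the CPU clears on kernel entry.
 */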
1523void
1524amd64_conf_fast_syscall(void)
1525{
1526	uint64_t msr;
1527
1528	msr = rdmsr(MSR_EFER) | EFER_SCE;
1529	wrmsr(MSR_EFER, msr);
1530	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1531	    (u_int64_t)IDTVEC(fast_syscall));
1532	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1533	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1534	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1535	wrmsr(MSR_STAR, msr);
1536	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1537}
1538
1539void
1540amd64_bsp_pcpu_init1(struct pcpu *pc)
1541{
1542	struct user_segment_descriptor *gdt;
1543
1544	PCPU_SET(prvspace, pc);
1545	gdt = *PCPU_PTR(gdt);
1546	PCPU_SET(curthread, &thread0);
1547	PCPU_SET(tssp, PCPU_PTR(common_tss));
1548	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1549	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1550	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1551	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1552	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1553	PCPU_SET(smp_tlb_gen, 1);
1554}
1555
1556void
1557amd64_bsp_pcpu_init2(uint64_t rsp0)
1558{
1559
1560	PCPU_SET(rsp0, rsp0);
1561	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1562	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1563	PCPU_SET(curpcb, thread0.td_pcb);
1564}
1565
1566void
1567amd64_bsp_ist_init(struct pcpu *pc)
1568{
1569	struct nmi_pcpu *np;
1570	struct amd64tss *tssp;
1571
1572	tssp = &pc->pc_common_tss;
1573
1574	/* doublefault stack space, runs on ist1 */
1575	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1576	np->np_pcpu = (register_t)pc;
1577	tssp->tss_ist1 = (long)np;
1578
1579	/*
1580	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1581	 * above the start of the ist2 stack.
1582	 */
1583	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1584	np->np_pcpu = (register_t)pc;
1585	tssp->tss_ist2 = (long)np;
1586
1587	/*
1588	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1589	 * above the start of the ist3 stack.
1590	 */
1591	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1592	np->np_pcpu = (register_t)pc;
1593	tssp->tss_ist3 = (long)np;
1594
1595	/*
1596	 * DB# stack, runs on ist4.
1597	 */
1598	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1599	np->np_pcpu = (register_t)pc;
1600	tssp->tss_ist4 = (long)np;
1601}
1602
1603u_int64_t
1604hammer_time(u_int64_t modulep, u_int64_t physfree)
1605{
1606	caddr_t kmdp;
1607	int gsel_tss, x;
1608	struct pcpu *pc;
1609	struct xstate_hdr *xhdr;
1610	u_int64_t rsp0;
1611	char *env;
1612	struct user_segment_descriptor *gdt;
1613	struct region_descriptor r_gdt;
1614	size_t kstack0_sz;
1615	int late_console;
1616
1617	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1618
1619	kmdp = init_ops.parse_preload_data(modulep);
1620
1621	physfree += ucode_load_bsp(physfree + KERNBASE);
1622	physfree = roundup2(physfree, PAGE_SIZE);
1623
1624	identify_cpu1();
1625	identify_hypervisor();
1626	identify_cpu_fixup_bsp();
1627	identify_cpu2();
1628	initializecpucache();
1629
1630	/*
1631	 * Check for pti, pcid, and invpcid before ifuncs are
1632	 * resolved, to correctly select the implementation for
1633	 * pmap_activate_sw_mode().
1634	 */
1635	pti = pti_get_default();
1636	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1637	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1638	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1639		invpcid_works = (cpu_stdext_feature &
1640		    CPUID_STDEXT_INVPCID) != 0;
1641	} else {
1642		pmap_pcid_enabled = 0;
1643	}
1644
1645	link_elf_ireloc(kmdp);
1646
1647	/*
1648	 * This may be done better later if it gets more high level
1649	 * components in it. If so just link td->td_proc here.
1650	 */
1651	proc_linkup0(&proc0, &thread0);
1652
1653	/* Init basic tunables, hz etc */
1654	init_param1();
1655
1656	thread0.td_kstack = physfree + KERNBASE;
1657	thread0.td_kstack_pages = kstack_pages;
1658	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1659	bzero((void *)thread0.td_kstack, kstack0_sz);
1660	physfree += kstack0_sz;
1661
1662	/*
1663	 * Initialize enough of thread0 for delayed invalidation to
1664	 * work very early.  Rely on thread0.td_base_pri
1665	 * zero-initialization, it is reset to PVM at proc0_init().
1666	 */
1667	pmap_thread_init_invl_gen(&thread0);
1668
1669	pc = &temp_bsp_pcpu;
1670	pcpu_init(pc, 0, sizeof(struct pcpu));
1671	gdt = &temp_bsp_pcpu.pc_gdt[0];
1672
1673	/*
1674	 * make gdt memory segments
1675	 */
1676	for (x = 0; x < NGDT; x++) {
1677		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1678		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1679			ssdtosd(&gdt_segs[x], &gdt[x]);
1680	}
1681	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1682	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1683	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1684
1685	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1686	r_gdt.rd_base = (long)gdt;
1687	lgdt(&r_gdt);
1688
1689	wrmsr(MSR_FSBASE, 0);		/* User value */
1690	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1691	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1692
1693	dpcpu_init((void *)(physfree + KERNBASE), 0);
1694	physfree += DPCPU_SIZE;
1695	amd64_bsp_pcpu_init1(pc);
1696	/* Non-late cninit() and printf() can be moved up to here. */
1697
1698	/*
1699	 * Initialize mutexes.
1700	 *
1701	 * icu_lock: in order to allow an interrupt to occur in a critical
1702	 * 	     section, to set pcpu->ipending (etc...) properly, we
1703	 *	     must be able to get the icu lock, so it can't be
1704	 *	     under witness.
1705	 */
1706	mutex_init();
1707	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1708	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1709
1710	/* exceptions */
1711	for (x = 0; x < NIDT; x++)
1712		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1713		    SEL_KPL, 0);
1714	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1715	    SEL_KPL, 0);
1716	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1717	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1718	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1719	    SEL_UPL, 0);
1720	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1721	    SEL_UPL, 0);
1722	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1723	    SEL_KPL, 0);
1724	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1725	    SEL_KPL, 0);
1726	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1727	    SEL_KPL, 0);
1728	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1729	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1730	    SDT_SYSIGT, SEL_KPL, 0);
1731	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1732	    SEL_KPL, 0);
1733	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1734	    SDT_SYSIGT, SEL_KPL, 0);
1735	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1736	    SEL_KPL, 0);
1737	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1738	    SEL_KPL, 0);
1739	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1740	    SEL_KPL, 0);
1741	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1742	    SEL_KPL, 0);
1743	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1744	    SEL_KPL, 0);
1745	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1746	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1747	    SEL_KPL, 0);
1748#ifdef KDTRACE_HOOKS
1749	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1750	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1751#endif
1752#ifdef XENHVM
1753	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1754	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1755#endif
1756	r_idt.rd_limit = sizeof(idt0) - 1;
1757	r_idt.rd_base = (long) idt;
1758	lidt(&r_idt);
1759
1760	/*
1761	 * Initialize the clock before the console so that console
1762	 * initialization can use DELAY().
1763	 */
1764	clock_init();
1765
1766	/*
1767	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1768	 * transition).
1769	 * Once bootblocks have updated, we can test directly for
1770	 * efi_systbl != NULL here...
1771	 */
1772	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1773	    != NULL)
1774		vty_set_preferred(VTY_VT);
1775
1776	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1777	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1778
1779	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1780	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1781
1782	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
1783	    &syscall_ret_l1d_flush_mode);
1784
1785	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1786	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1787
1788	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1789
1790	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
1791	    &x86_rngds_mitg_enable);
1792
1793	finishidentcpu();	/* Final stage of CPU initialization */
1794	initializecpu();	/* Initialize CPU registers */
1795
1796	amd64_bsp_ist_init(pc);
1797
1798	/* Set the IO permission bitmap (empty due to tss seg limit) */
1799	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1800	    IOPERM_BITMAP_SIZE;
1801
1802	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1803	ltr(gsel_tss);
1804
1805	amd64_conf_fast_syscall();
1806
1807	/*
1808	 * We initialize the PCB pointer early so that exception
1809	 * handlers will work.  Also set up td_critnest to short-cut
1810	 * the page fault handler.
1811	 */
1812	cpu_max_ext_state_size = sizeof(struct savefpu);
1813	set_top_of_stack_td(&thread0);
1814	thread0.td_pcb = get_pcb_td(&thread0);
1815	thread0.td_critnest = 1;
1816
1817	/*
1818	 * The console and kdb should be initialized even earlier than here,
1819	 * but some console drivers don't work until after getmemsize().
1820	 * Default to late console initialization to support these drivers.
1821	 * This loses mainly printf()s in getmemsize() and early debugging.
1822	 */
1823	late_console = 1;
1824	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1825	if (!late_console) {
1826		cninit();
1827		amd64_kdb_init();
1828	}
1829
1830	getmemsize(kmdp, physfree);
1831	init_param2(physmem);
1832
1833	/* now running on new page tables, configured,and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Reinitialize thread0's stack base now that the xsave area size is
	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
	 * save area size.  Zero out the extended state header in fpu save area.
	 */
	set_top_of_stack_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

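/*
 * Early per-CPU area initialization hook.  The ACPI id is set to an
 * invalid sentinel here; it is normally filled in later, once the
 * firmware tables have been parsed.
 */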
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

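/*
 * Export the BIOS SMAP recorded by the loader, merged with the extended
 * attribute table when one is present, as an opaque sysctl.
 */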
static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");
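/*
 * The raw table can be inspected from userland with, e.g.,
 * "sysctl -x machdep.smap".
 */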
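/*
 * Similarly export the raw EFI memory map recorded by the loader; the
 * blob begins with its efi_map_header.
 */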
static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

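/*
 * Recursive spin lock support: the first spinlock_enter() on a thread
 * disables interrupts and enters a critical section, saving the previous
 * interrupt state; nested calls only bump the count.  spinlock_exit()
 * undoes one level and restores the saved state when the count drops
 * to zero.
 */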
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

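/*
 * Machine-dependent ptrace() backends: redirect the instruction pointer
 * and toggle single-stepping via the PSL_T (trace) flag in %rflags.
 */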
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}

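/* Copy a thread's user register state out of its trapframe. */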
int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9  = tp->tf_r9;
	regs->r_r8  = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	regs->r_err = 0;
	regs->r_trapno = 0;
	return (0);
}

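/*
 * Install new user register state, refusing rflags and %cs values that
 * would grant the thread more privilege than it already has.
 */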
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9  = tp->tf_r9;
	mcp->mc_r8  = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9  = mcp->mc_r9;
	tp->tf_r8  = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}

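/*
 * Copy the FPU state into the machine context; when xsave is in use, the
 * extended area beyond the legacy savefpu is copied into the caller's
 * xfpusave buffer as well, clamped to the hardware's extended state size.
 */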
static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
	len = xfpusave_len;
	if (len > max_len) {
		len = max_len;
		/* Zero the tail of the caller's buffer that we do not fill. */
		bzero(xfpusave + max_len, xfpusave_len - max_len);
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}

static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
		    xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

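/*
 * Read the debug registers: the live hardware registers of the current
 * CPU when td is NULL, otherwise the copies saved in the thread's PCB.
 */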
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an unexpected
		 * TRCTRAP or a general protection fault right here.
		 * The upper bits of dr6 and dr7 must not be set.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(register_t dr6)
{
	u_int64_t dr7;
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	bp = dr6 & DBREG_DR6_BMASK;
	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers.
		 */
		return (0);
	}

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * All GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers.
		 */
		return (0);
	}

	nbp = 0;

	/*
	 * At least one of the breakpoints was hit; check to see
	 * which ones and if any of them are user space addresses.
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/*
			 * addr[i] is in user space.
			 */
			return (nbp);
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return (0);
}


/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from interrupt context in cpu_switch(), or in the trap handler.
 * When we read-modify-write pcb_flags from C sources, the compiler may
 * generate code that is not atomic with respect to the interrupt handler.
 * If a trap or interrupt happens and any flag is modified from the handler,
 * it can be clobbered with the cached value later.  Therefore, we implement
 * setting and clearing flags with single-instruction functions, which do
 * not race with possible modification of the flags from the trap or
 * interrupt context, because traps and interrupts are executed only on
 * instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
 * base requires that the kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
 * the pcb if user space modified the bases.  We must save them on the
 * context switch or if the return to usermode happens through doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return (inb(port));
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

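/*
 * memset, memmove and memcpy may be defined as macros by headers (for
 * instance to interpose sanitizer wrappers); undefine them here so that
 * the definitions below supply the actual kernel symbols.
 */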
#undef memset
#undef memmove
#undef memcpy

void	*memset_std(void *buf, int c, size_t len);
void	*memset_erms(void *buf, int c, size_t len);
void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

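/*
 * pagezero is likewise selected at boot: prefer the ERMS variant when
 * the CPU advertises enhanced string operations.
 */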
void	pagezero_std(void *addr);
void	pagezero_erms(void *addr);
DEFINE_IFUNC(, void, pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}
