1/*-
2 * Copyright (c) 2003 Peter Wemm.
3 * Copyright (c) 1992 Terrence R. Lambert.
4 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * William Jolitz.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: stable/11/sys/x86/x86/cpu_machdep.c 362383 2020-06-19 13:48:23Z kib $");
43
44#include "opt_atpic.h"
45#include "opt_compat.h"
46#include "opt_cpu.h"
47#include "opt_ddb.h"
48#include "opt_inet.h"
49#include "opt_isa.h"
50#include "opt_kdb.h"
51#include "opt_kstack_pages.h"
52#include "opt_maxmem.h"
53#include "opt_mp_watchdog.h"
54#include "opt_perfmon.h"
55#include "opt_platform.h"
56#ifdef __i386__
57#include "opt_apic.h"
58#include "opt_xbox.h"
59#endif
60
61#include <sys/param.h>
62#include <sys/proc.h>
63#include <sys/systm.h>
64#include <sys/bus.h>
65#include <sys/cpu.h>
66#include <sys/kdb.h>
67#include <sys/kernel.h>
68#include <sys/ktr.h>
69#include <sys/lock.h>
70#include <sys/malloc.h>
71#include <sys/mutex.h>
72#include <sys/pcpu.h>
73#include <sys/rwlock.h>
74#include <sys/sched.h>
75#include <sys/smp.h>
76#include <sys/sysctl.h>
77
78#include <machine/clock.h>
79#include <machine/cpu.h>
80#include <machine/cputypes.h>
81#include <machine/specialreg.h>
82#include <machine/md_var.h>
83#include <machine/mp_watchdog.h>
84#ifdef PERFMON
85#include <machine/perfmon.h>
86#endif
87#include <machine/tss.h>
88#ifdef SMP
89#include <machine/smp.h>
90#endif
91#ifdef CPU_ELAN
92#include <machine/elan_mmcr.h>
93#endif
94#include <x86/acpica_machdep.h>
95
96#include <vm/vm.h>
97#include <vm/vm_extern.h>
98#include <vm/vm_kern.h>
99#include <vm/vm_page.h>
100#include <vm/vm_map.h>
101#include <vm/vm_object.h>
102#include <vm/vm_pager.h>
103#include <vm/vm_param.h>
104
105#ifndef PC98
106#include <isa/isareg.h>
107#endif
108
109#define	STATE_RUNNING	0x0
110#define	STATE_MWAIT	0x1
111#define	STATE_SLEEPING	0x2
112
113#ifdef SMP
114static u_int	cpu_reset_proxyid;
115static volatile u_int	cpu_reset_proxy_active;
116#endif
117
118struct msr_op_arg {
119	u_int msr;
120	int op;
121	uint64_t arg1;
122};
123
124static void
125x86_msr_op_one(void *argp)
126{
127	struct msr_op_arg *a;
128	uint64_t v;
129
130	a = argp;
131	switch (a->op) {
132	case MSR_OP_ANDNOT:
133		v = rdmsr(a->msr);
134		v &= ~a->arg1;
135		wrmsr(a->msr, v);
136		break;
137	case MSR_OP_OR:
138		v = rdmsr(a->msr);
139		v |= a->arg1;
140		wrmsr(a->msr, v);
141		break;
142	case MSR_OP_WRITE:
143		wrmsr(a->msr, a->arg1);
144		break;
145	}
146}
147
148#define	MSR_OP_EXMODE_MASK	0xf0000000
149#define	MSR_OP_OP_MASK		0x000000ff
150
151void
152x86_msr_op(u_int msr, u_int op, uint64_t arg1)
153{
154	struct thread *td;
155	struct msr_op_arg a;
156	u_int exmode;
157	int bound_cpu, i, is_bound;
158
159	a.op = op & MSR_OP_OP_MASK;
160	MPASS(a.op == MSR_OP_ANDNOT || a.op == MSR_OP_OR ||
161	    a.op == MSR_OP_WRITE);
162	exmode = op & MSR_OP_EXMODE_MASK;
163	MPASS(exmode == MSR_OP_LOCAL || exmode == MSR_OP_SCHED ||
164	    exmode == MSR_OP_RENDEZVOUS);
165	a.msr = msr;
166	a.arg1 = arg1;
167	switch (exmode) {
168	case MSR_OP_LOCAL:
169		x86_msr_op_one(&a);
170		break;
171	case MSR_OP_SCHED:
172		td = curthread;
173		thread_lock(td);
174		is_bound = sched_is_bound(td);
175		bound_cpu = td->td_oncpu;
176		CPU_FOREACH(i) {
177			sched_bind(td, i);
178			x86_msr_op_one(&a);
179		}
180		if (is_bound)
181			sched_bind(td, bound_cpu);
182		else
183			sched_unbind(td);
184		thread_unlock(td);
185		break;
186	case MSR_OP_RENDEZVOUS:
187		smp_rendezvous(NULL, x86_msr_op_one, NULL, &a);
188		break;
189	}
190}
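
/*
 * Usage sketch (illustrative): a caller combines one operation with one
 * execution mode, e.g. to set the SSBD bit on every CPU by binding to
 * each one in turn:
 *
 *	x86_msr_op(MSR_IA32_SPEC_CTRL,
 *	    MSR_OP_SCHED | MSR_OP_OR, IA32_SPEC_CTRL_SSBD);
 *
 * This mirrors the calls made by hw_ssb_set() and the other mitigation
 * recalculation routines later in this file.
 */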
191
192/*
193 * Machine dependent boot() routine
194 *
195 * I haven't seen anything to put here yet; possibly some
196 * functionality might be grafted back here from boot().
197 */
198void
199cpu_boot(int howto)
200{
201}
202
203/*
204 * Flush the D-cache for non-DMA I/O so that the I-cache can
205 * be made coherent later.
206 */
207void
208cpu_flush_dcache(void *ptr, size_t len)
209{
210	/* Not applicable */
211}
212
213void
214acpi_cpu_c1(void)
215{
216
217	__asm __volatile("sti; hlt");
218}
219
220/*
221 * Use mwait to pause execution while waiting for an interrupt or
222 * another thread to signal that there is more work.
223 *
224 * NOTE: Interrupts will cause a wakeup; however, this function does
225 * not enable interrupt handling. The caller is responsible for
226 * enabling interrupts.
227 */
228void
229acpi_cpu_idle_mwait(uint32_t mwait_hint)
230{
231	int *state;
232	uint64_t v;
233
234	/*
235	 * A comment in a Linux patch claims that 'CPUs run faster with
236	 * speculation protection disabled. All CPU threads in a core
237	 * must disable speculation protection for it to be
238	 * disabled. Disable it while we are idle so the other
239	 * hyperthread can run fast.'
240	 *
241	 * XXXKIB.  Software coordination mode should be supported,
242	 * but all Intel CPUs provide hardware coordination.
243	 */
244
245	state = (int *)PCPU_PTR(monitorbuf);
246	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
247	    ("cpu_mwait_cx: wrong monitorbuf state"));
248	atomic_store_int(state, STATE_MWAIT);
249	if (PCPU_GET(ibpb_set) || hw_ssb_active) {
250		v = rdmsr(MSR_IA32_SPEC_CTRL);
251		wrmsr(MSR_IA32_SPEC_CTRL, v & ~(IA32_SPEC_CTRL_IBRS |
252		    IA32_SPEC_CTRL_STIBP | IA32_SPEC_CTRL_SSBD));
253	} else {
254		v = 0;
255	}
256	cpu_monitor(state, 0, 0);
257	if (atomic_load_int(state) == STATE_MWAIT)
258		cpu_mwait(MWAIT_INTRBREAK, mwait_hint);
259
260	/*
261	 * SSB cannot be disabled while we sleep, or rather, if it was
262	 * disabled, the sysctl thread will bind to our CPU to tweak
263	 * the MSR.
264	 */
265	if (v != 0)
266		wrmsr(MSR_IA32_SPEC_CTRL, v);
267
268	/*
269	 * We should exit on any event that interrupts mwait, because
270	 * that event might be a wanted interrupt.
271	 */
272	atomic_store_int(state, STATE_RUNNING);
273}
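
/*
 * Calling sketch (illustrative): the ACPI C-state code is expected to
 * call this with interrupts disabled and an MWAIT hint describing the
 * target C-state, for instance
 *
 *	acpi_cpu_idle_mwait(0x10);
 *
 * The 0x10 hint is an assumed example for a C2-like state; real hint
 * values come from the firmware-provided C-state tables.
 */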
274
275/* Get current clock frequency for the given cpu id. */
276int
277cpu_est_clockrate(int cpu_id, uint64_t *rate)
278{
279	uint64_t tsc1, tsc2;
280	uint64_t acnt, mcnt, perf;
281	register_t reg;
282
283	if (pcpu_find(cpu_id) == NULL || rate == NULL)
284		return (EINVAL);
285#ifdef __i386__
286	if ((cpu_feature & CPUID_TSC) == 0)
287		return (EOPNOTSUPP);
288#endif
289
290	/*
291	 * If the TSC is P-state invariant and the APERF/MPERF MSRs do not
292	 * exist, the DELAY(9)-based logic below fails.
293	 */
294	if (tsc_is_invariant && !tsc_perf_stat)
295		return (EOPNOTSUPP);
296
297#ifdef SMP
298	if (smp_cpus > 1) {
299		/* Schedule ourselves on the indicated cpu. */
300		thread_lock(curthread);
301		sched_bind(curthread, cpu_id);
302		thread_unlock(curthread);
303	}
304#endif
305
306	/* Calibrate by measuring a short delay. */
307	reg = intr_disable();
308	if (tsc_is_invariant) {
309		wrmsr(MSR_MPERF, 0);
310		wrmsr(MSR_APERF, 0);
311		tsc1 = rdtsc();
312		DELAY(1000);
313		mcnt = rdmsr(MSR_MPERF);
314		acnt = rdmsr(MSR_APERF);
315		tsc2 = rdtsc();
316		intr_restore(reg);
317		perf = 1000 * acnt / mcnt;
318		*rate = (tsc2 - tsc1) * perf;
319	} else {
320		tsc1 = rdtsc();
321		DELAY(1000);
322		tsc2 = rdtsc();
323		intr_restore(reg);
324		*rate = (tsc2 - tsc1) * 1000;
325	}
326
327#ifdef SMP
328	if (smp_cpus > 1) {
329		thread_lock(curthread);
330		sched_unbind(curthread);
331		thread_unlock(curthread);
332	}
333#endif
334
335	return (0);
336}
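
/*
 * Usage sketch (illustrative): a caller passes a CPU id and receives an
 * estimated frequency in Hz, e.g.
 *
 *	uint64_t rate;
 *
 *	if (cpu_est_clockrate(0, &rate) == 0)
 *		printf("CPU0 estimated at %ju Hz\n", (uintmax_t)rate);
 *
 * EOPNOTSUPP indicates that the estimate cannot be made on this hardware.
 */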
337
338/*
339 * Shutdown the CPU as much as possible
340 */
341void
342cpu_halt(void)
343{
344	for (;;)
345		halt();
346}
347
348static void
349cpu_reset_real(void)
350{
351	struct region_descriptor null_idt;
352#ifndef PC98
353	int b;
354#endif
355
356	disable_intr();
357#ifdef CPU_ELAN
358	if (elan_mmcr != NULL)
359		elan_mmcr->RESCFG = 1;
360#endif
361#ifdef __i386__
362	if (cpu == CPU_GEODE1100) {
363		/* Attempt Geode's own reset */
364		outl(0xcf8, 0x80009044ul);
365		outl(0xcfc, 0xf);
366	}
367#endif
368#ifdef PC98
369	/*
370	 * Attempt to do a CPU reset via CPU reset port.
371	 */
372	if ((inb(0x35) & 0xa0) != 0xa0) {
373		outb(0x37, 0x0f);		/* SHUT0 = 0. */
374		outb(0x37, 0x0b);		/* SHUT1 = 0. */
375	}
376	outb(0xf0, 0x00);			/* Reset. */
377#else
378#if !defined(BROKEN_KEYBOARD_RESET)
379	/*
380	 * Attempt to do a CPU reset via the keyboard controller,
381	 * do not turn off GateA20, as any machine that fails
382	 * to do the reset here would then end up in no man's land.
383	 */
384	outb(IO_KBD + 4, 0xFE);
385	DELAY(500000);	/* wait 0.5 sec to see if that did it */
386#endif
387
388	/*
389	 * Attempt to force a reset via the Reset Control register at
390	 * I/O port 0xcf9.  Bit 2 forces a system reset when it
391	 * transitions from 0 to 1.  Bit 1 selects the type of reset
392	 * to attempt: 0 selects a "soft" reset, and 1 selects a
393	 * "hard" reset.  We try a "hard" reset.  The first write sets
394	 * bit 1 to select a "hard" reset and clears bit 2.  The
395	 * second write forces a 0 -> 1 transition in bit 2 to trigger
396	 * a reset.
397	 */
398	outb(0xcf9, 0x2);
399	outb(0xcf9, 0x6);
400	DELAY(500000);  /* wait 0.5 sec to see if that did it */
401
402	/*
403	 * Attempt to force a reset via the Fast A20 and Init register
404	 * at I/O port 0x92.  Bit 1 serves as an alternate A20 gate.
405	 * Bit 0 asserts INIT# when set to 1.  We are careful to only
406	 * preserve bit 1 while setting bit 0.  We also must clear bit
407	 * 0 before setting it if it isn't already clear.
408	 */
409	b = inb(0x92);
410	if (b != 0xff) {
411		if ((b & 0x1) != 0)
412			outb(0x92, b & 0xfe);
413		outb(0x92, b | 0x1);
414		DELAY(500000);  /* wait 0.5 sec to see if that did it */
415	}
416#endif /* PC98 */
417
418	printf("No known reset method worked, attempting CPU shutdown\n");
419	DELAY(1000000); /* wait 1 sec for printf to complete */
420
421	/* Wipe the IDT. */
422	null_idt.rd_limit = 0;
423	null_idt.rd_base = 0;
424	lidt(&null_idt);
425
426	/* "good night, sweet prince .... <THUNK!>" */
427	breakpoint();
428
429	/* NOTREACHED */
430	while(1);
431}
432
433#ifdef SMP
434static void
435cpu_reset_proxy(void)
436{
437
438	cpu_reset_proxy_active = 1;
439	while (cpu_reset_proxy_active == 1)
440		ia32_pause(); /* Wait for other cpu to see that we've started */
441
442	printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid);
443	DELAY(1000000);
444	cpu_reset_real();
445}
446#endif
447
448void
449cpu_reset(void)
450{
451#ifdef SMP
452	cpuset_t map;
453	u_int cnt;
454
455	if (smp_started) {
456		map = all_cpus;
457		CPU_CLR(PCPU_GET(cpuid), &map);
458		CPU_NAND(&map, &stopped_cpus);
459		if (!CPU_EMPTY(&map)) {
460			printf("cpu_reset: Stopping other CPUs\n");
461			stop_cpus(map);
462		}
463
464		if (PCPU_GET(cpuid) != 0) {
465			cpu_reset_proxyid = PCPU_GET(cpuid);
466			cpustop_restartfunc = cpu_reset_proxy;
467			cpu_reset_proxy_active = 0;
468			printf("cpu_reset: Restarting BSP\n");
469
470			/* Restart CPU #0. */
471			CPU_SETOF(0, &started_cpus);
472			wmb();
473
474			cnt = 0;
475			while (cpu_reset_proxy_active == 0 && cnt < 10000000) {
476				ia32_pause();
477				cnt++;	/* Wait for BSP to announce restart */
478			}
479			if (cpu_reset_proxy_active == 0) {
480				printf("cpu_reset: Failed to restart BSP\n");
481			} else {
482				cpu_reset_proxy_active = 2;
483				while (1)
484					ia32_pause();
485				/* NOTREACHED */
486			}
487		}
488
489		DELAY(1000000);
490	}
491#endif
492	cpu_reset_real();
493	/* NOTREACHED */
494}
495
496bool
497cpu_mwait_usable(void)
498{
499
500	return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags &
501	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) ==
502	    (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)));
503}
504
505void (*cpu_idle_hook)(sbintime_t) = NULL;	/* ACPI idle hook. */
506static int	cpu_ident_amdc1e = 0;	/* AMD C1E supported. */
507static int	idle_mwait = 1;		/* Use MONITOR/MWAIT for short idle. */
508SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait,
509    0, "Use MONITOR/MWAIT for short idle");
510
511#ifndef PC98
512static void
513cpu_idle_acpi(sbintime_t sbt)
514{
515	int *state;
516
517	state = (int *)PCPU_PTR(monitorbuf);
518	atomic_store_int(state, STATE_SLEEPING);
519
520	/* See comments in cpu_idle_hlt(). */
521	disable_intr();
522	if (sched_runnable())
523		enable_intr();
524	else if (cpu_idle_hook)
525		cpu_idle_hook(sbt);
526	else
527		acpi_cpu_c1();
528	atomic_store_int(state, STATE_RUNNING);
529}
530#endif /* !PC98 */
531
532static void
533cpu_idle_hlt(sbintime_t sbt)
534{
535	int *state;
536
537	state = (int *)PCPU_PTR(monitorbuf);
538	atomic_store_int(state, STATE_SLEEPING);
539
540	/*
541	 * Since we may be in a critical section from cpu_idle(), if
542	 * an interrupt fires during that critical section we may have
543	 * a pending preemption.  If the CPU halts, then that thread
544	 * may not execute until a later interrupt awakens the CPU.
545	 * To handle this race, check for a runnable thread after
546	 * disabling interrupts and immediately return if one is
547	 * found.  Also, we must absolutely guarantee that hlt is
548	 * the next instruction after sti.  This ensures that any
549	 * interrupt that fires after the call to disable_intr() will
550	 * immediately awaken the CPU from hlt.  Finally, note that on
551	 * x86 this works because interrupts are only recognized after
552	 * the instruction following sti, while IF is set to 1
553	 * immediately, allowing the hlt instruction to acknowledge
554	 * the interrupt.
555	 */
556	disable_intr();
557	if (sched_runnable())
558		enable_intr();
559	else
560		acpi_cpu_c1();
561	atomic_store_int(state, STATE_RUNNING);
562}
563
564static void
565cpu_idle_mwait(sbintime_t sbt)
566{
567	int *state;
568
569	state = (int *)PCPU_PTR(monitorbuf);
570	atomic_store_int(state, STATE_MWAIT);
571
572	/* See comments in cpu_idle_hlt(). */
573	disable_intr();
574	if (sched_runnable()) {
575		atomic_store_int(state, STATE_RUNNING);
576		enable_intr();
577		return;
578	}
579
580	cpu_monitor(state, 0, 0);
581	if (atomic_load_int(state) == STATE_MWAIT)
582		__asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0));
583	else
584		enable_intr();
585	atomic_store_int(state, STATE_RUNNING);
586}
587
588static void
589cpu_idle_spin(sbintime_t sbt)
590{
591	int *state;
592	int i;
593
594	state = (int *)PCPU_PTR(monitorbuf);
595	atomic_store_int(state, STATE_RUNNING);
596
597	/*
598	 * The sched_runnable() call is racy, but since it is retried in
599	 * a loop, missing it once has little impact, if any (and it is
600	 * much better than not checking at all).
601	 */
602	for (i = 0; i < 1000; i++) {
603		if (sched_runnable())
604			return;
605		cpu_spinwait();
606	}
607}
608
609/*
610 * C1E renders the local APIC timer dead, so we disable it by
611 * reading the Interrupt Pending Message register and clearing
612 * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
613 *
614 * Reference:
615 *   "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
616 *   #32559 revision 3.00+
617 */
618#define	MSR_AMDK8_IPM		0xc0010055
619#define	AMDK8_SMIONCMPHALT	(1ULL << 27)
620#define	AMDK8_C1EONCMPHALT	(1ULL << 28)
621#define	AMDK8_CMPHALT		(AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
622
623void
624cpu_probe_amdc1e(void)
625{
626
627	/*
628	 * Detect the presence of the C1E capability, mostly on recent
629	 * dual-core (and later) K8 family CPUs.
630	 */
631	if (cpu_vendor_id == CPU_VENDOR_AMD &&
632	    (cpu_id & 0x00000f00) == 0x00000f00 &&
633	    (cpu_id & 0x0fff0000) >=  0x00040000) {
634		cpu_ident_amdc1e = 1;
635	}
636}
637
638#if defined(__i386__) && defined(PC98)
639void (*cpu_idle_fn)(sbintime_t) = cpu_idle_hlt;
640#else
641void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi;
642#endif
643
644void
645cpu_idle(int busy)
646{
647	uint64_t msr;
648	sbintime_t sbt = -1;
649
650	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
651	    busy, curcpu);
652#ifdef MP_WATCHDOG
653	ap_watchdog(PCPU_GET(cpuid));
654#endif
655
656	/* If we are busy - try to use fast methods. */
657	if (busy) {
658		if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
659			cpu_idle_mwait(busy);
660			goto out;
661		}
662	}
663
664	/* If we have time - switch timers into idle mode. */
665	if (!busy) {
666		critical_enter();
667		sbt = cpu_idleclock();
668	}
669
670	/* Apply AMD APIC timer C1E workaround. */
671	if (cpu_ident_amdc1e && cpu_disable_c3_sleep) {
672		msr = rdmsr(MSR_AMDK8_IPM);
673		if (msr & AMDK8_CMPHALT)
674			wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
675	}
676
677	/* Call main idle method. */
678	cpu_idle_fn(sbt);
679
680	/* Switch timers back into active mode. */
681	if (!busy) {
682		cpu_activeclock();
683		critical_exit();
684	}
685out:
686	CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
687	    busy, curcpu);
688}
689
690static int cpu_idle_apl31_workaround;
691SYSCTL_INT(_machdep, OID_AUTO, idle_apl31, CTLFLAG_RW,
692    &cpu_idle_apl31_workaround, 0,
693    "Apollo Lake APL31 MWAIT bug workaround");
694
695int
696cpu_idle_wakeup(int cpu)
697{
698	int *state;
699
700	state = (int *)pcpu_find(cpu)->pc_monitorbuf;
701	switch (atomic_load_int(state)) {
702	case STATE_SLEEPING:
703		return (0);
704	case STATE_MWAIT:
705		atomic_store_int(state, STATE_RUNNING);
706		return (cpu_idle_apl31_workaround ? 0 : 1);
707	case STATE_RUNNING:
708		return (1);
709	default:
710		panic("bad monitor state");
711		return (1);
712	}
713}
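
/*
 * Calling sketch (illustrative): cpu_idle_wakeup() lets the scheduler
 * wake an MWAIT-idle CPU by storing to its monitored word; a caller is
 * expected to fall back to an IPI when it returns 0, roughly
 *
 *	if (!cpu_idle_wakeup(cpu))
 *		ipi_cpu(cpu, IPI_AST);
 *
 * ipi_cpu()/IPI_AST are named here only for illustration; the actual
 * scheduler code may use a different IPI.
 */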
714
715/*
716 * Ordered by speed/power consumption.
717 */
718static struct {
719	void	*id_fn;
720	char	*id_name;
721	int	id_cpuid2_flag;
722} idle_tbl[] = {
723	{ .id_fn = cpu_idle_spin, .id_name = "spin" },
724	{ .id_fn = cpu_idle_mwait, .id_name = "mwait",
725	    .id_cpuid2_flag = CPUID2_MON },
726	{ .id_fn = cpu_idle_hlt, .id_name = "hlt" },
727#if !defined(__i386__) || !defined(PC98)
728	{ .id_fn = cpu_idle_acpi, .id_name = "acpi" },
729#endif
730};
731
732static int
733idle_sysctl_available(SYSCTL_HANDLER_ARGS)
734{
735	char *avail, *p;
736	int error;
737	int i;
738
739	avail = malloc(256, M_TEMP, M_WAITOK);
740	p = avail;
741	for (i = 0; i < nitems(idle_tbl); i++) {
742		if (idle_tbl[i].id_cpuid2_flag != 0 &&
743		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
744			continue;
745#if !defined(__i386__) || !defined(PC98)
746		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
747		    cpu_idle_hook == NULL)
748			continue;
749#endif
750		p += sprintf(p, "%s%s", p != avail ? ", " : "",
751		    idle_tbl[i].id_name);
752	}
753	error = sysctl_handle_string(oidp, avail, 0, req);
754	free(avail, M_TEMP);
755	return (error);
756}
757
758SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
759    0, 0, idle_sysctl_available, "A", "list of available idle functions");
760
761static bool
762cpu_idle_selector(const char *new_idle_name)
763{
764	int i;
765
766	for (i = 0; i < nitems(idle_tbl); i++) {
767		if (idle_tbl[i].id_cpuid2_flag != 0 &&
768		    (cpu_feature2 & idle_tbl[i].id_cpuid2_flag) == 0)
769			continue;
770#if !defined(__i386__) || !defined(PC98)
771		if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
772		    cpu_idle_hook == NULL)
773			continue;
774#endif
775		if (strcmp(idle_tbl[i].id_name, new_idle_name))
776			continue;
777		cpu_idle_fn = idle_tbl[i].id_fn;
778		if (bootverbose)
779			printf("CPU idle set to %s\n", idle_tbl[i].id_name);
780		return (true);
781	}
782	return (false);
783}
784
785static int
786cpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
787{
788	char buf[16], *p;
789	int error, i;
790
791	p = "unknown";
792	for (i = 0; i < nitems(idle_tbl); i++) {
793		if (idle_tbl[i].id_fn == cpu_idle_fn) {
794			p = idle_tbl[i].id_name;
795			break;
796		}
797	}
798	strncpy(buf, p, sizeof(buf));
799	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
800	if (error != 0 || req->newptr == NULL)
801		return (error);
802	return (cpu_idle_selector(buf) ? 0 : EINVAL);
803}
804
805SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
806    cpu_idle_sysctl, "A", "currently selected idle function");
807
808static void
809cpu_idle_tun(void *unused __unused)
810{
811	char tunvar[16];
812
813	if (TUNABLE_STR_FETCH("machdep.idle", tunvar, sizeof(tunvar)))
814		cpu_idle_selector(tunvar);
815	else if (cpu_vendor_id == CPU_VENDOR_AMD &&
816	    CPUID_TO_FAMILY(cpu_id) == 0x17 && CPUID_TO_MODEL(cpu_id) == 0x1) {
817		/* Ryzen errata 1057 and 1109. */
818		cpu_idle_selector("hlt");
819		idle_mwait = 0;
820	}
821
822	if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
823		/*
824		 * Apollo Lake errata APL31 (public errata APL30).
825		 * Stores to the armed address range may not trigger
826		 * MWAIT to resume execution.  OS needs to use
827		 * interrupts to wake processors from MWAIT-induced
828		 * sleep states.
829		 */
830		cpu_idle_apl31_workaround = 1;
831	}
832	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
833}
834SYSINIT(cpu_idle_tun, SI_SUB_CPU, SI_ORDER_MIDDLE, cpu_idle_tun, NULL);
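
/*
 * Tuning sketch (illustrative): the idle method is selectable at boot
 * through loader.conf or at run time with sysctl(8), e.g.
 *
 *	machdep.idle="hlt"		(loader.conf)
 *	sysctl machdep.idle=mwait	(run time)
 *	sysctl machdep.idle_available	(list the methods usable here)
 *
 * machdep.idle_mwait and machdep.idle_apl31 provide the related knobs
 * declared above.
 */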
835
836static int panic_on_nmi = 1;
837SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
838    &panic_on_nmi, 0,
839    "Panic on NMI raised by hardware failure");
840int nmi_is_broadcast = 1;
841SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN,
842    &nmi_is_broadcast, 0,
843    "Chipset NMI is broadcast");
844#ifdef KDB
845int kdb_on_nmi = 1;
846SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
847    &kdb_on_nmi, 0,
848    "Go to KDB on NMI with unknown source");
849#endif
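
/*
 * Tuning sketch (illustrative): the NMI knobs above are RWTUN sysctls,
 * so they can be set in loader.conf or changed at run time (kdb_on_nmi
 * only exists in kernels built with KDB), e.g.
 *
 *	sysctl machdep.panic_on_nmi=0
 *	sysctl machdep.kdb_on_nmi=1
 *	sysctl machdep.nmi_is_broadcast=0
 */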
850
851void
852nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame)
853{
854	bool claimed = false;
855
856#ifdef DEV_ISA
857	/* machine/parity/power fail/"kitchen sink" faults */
858	if (isa_nmi(frame->tf_err)) {
859		claimed = true;
860		if (panic_on_nmi)
861			panic("NMI indicates hardware failure");
862	}
863#endif /* DEV_ISA */
864#ifdef KDB
865	if (!claimed && kdb_on_nmi) {
866		/*
867		 * NMI can be hooked up to a pushbutton for debugging.
868		 */
869		printf("NMI/cpu%d ... going to debugger\n", cpu);
870		kdb_trap(type, 0, frame);
871	}
872#endif /* KDB */
873}
874
875void
876nmi_handle_intr(u_int type, struct trapframe *frame)
877{
878
879#ifdef SMP
880	if (nmi_is_broadcast) {
881		nmi_call_kdb_smp(type, frame);
882		return;
883	}
884#endif
885	nmi_call_kdb(PCPU_GET(cpuid), type, frame);
886}
887
888static int hw_ibrs_active;
889int hw_ibrs_ibpb_active;
890int hw_ibrs_disable = 1;
891
892SYSCTL_INT(_hw, OID_AUTO, ibrs_active, CTLFLAG_RD, &hw_ibrs_active, 0,
893    "Indirect Branch Restricted Speculation active");
894
895void
896hw_ibrs_recalculate(bool for_all_cpus)
897{
898	if ((cpu_ia32_arch_caps & IA32_ARCH_CAP_IBRS_ALL) != 0) {
899		x86_msr_op(MSR_IA32_SPEC_CTRL, (for_all_cpus ?
900		    MSR_OP_RENDEZVOUS : MSR_OP_LOCAL) |
901		    (hw_ibrs_disable != 0 ? MSR_OP_ANDNOT : MSR_OP_OR),
902		    IA32_SPEC_CTRL_IBRS);
903		hw_ibrs_active = hw_ibrs_disable == 0;
904		hw_ibrs_ibpb_active = 0;
905	} else {
906		hw_ibrs_active = hw_ibrs_ibpb_active = (cpu_stdext_feature3 &
907		    CPUID_STDEXT3_IBPB) != 0 && !hw_ibrs_disable;
908	}
909}
910
911static int
912hw_ibrs_disable_handler(SYSCTL_HANDLER_ARGS)
913{
914	int error, val;
915
916	val = hw_ibrs_disable;
917	error = sysctl_handle_int(oidp, &val, 0, req);
918	if (error != 0 || req->newptr == NULL)
919		return (error);
920	hw_ibrs_disable = val != 0;
921	hw_ibrs_recalculate(true);
922	return (0);
923}
924SYSCTL_PROC(_hw, OID_AUTO, ibrs_disable, CTLTYPE_INT | CTLFLAG_RWTUN |
925    CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0, hw_ibrs_disable_handler, "I",
926    "Disable Indirect Branch Restricted Speculation");
927
928int hw_ssb_active;
929int hw_ssb_disable;
930
931SYSCTL_INT(_hw, OID_AUTO, spec_store_bypass_disable_active, CTLFLAG_RD,
932    &hw_ssb_active, 0,
933    "Speculative Store Bypass Disable active");
934
935static void
936hw_ssb_set(bool enable, bool for_all_cpus)
937{
938
939	if ((cpu_stdext_feature3 & CPUID_STDEXT3_SSBD) == 0) {
940		hw_ssb_active = 0;
941		return;
942	}
943	hw_ssb_active = enable;
944	x86_msr_op(MSR_IA32_SPEC_CTRL,
945	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
946	    (for_all_cpus ? MSR_OP_SCHED : MSR_OP_LOCAL), IA32_SPEC_CTRL_SSBD);
947}
948
949void
950hw_ssb_recalculate(bool all_cpus)
951{
952
953	switch (hw_ssb_disable) {
954	default:
955		hw_ssb_disable = 0;
956		/* FALLTHROUGH */
957	case 0: /* off */
958		hw_ssb_set(false, all_cpus);
959		break;
960	case 1: /* on */
961		hw_ssb_set(true, all_cpus);
962		break;
963	case 2: /* auto */
964		hw_ssb_set((cpu_ia32_arch_caps & IA32_ARCH_CAP_SSB_NO) != 0 ?
965		    false : true, all_cpus);
966		break;
967	}
968}
969
970static int
971hw_ssb_disable_handler(SYSCTL_HANDLER_ARGS)
972{
973	int error, val;
974
975	val = hw_ssb_disable;
976	error = sysctl_handle_int(oidp, &val, 0, req);
977	if (error != 0 || req->newptr == NULL)
978		return (error);
979	hw_ssb_disable = val;
980	hw_ssb_recalculate(true);
981	return (0);
982}
983SYSCTL_PROC(_hw, OID_AUTO, spec_store_bypass_disable, CTLTYPE_INT |
984    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
985    hw_ssb_disable_handler, "I",
986    "Speculative Store Bypass Disable (0 - off, 1 - on, 2 - auto");
987
988int hw_mds_disable;
989
990/*
991 * Handler for Microarchitectural Data Sampling issues.  Really not a
992 * pointer to an ordinary C function: on amd64 the code must not change
993 * any CPU architectural state except possibly %rflags. Also, it is always
994 * called with interrupts disabled.
995 */
996void mds_handler_void(void);
997void mds_handler_verw(void);
998void mds_handler_ivb(void);
999void mds_handler_bdw(void);
1000void mds_handler_skl_sse(void);
1001void mds_handler_skl_avx(void);
1002void mds_handler_skl_avx512(void);
1003void mds_handler_silvermont(void);
1004void (*mds_handler)(void) = mds_handler_void;
1005
1006static int
1007sysctl_hw_mds_disable_state_handler(SYSCTL_HANDLER_ARGS)
1008{
1009	const char *state;
1010
1011	if (mds_handler == mds_handler_void)
1012		state = "inactive";
1013	else if (mds_handler == mds_handler_verw)
1014		state = "VERW";
1015	else if (mds_handler == mds_handler_ivb)
1016		state = "software IvyBridge";
1017	else if (mds_handler == mds_handler_bdw)
1018		state = "software Broadwell";
1019	else if (mds_handler == mds_handler_skl_sse)
1020		state = "software Skylake SSE";
1021	else if (mds_handler == mds_handler_skl_avx)
1022		state = "software Skylake AVX";
1023	else if (mds_handler == mds_handler_skl_avx512)
1024		state = "software Skylake AVX512";
1025	else if (mds_handler == mds_handler_silvermont)
1026		state = "software Silvermont";
1027	else
1028		state = "unknown";
1029	return (SYSCTL_OUT(req, state, strlen(state)));
1030}
1031
1032SYSCTL_PROC(_hw, OID_AUTO, mds_disable_state,
1033    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1034    sysctl_hw_mds_disable_state_handler, "A",
1035    "Microarchitectural Data Sampling Mitigation state");
1036
1037_Static_assert(__offsetof(struct pcpu, pc_mds_tmp) % 64 == 0, "MDS AVX512");
1038
1039void
1040hw_mds_recalculate(void)
1041{
1042	struct pcpu *pc;
1043	vm_offset_t b64;
1044	u_long xcr0;
1045	int i;
1046
1047	/*
1048	 * Allow the user to force the VERW variant even if MD_CLEAR is
1049	 * not reported.  For instance, a hypervisor might unknowingly
1050	 * filter the capability out.
1051	 * For similar reasons, and for testing, allow enabling the
1052	 * mitigation even when the MDS_NO capability is set.
1053	 */
1054	if (cpu_vendor_id != CPU_VENDOR_INTEL || hw_mds_disable == 0 ||
1055	    ((cpu_ia32_arch_caps & IA32_ARCH_CAP_MDS_NO) != 0 &&
1056	    hw_mds_disable == 3)) {
1057		mds_handler = mds_handler_void;
1058	} else if (((cpu_stdext_feature3 & CPUID_STDEXT3_MD_CLEAR) != 0 &&
1059	    hw_mds_disable == 3) || hw_mds_disable == 1) {
1060		mds_handler = mds_handler_verw;
1061	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1062	    (CPUID_TO_MODEL(cpu_id) == 0x2e || CPUID_TO_MODEL(cpu_id) == 0x1e ||
1063	    CPUID_TO_MODEL(cpu_id) == 0x1f || CPUID_TO_MODEL(cpu_id) == 0x1a ||
1064	    CPUID_TO_MODEL(cpu_id) == 0x2f || CPUID_TO_MODEL(cpu_id) == 0x25 ||
1065	    CPUID_TO_MODEL(cpu_id) == 0x2c || CPUID_TO_MODEL(cpu_id) == 0x2d ||
1066	    CPUID_TO_MODEL(cpu_id) == 0x2a || CPUID_TO_MODEL(cpu_id) == 0x3e ||
1067	    CPUID_TO_MODEL(cpu_id) == 0x3a) &&
1068	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
1069		/*
1070		 * Nehalem, SandyBridge, IvyBridge
1071		 */
1072		CPU_FOREACH(i) {
1073			pc = pcpu_find(i);
1074			if (pc->pc_mds_buf == NULL) {
1075				pc->pc_mds_buf = malloc(672, M_TEMP,
1076				    M_WAITOK);
1077				bzero(pc->pc_mds_buf, 16);
1078			}
1079		}
1080		mds_handler = mds_handler_ivb;
1081	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1082	    (CPUID_TO_MODEL(cpu_id) == 0x3f || CPUID_TO_MODEL(cpu_id) == 0x3c ||
1083	    CPUID_TO_MODEL(cpu_id) == 0x45 || CPUID_TO_MODEL(cpu_id) == 0x46 ||
1084	    CPUID_TO_MODEL(cpu_id) == 0x56 || CPUID_TO_MODEL(cpu_id) == 0x4f ||
1085	    CPUID_TO_MODEL(cpu_id) == 0x47 || CPUID_TO_MODEL(cpu_id) == 0x3d) &&
1086	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
1087		/*
1088		 * Haswell, Broadwell
1089		 */
1090		CPU_FOREACH(i) {
1091			pc = pcpu_find(i);
1092			if (pc->pc_mds_buf == NULL) {
1093				pc->pc_mds_buf = malloc(1536, M_TEMP,
1094				    M_WAITOK);
1095				bzero(pc->pc_mds_buf, 16);
1096			}
1097		}
1098		mds_handler = mds_handler_bdw;
1099	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1100	    ((CPUID_TO_MODEL(cpu_id) == 0x55 && (cpu_id &
1101	    CPUID_STEPPING) <= 5) ||
1102	    CPUID_TO_MODEL(cpu_id) == 0x4e || CPUID_TO_MODEL(cpu_id) == 0x5e ||
1103	    (CPUID_TO_MODEL(cpu_id) == 0x8e && (cpu_id &
1104	    CPUID_STEPPING) <= 0xb) ||
1105	    (CPUID_TO_MODEL(cpu_id) == 0x9e && (cpu_id &
1106	    CPUID_STEPPING) <= 0xc)) &&
1107	    (hw_mds_disable == 2 || hw_mds_disable == 3)) {
1108		/*
1109		 * Skylake, KabyLake, CoffeeLake, WhiskeyLake,
1110		 * CascadeLake
1111		 */
1112		CPU_FOREACH(i) {
1113			pc = pcpu_find(i);
1114			if (pc->pc_mds_buf == NULL) {
1115				pc->pc_mds_buf = malloc(6 * 1024,
1116				    M_TEMP, M_WAITOK);
1117				b64 = (vm_offset_t)malloc(64 + 63,
1118				    M_TEMP, M_WAITOK);
1119				pc->pc_mds_buf64 = (void *)roundup2(b64, 64);
1120				bzero(pc->pc_mds_buf64, 64);
1121			}
1122		}
1123		xcr0 = rxcr(0);
1124		if ((xcr0 & XFEATURE_ENABLED_ZMM_HI256) != 0 &&
1125		    (cpu_stdext_feature2 & CPUID_STDEXT_AVX512DQ) != 0)
1126			mds_handler = mds_handler_skl_avx512;
1127		else if ((xcr0 & XFEATURE_ENABLED_AVX) != 0 &&
1128		    (cpu_feature2 & CPUID2_AVX) != 0)
1129			mds_handler = mds_handler_skl_avx;
1130		else
1131			mds_handler = mds_handler_skl_sse;
1132	} else if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
1133	    ((CPUID_TO_MODEL(cpu_id) == 0x37 ||
1134	    CPUID_TO_MODEL(cpu_id) == 0x4a ||
1135	    CPUID_TO_MODEL(cpu_id) == 0x4c ||
1136	    CPUID_TO_MODEL(cpu_id) == 0x4d ||
1137	    CPUID_TO_MODEL(cpu_id) == 0x5a ||
1138	    CPUID_TO_MODEL(cpu_id) == 0x5d ||
1139	    CPUID_TO_MODEL(cpu_id) == 0x6e ||
1140	    CPUID_TO_MODEL(cpu_id) == 0x65 ||
1141	    CPUID_TO_MODEL(cpu_id) == 0x75 ||
1142	    CPUID_TO_MODEL(cpu_id) == 0x1c ||
1143	    CPUID_TO_MODEL(cpu_id) == 0x26 ||
1144	    CPUID_TO_MODEL(cpu_id) == 0x27 ||
1145	    CPUID_TO_MODEL(cpu_id) == 0x35 ||
1146	    CPUID_TO_MODEL(cpu_id) == 0x36 ||
1147	    CPUID_TO_MODEL(cpu_id) == 0x7a))) {
1148		/* Silvermont, Airmont */
1149		CPU_FOREACH(i) {
1150			pc = pcpu_find(i);
1151			if (pc->pc_mds_buf == NULL)
1152				pc->pc_mds_buf = malloc(256, M_TEMP, M_WAITOK);
1153		}
1154		mds_handler = mds_handler_silvermont;
1155	} else {
1156		hw_mds_disable = 0;
1157		mds_handler = mds_handler_void;
1158	}
1159}
1160
1161static void
1162hw_mds_recalculate_boot(void *arg __unused)
1163{
1164
1165	hw_mds_recalculate();
1166}
1167SYSINIT(mds_recalc, SI_SUB_SMP, SI_ORDER_ANY, hw_mds_recalculate_boot, NULL);
1168
1169static int
1170sysctl_mds_disable_handler(SYSCTL_HANDLER_ARGS)
1171{
1172	int error, val;
1173
1174	val = hw_mds_disable;
1175	error = sysctl_handle_int(oidp, &val, 0, req);
1176	if (error != 0 || req->newptr == NULL)
1177		return (error);
1178	if (val < 0 || val > 3)
1179		return (EINVAL);
1180	hw_mds_disable = val;
1181	hw_mds_recalculate();
1182	return (0);
1183}
1184
1185SYSCTL_PROC(_hw, OID_AUTO, mds_disable, CTLTYPE_INT |
1186    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1187    sysctl_mds_disable_handler, "I",
1188    "Microarchitectural Data Sampling Mitigation "
1189    "(0 - off, 1 - on VERW, 2 - on SW, 3 - on AUTO");
1190
1191
1192/*
1193 * Intel Transactional Memory Asynchronous Abort Mitigation
1194 * CVE-2019-11135
1195 */
1196int x86_taa_enable;
1197int x86_taa_state;
1198enum {
1199	TAA_NONE	= 0,	/* No mitigation enabled */
1200	TAA_TSX_DISABLE	= 1,	/* Disable TSX via MSR */
1201	TAA_VERW	= 2,	/* Use VERW mitigation */
1202	TAA_AUTO	= 3,	/* Automatically select the mitigation */
1203
1204	/* The states below are not selectable by the operator */
1205
1206	TAA_TAA_UC	= 4,	/* Mitigation present in microcode */
1207	TAA_NOT_PRESENT	= 5	/* TSX is not present */
1208};
1209
1210static void
1211taa_set(bool enable, bool all)
1212{
1213
1214	x86_msr_op(MSR_IA32_TSX_CTRL,
1215	    (enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
1216	    (all ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
1217	    IA32_TSX_CTRL_RTM_DISABLE | IA32_TSX_CTRL_TSX_CPUID_CLEAR);
1218}
1219
1220void
1221x86_taa_recalculate(void)
1222{
1223	static int taa_saved_mds_disable = 0;
1224	int taa_need = 0, taa_state = 0;
1225	int mds_disable = 0, need_mds_recalc = 0;
1226
1227	/* Check CPUID.07h.EBX.HLE and RTM for the presence of TSX */
1228	if ((cpu_stdext_feature & CPUID_STDEXT_HLE) == 0 ||
1229	    (cpu_stdext_feature & CPUID_STDEXT_RTM) == 0) {
1230		/* TSX is not present */
1231		x86_taa_state = TAA_NOT_PRESENT;
1232		return;
1233	}
1234
1235	/* Check to see what mitigation options the CPU gives us */
1236	if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TAA_NO) {
1237		/* CPU is not susceptible to TAA */
1238		taa_need = TAA_TAA_UC;
1239	} else if (cpu_ia32_arch_caps & IA32_ARCH_CAP_TSX_CTRL) {
1240		/*
1241		 * CPU can turn off TSX.  This is the next best option
1242		 * if the TAA_NO hardware mitigation isn't present.
1243		 */
1244		taa_need = TAA_TSX_DISABLE;
1245	} else {
1246		/* No TSX/TAA specific remedies are available. */
1247		if (x86_taa_enable == TAA_TSX_DISABLE) {
1248			if (bootverbose)
1249				printf("TSX control not available\n");
1250			return;
1251		} else
1252			taa_need = TAA_VERW;
1253	}
1254
1255	/* Can we automatically take action, or are we being forced? */
1256	if (x86_taa_enable == TAA_AUTO)
1257		taa_state = taa_need;
1258	else
1259		taa_state = x86_taa_enable;
1260
1261	/* No state change, nothing to do */
1262	if (taa_state == x86_taa_state) {
1263		if (bootverbose)
1264			printf("No TSX change made\n");
1265		return;
1266	}
1267
1268	/* Does the MSR need to be turned on or off? */
1269	if (taa_state == TAA_TSX_DISABLE)
1270		taa_set(true, true);
1271	else if (x86_taa_state == TAA_TSX_DISABLE)
1272		taa_set(false, true);
1273
1274	/* Does MDS need to be set to turn on VERW? */
1275	if (taa_state == TAA_VERW) {
1276		taa_saved_mds_disable = hw_mds_disable;
1277		mds_disable = hw_mds_disable = 1;
1278		need_mds_recalc = 1;
1279	} else if (x86_taa_state == TAA_VERW) {
1280		mds_disable = hw_mds_disable = taa_saved_mds_disable;
1281		need_mds_recalc = 1;
1282	}
1283	if (need_mds_recalc) {
1284		hw_mds_recalculate();
1285		if (mds_disable != hw_mds_disable) {
1286			if (bootverbose)
1287				printf("Cannot change MDS state for TAA\n");
1288			/* Don't update our state */
1289			return;
1290		}
1291	}
1292
1293	x86_taa_state = taa_state;
1294	return;
1295}
1296
1297static void
1298taa_recalculate_boot(void * arg __unused)
1299{
1300
1301	x86_taa_recalculate();
1302}
1303SYSINIT(taa_recalc, SI_SUB_SMP, SI_ORDER_ANY, taa_recalculate_boot, NULL);
1304
1305SYSCTL_NODE(_machdep_mitigations, OID_AUTO, taa, CTLFLAG_RW, 0,
1306	"TSX Asynchronous Abort Mitigation");
1307
1308static int
1309sysctl_taa_handler(SYSCTL_HANDLER_ARGS)
1310{
1311	int error, val;
1312
1313	val = x86_taa_enable;
1314	error = sysctl_handle_int(oidp, &val, 0, req);
1315	if (error != 0 || req->newptr == NULL)
1316		return (error);
1317	if (val < TAA_NONE || val > TAA_AUTO)
1318		return (EINVAL);
1319	x86_taa_enable = val;
1320	x86_taa_recalculate();
1321	return (0);
1322}
1323
1324SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, enable, CTLTYPE_INT |
1325    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1326    sysctl_taa_handler, "I",
1327    "TAA Mitigation enablement control "
1328    "(0 - off, 1 - disable TSX, 2 - VERW, 3 - on AUTO");
1329
1330static int
1331sysctl_taa_state_handler(SYSCTL_HANDLER_ARGS)
1332{
1333	const char *state;
1334
1335	switch (x86_taa_state) {
1336	case TAA_NONE:
1337		state = "inactive";
1338		break;
1339	case TAA_TSX_DISABLE:
1340		state = "TSX disabled";
1341		break;
1342	case TAA_VERW:
1343		state = "VERW";
1344		break;
1345	case TAA_TAA_UC:
1346		state = "Mitigated in microcode";
1347		break;
1348	case TAA_NOT_PRESENT:
1349		state = "TSX not present";
1350		break;
1351	default:
1352		state = "unknown";
1353	}
1354
1355	return (SYSCTL_OUT(req, state, strlen(state)));
1356}
1357
1358SYSCTL_PROC(_machdep_mitigations_taa, OID_AUTO, state,
1359    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1360    sysctl_taa_state_handler, "A",
1361    "TAA Mitigation state");
1362
1363int __read_frequently cpu_flush_rsb_ctxsw;
1364SYSCTL_INT(_machdep_mitigations, OID_AUTO, flush_rsb_ctxsw,
1365    CTLFLAG_RW | CTLFLAG_NOFETCH, &cpu_flush_rsb_ctxsw, 0,
1366    "Flush Return Stack Buffer on context switch");
1367
1368SYSCTL_NODE(_machdep_mitigations, OID_AUTO, rngds,
1369    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1370    "MCU Optimization, disable RDSEED mitigation");
1371
1372int x86_rngds_mitg_enable = 1;
1373void
1374x86_rngds_mitg_recalculate(bool all_cpus)
1375{
1376	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0)
1377		return;
1378	x86_msr_op(MSR_IA32_MCU_OPT_CTRL,
1379	    (x86_rngds_mitg_enable ? MSR_OP_OR : MSR_OP_ANDNOT) |
1380	    (all_cpus ? MSR_OP_RENDEZVOUS : MSR_OP_LOCAL),
1381	    IA32_RNGDS_MITG_DIS);
1382}
1383
1384static int
1385sysctl_rngds_mitg_enable_handler(SYSCTL_HANDLER_ARGS)
1386{
1387	int error, val;
1388
1389	val = x86_rngds_mitg_enable;
1390	error = sysctl_handle_int(oidp, &val, 0, req);
1391	if (error != 0 || req->newptr == NULL)
1392		return (error);
1393	x86_rngds_mitg_enable = val;
1394	x86_rngds_mitg_recalculate(true);
1395	return (0);
1396}
1397SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, enable, CTLTYPE_INT |
1398    CTLFLAG_RWTUN | CTLFLAG_NOFETCH | CTLFLAG_MPSAFE, NULL, 0,
1399    sysctl_rngds_mitg_enable_handler, "I",
1400    "MCU Optimization, disabling RDSEED mitigation control "
1401    "(0 - mitigation disabled (RDSEED optimized), 1 - mitigation enabled");
1402
1403static int
1404sysctl_rngds_state_handler(SYSCTL_HANDLER_ARGS)
1405{
1406	const char *state;
1407
1408	if ((cpu_stdext_feature3 & CPUID_STDEXT3_MCUOPT) == 0) {
1409		state = "Not applicable";
1410	} else if (x86_rngds_mitg_enable == 0) {
1411		state = "RDSEED not serialized";
1412	} else {
1413		state = "Mitigated";
1414	}
1415	return (SYSCTL_OUT(req, state, strlen(state)));
1416}
1417SYSCTL_PROC(_machdep_mitigations_rngds, OID_AUTO, state,
1418    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1419    sysctl_rngds_state_handler, "A",
1420    "MCU Optimization state");
1421