vmx.c revision 264619
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 264619 2014-04-17 18:00:07Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 264619 2014-04-17 18:00:07Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/smp.h>
35#include <sys/kernel.h>
36#include <sys/malloc.h>
37#include <sys/pcpu.h>
38#include <sys/proc.h>
39#include <sys/sysctl.h>
40
41#include <vm/vm.h>
42#include <vm/pmap.h>
43
44#include <machine/psl.h>
45#include <machine/cpufunc.h>
46#include <machine/md_var.h>
47#include <machine/segments.h>
48#include <machine/specialreg.h>
49#include <machine/vmparam.h>
50
51#include <machine/vmm.h>
52#include "vmm_host.h"
53#include "vmm_lapic.h"
54#include "vmm_msr.h"
55#include "vmm_ktr.h"
56#include "vmm_stat.h"
57
58#include "vmx_msr.h"
59#include "ept.h"
60#include "vmx_cpufunc.h"
61#include "vmx.h"
62#include "x86.h"
63#include "vmx_controls.h"
64
65#define	PINBASED_CTLS_ONE_SETTING					\
66	(PINBASED_EXTINT_EXITING	|				\
67	 PINBASED_NMI_EXITING		|				\
68	 PINBASED_VIRTUAL_NMI)
69#define	PINBASED_CTLS_ZERO_SETTING	0
70
71#define PROCBASED_CTLS_WINDOW_SETTING					\
72	(PROCBASED_INT_WINDOW_EXITING	|				\
73	 PROCBASED_NMI_WINDOW_EXITING)
74
75#define	PROCBASED_CTLS_ONE_SETTING 					\
76	(PROCBASED_SECONDARY_CONTROLS	|				\
77	 PROCBASED_IO_EXITING		|				\
78	 PROCBASED_MSR_BITMAPS		|				\
79	 PROCBASED_CTLS_WINDOW_SETTING)
80#define	PROCBASED_CTLS_ZERO_SETTING	\
81	(PROCBASED_CR3_LOAD_EXITING |	\
82	PROCBASED_CR3_STORE_EXITING |	\
83	PROCBASED_IO_BITMAPS)
84
85#define	PROCBASED_CTLS2_ONE_SETTING	PROCBASED2_ENABLE_EPT
86#define	PROCBASED_CTLS2_ZERO_SETTING	0
87
88#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT					\
89	(VM_EXIT_HOST_LMA			|			\
90	VM_EXIT_SAVE_EFER			|			\
91	VM_EXIT_LOAD_EFER)
92
93#define	VM_EXIT_CTLS_ONE_SETTING					\
94	(VM_EXIT_CTLS_ONE_SETTING_NO_PAT       	|			\
95	VM_EXIT_SAVE_PAT			|			\
96	VM_EXIT_LOAD_PAT)
97#define	VM_EXIT_CTLS_ZERO_SETTING	VM_EXIT_SAVE_DEBUG_CONTROLS
98
99#define	VM_ENTRY_CTLS_ONE_SETTING_NO_PAT	VM_ENTRY_LOAD_EFER
100
101#define	VM_ENTRY_CTLS_ONE_SETTING					\
102	(VM_ENTRY_CTLS_ONE_SETTING_NO_PAT     	|			\
103	VM_ENTRY_LOAD_PAT)
104#define	VM_ENTRY_CTLS_ZERO_SETTING					\
105	(VM_ENTRY_LOAD_DEBUG_CONTROLS		|			\
106	VM_ENTRY_INTO_SMM			|			\
107	VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
108
109#define	guest_msr_rw(vmx, msr) \
110	msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
111
112#define	HANDLED		1
113#define	UNHANDLED	0
114
115MALLOC_DEFINE(M_VMX, "vmx", "vmx");
116
117SYSCTL_DECL(_hw_vmm);
118SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
119
120int vmxon_enabled[MAXCPU];
121static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
122
123static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
124static uint32_t exit_ctls, entry_ctls;
125
126static uint64_t cr0_ones_mask, cr0_zeros_mask;
127SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
128	     &cr0_ones_mask, 0, NULL);
129SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
130	     &cr0_zeros_mask, 0, NULL);
131
132static uint64_t cr4_ones_mask, cr4_zeros_mask;
133SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
134	     &cr4_ones_mask, 0, NULL);
135SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
136	     &cr4_zeros_mask, 0, NULL);
137
138static int vmx_no_patmsr;
139
140static int vmx_initialized;
141SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
142	   &vmx_initialized, 0, "Intel VMX initialized");
143
144/*
145 * Virtual NMI blocking conditions.
146 *
147 * Some processor implementations also require NMI to be blocked if
148 * the STI_BLOCKING bit is set. It is possible to detect this at runtime
149 * based on the (exit_reason,exit_qual) tuple being set to
150 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING).
151 *
152 * We take the easy way out and also include STI_BLOCKING as one of the
153 * gating items for vNMI injection.
154 */
155static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING |
156				    VMCS_INTERRUPTIBILITY_NMI_BLOCKING |
157				    VMCS_INTERRUPTIBILITY_STI_BLOCKING;
158
159/*
160 * Optional capabilities
161 */
162static int cap_halt_exit;
163static int cap_pause_exit;
164static int cap_unrestricted_guest;
165static int cap_monitor_trap;
166static int cap_invpcid;
167
168static struct unrhdr *vpid_unr;
169static u_int vpid_alloc_failed;
170SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
171	    &vpid_alloc_failed, 0, NULL);
172
173#ifdef KTR
174static const char *
175exit_reason_to_str(int reason)
176{
177	static char reasonbuf[32];
178
179	switch (reason) {
180	case EXIT_REASON_EXCEPTION:
181		return "exception";
182	case EXIT_REASON_EXT_INTR:
183		return "extint";
184	case EXIT_REASON_TRIPLE_FAULT:
185		return "triplefault";
186	case EXIT_REASON_INIT:
187		return "init";
188	case EXIT_REASON_SIPI:
189		return "sipi";
190	case EXIT_REASON_IO_SMI:
191		return "iosmi";
192	case EXIT_REASON_SMI:
193		return "smi";
194	case EXIT_REASON_INTR_WINDOW:
195		return "intrwindow";
196	case EXIT_REASON_NMI_WINDOW:
197		return "nmiwindow";
198	case EXIT_REASON_TASK_SWITCH:
199		return "taskswitch";
200	case EXIT_REASON_CPUID:
201		return "cpuid";
202	case EXIT_REASON_GETSEC:
203		return "getsec";
204	case EXIT_REASON_HLT:
205		return "hlt";
206	case EXIT_REASON_INVD:
207		return "invd";
208	case EXIT_REASON_INVLPG:
209		return "invlpg";
210	case EXIT_REASON_RDPMC:
211		return "rdpmc";
212	case EXIT_REASON_RDTSC:
213		return "rdtsc";
214	case EXIT_REASON_RSM:
215		return "rsm";
216	case EXIT_REASON_VMCALL:
217		return "vmcall";
218	case EXIT_REASON_VMCLEAR:
219		return "vmclear";
220	case EXIT_REASON_VMLAUNCH:
221		return "vmlaunch";
222	case EXIT_REASON_VMPTRLD:
223		return "vmptrld";
224	case EXIT_REASON_VMPTRST:
225		return "vmptrst";
226	case EXIT_REASON_VMREAD:
227		return "vmread";
228	case EXIT_REASON_VMRESUME:
229		return "vmresume";
230	case EXIT_REASON_VMWRITE:
231		return "vmwrite";
232	case EXIT_REASON_VMXOFF:
233		return "vmxoff";
234	case EXIT_REASON_VMXON:
235		return "vmxon";
236	case EXIT_REASON_CR_ACCESS:
237		return "craccess";
238	case EXIT_REASON_DR_ACCESS:
239		return "draccess";
240	case EXIT_REASON_INOUT:
241		return "inout";
242	case EXIT_REASON_RDMSR:
243		return "rdmsr";
244	case EXIT_REASON_WRMSR:
245		return "wrmsr";
246	case EXIT_REASON_INVAL_VMCS:
247		return "invalvmcs";
248	case EXIT_REASON_INVAL_MSR:
249		return "invalmsr";
250	case EXIT_REASON_MWAIT:
251		return "mwait";
252	case EXIT_REASON_MTF:
253		return "mtf";
254	case EXIT_REASON_MONITOR:
255		return "monitor";
256	case EXIT_REASON_PAUSE:
257		return "pause";
258	case EXIT_REASON_MCE:
259		return "mce";
260	case EXIT_REASON_TPR:
261		return "tpr";
262	case EXIT_REASON_APIC:
263		return "apic";
264	case EXIT_REASON_GDTR_IDTR:
265		return "gdtridtr";
266	case EXIT_REASON_LDTR_TR:
267		return "ldtrtr";
268	case EXIT_REASON_EPT_FAULT:
269		return "eptfault";
270	case EXIT_REASON_EPT_MISCONFIG:
271		return "eptmisconfig";
272	case EXIT_REASON_INVEPT:
273		return "invept";
274	case EXIT_REASON_RDTSCP:
275		return "rdtscp";
276	case EXIT_REASON_VMX_PREEMPT:
277		return "vmxpreempt";
278	case EXIT_REASON_INVVPID:
279		return "invvpid";
280	case EXIT_REASON_WBINVD:
281		return "wbinvd";
282	case EXIT_REASON_XSETBV:
283		return "xsetbv";
284	default:
285		snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
286		return (reasonbuf);
287	}
288}
289#endif	/* KTR */
290
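/*
 * Force the CR0 bits that VMX operation requires to be 1 and clear the
 * bits that are required to be 0, using the cr0_ones_mask and
 * cr0_zeros_mask computed in vmx_init().
 */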
291u_long
292vmx_fix_cr0(u_long cr0)
293{
294
295	return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
296}
297
298u_long
299vmx_fix_cr4(u_long cr4)
300{
301
302	return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
303}
304
305static void
306vpid_free(int vpid)
307{
308	if (vpid < 0 || vpid > 0xffff)
309		panic("vpid_free: invalid vpid %d", vpid);
310
311	/*
312	 * VPIDs [0,VM_MAXCPU] are special and are not allocated from
313	 * the unit number allocator.
314	 */
315
316	if (vpid > VM_MAXCPU)
317		free_unr(vpid_unr, vpid);
318}
319
320static void
321vpid_alloc(uint16_t *vpid, int num)
322{
323	int i, x;
324
325	if (num <= 0 || num > VM_MAXCPU)
326		panic("invalid number of vpids requested: %d", num);
327
328	/*
329	 * If the "enable vpid" execution control is not enabled then the
330	 * VPID is required to be 0 for all vcpus.
331	 */
332	if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
333		for (i = 0; i < num; i++)
334			vpid[i] = 0;
335		return;
336	}
337
338	/*
339	 * Allocate a unique VPID for each vcpu from the unit number allocator.
340	 */
341	for (i = 0; i < num; i++) {
342		x = alloc_unr(vpid_unr);
343		if (x == -1)
344			break;
345		else
346			vpid[i] = x;
347	}
348
349	if (i < num) {
350		atomic_add_int(&vpid_alloc_failed, 1);
351
352		/*
353		 * If the unit number allocator does not have enough unique
354		 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
355		 *
356		 * These VPIDs are not unique across VMs but this does not
357		 * affect correctness because the combined mappings are also
358		 * tagged with the EP4TA which is unique for each VM.
359		 *
360		 * It is still sub-optimal because the invvpid will invalidate
361		 * combined mappings for a particular VPID across all EP4TAs.
362		 */
363		while (i-- > 0)
364			vpid_free(vpid[i]);
365
366		for (i = 0; i < num; i++)
367			vpid[i] = i + 1;
368	}
369}
370
371static void
372vpid_init(void)
373{
374	/*
375	 * VPID 0 is required when the "enable VPID" execution control is
376	 * disabled.
377	 *
378	 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
379	 * unit number allocator does not have sufficient unique VPIDs to
380	 * satisfy the allocation.
381	 *
382	 * The remaining VPIDs are managed by the unit number allocator.
383	 */
384	vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
385}
386
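/*
 * Build the guest MSR save area that is saved on VM exit and restored on
 * VM entry. Currently only MSR_KGSBASE is handled this way; see the
 * comment in vmx_vminit() for how the remaining MSRs are dealt with.
 */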
387static void
388msr_save_area_init(struct msr_entry *g_area, int *g_count)
389{
390	int cnt;
391
392	static struct msr_entry guest_msrs[] = {
393		{ MSR_KGSBASE, 0, 0 },
394	};
395
396	cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
397	if (cnt > GUEST_MSR_MAX_ENTRIES)
398		panic("guest msr save area overrun");
399	bcopy(guest_msrs, g_area, sizeof(guest_msrs));
400	*g_count = cnt;
401}
402
403static void
404vmx_disable(void *arg __unused)
405{
406	struct invvpid_desc invvpid_desc = { 0 };
407	struct invept_desc invept_desc = { 0 };
408
409	if (vmxon_enabled[curcpu]) {
410		/*
411		 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
412		 *
413		 * VMXON or VMXOFF are not required to invalidate any TLB
414		 * caching structures, so invalidate them explicitly here to avoid
415		 * retaining cached TLB contents between distinct VMX episodes.
416		 */
417		invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
418		invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
419		vmxoff();
420	}
421	load_cr4(rcr4() & ~CR4_VMXE);
422}
423
424static int
425vmx_cleanup(void)
426{
427
428	if (vpid_unr != NULL) {
429		delete_unrhdr(vpid_unr);
430		vpid_unr = NULL;
431	}
432
433	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
434
435	return (0);
436}
437
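/*
 * Enable VMX operation on this cpu: set CR4.VMXE, initialize the per-cpu
 * VMXON region with the VMX revision identifier and execute VMXON.
 */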
438static void
439vmx_enable(void *arg __unused)
440{
441	int error;
442
443	load_cr4(rcr4() | CR4_VMXE);
444
445	*(uint32_t *)vmxon_region[curcpu] = vmx_revision();
446	error = vmxon(vmxon_region[curcpu]);
447	if (error == 0)
448		vmxon_enabled[curcpu] = 1;
449}
450
451static void
452vmx_restore(void)
453{
454
455	if (vmxon_enabled[curcpu])
456		vmxon(vmxon_region[curcpu]);
457}
458
459static int
460vmx_init(void)
461{
462	int error;
463	uint64_t fixed0, fixed1, feature_control;
464	uint32_t tmp;
465
466	/* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
467	if (!(cpu_feature2 & CPUID2_VMX)) {
468		printf("vmx_init: processor does not support VMX operation\n");
469		return (ENXIO);
470	}
471
472	/*
473	 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
474	 * are set (bits 0 and 2 respectively).
475	 */
476	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
477	if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
478	    (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
479		printf("vmx_init: VMX operation disabled by BIOS\n");
480		return (ENXIO);
481	}
482
483	/* Check support for primary processor-based VM-execution controls */
484	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
485			       MSR_VMX_TRUE_PROCBASED_CTLS,
486			       PROCBASED_CTLS_ONE_SETTING,
487			       PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
488	if (error) {
489		printf("vmx_init: processor does not support desired primary "
490		       "processor-based controls\n");
491		return (error);
492	}
493
494	/* Clear the processor-based ctl bits that are set on demand */
495	procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
496
497	/* Check support for secondary processor-based VM-execution controls */
498	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
499			       MSR_VMX_PROCBASED_CTLS2,
500			       PROCBASED_CTLS2_ONE_SETTING,
501			       PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
502	if (error) {
503		printf("vmx_init: processor does not support desired secondary "
504		       "processor-based controls\n");
505		return (error);
506	}
507
508	/* Check support for VPID */
509	error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
510			       PROCBASED2_ENABLE_VPID, 0, &tmp);
511	if (error == 0)
512		procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
513
514	/* Check support for pin-based VM-execution controls */
515	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
516			       MSR_VMX_TRUE_PINBASED_CTLS,
517			       PINBASED_CTLS_ONE_SETTING,
518			       PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
519	if (error) {
520		printf("vmx_init: processor does not support desired "
521		       "pin-based controls\n");
522		return (error);
523	}
524
525	/* Check support for VM-exit controls */
526	error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
527			       VM_EXIT_CTLS_ONE_SETTING,
528			       VM_EXIT_CTLS_ZERO_SETTING,
529			       &exit_ctls);
530	if (error) {
531		/* Try again without the PAT MSR bits */
532		error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS,
533				       MSR_VMX_TRUE_EXIT_CTLS,
534				       VM_EXIT_CTLS_ONE_SETTING_NO_PAT,
535				       VM_EXIT_CTLS_ZERO_SETTING,
536				       &exit_ctls);
537		if (error) {
538			printf("vmx_init: processor does not support desired "
539			       "exit controls\n");
540			return (error);
541		} else {
542			if (bootverbose)
543				printf("vmm: PAT MSR access not supported\n");
544			guest_msr_valid(MSR_PAT);
545			vmx_no_patmsr = 1;
546		}
547	}
548
549	/* Check support for VM-entry controls */
550	if (!vmx_no_patmsr) {
551		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
552				       MSR_VMX_TRUE_ENTRY_CTLS,
553				       VM_ENTRY_CTLS_ONE_SETTING,
554				       VM_ENTRY_CTLS_ZERO_SETTING,
555				       &entry_ctls);
556	} else {
557		error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS,
558				       MSR_VMX_TRUE_ENTRY_CTLS,
559				       VM_ENTRY_CTLS_ONE_SETTING_NO_PAT,
560				       VM_ENTRY_CTLS_ZERO_SETTING,
561				       &entry_ctls);
562	}
563
564	if (error) {
565		printf("vmx_init: processor does not support desired "
566		       "entry controls\n");
567		return (error);
568	}
569
570	/*
571	 * Check support for optional features by testing them
572	 * as individual bits
573	 */
574	cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
575					MSR_VMX_TRUE_PROCBASED_CTLS,
576					PROCBASED_HLT_EXITING, 0,
577					&tmp) == 0);
578
579	cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
580					MSR_VMX_PROCBASED_CTLS,
581					PROCBASED_MTF, 0,
582					&tmp) == 0);
583
584	cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
585					 MSR_VMX_TRUE_PROCBASED_CTLS,
586					 PROCBASED_PAUSE_EXITING, 0,
587					 &tmp) == 0);
588
589	cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
590					MSR_VMX_PROCBASED_CTLS2,
591					PROCBASED2_UNRESTRICTED_GUEST, 0,
592				        &tmp) == 0);
593
594	cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
595	    MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
596	    &tmp) == 0);
597
598
599	/* Initialize EPT */
600	error = ept_init();
601	if (error) {
602		printf("vmx_init: ept initialization failed (%d)\n", error);
603		return (error);
604	}
605
606	/*
607	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
608	 */
609	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
610	fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
611	cr0_ones_mask = fixed0 & fixed1;
612	cr0_zeros_mask = ~fixed0 & ~fixed1;
613
614	/*
615	 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
616	 * if unrestricted guest execution is allowed.
617	 */
618	if (cap_unrestricted_guest)
619		cr0_ones_mask &= ~(CR0_PG | CR0_PE);
620
621	/*
622	 * Do not allow the guest to set CR0_NW or CR0_CD.
623	 */
624	cr0_zeros_mask |= (CR0_NW | CR0_CD);
625
626	fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
627	fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
628	cr4_ones_mask = fixed0 & fixed1;
629	cr4_zeros_mask = ~fixed0 & ~fixed1;
630
631	vpid_init();
632
633	/* enable VMX operation */
634	smp_rendezvous(NULL, vmx_enable, NULL, NULL);
635
636	vmx_initialized = 1;
637
638	return (0);
639}
640
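/*
 * Program the CR0/CR4 guest/host mask and read shadow in the VMCS.
 * Guest reads of bits that are set in the mask return the value from
 * the read shadow rather than the actual control register contents.
 */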
641static int
642vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
643{
644	int error, mask_ident, shadow_ident;
645	uint64_t mask_value;
646
647	if (which != 0 && which != 4)
648		panic("vmx_setup_cr_shadow: unknown cr%d", which);
649
650	if (which == 0) {
651		mask_ident = VMCS_CR0_MASK;
652		mask_value = cr0_ones_mask | cr0_zeros_mask;
653		shadow_ident = VMCS_CR0_SHADOW;
654	} else {
655		mask_ident = VMCS_CR4_MASK;
656		mask_value = cr4_ones_mask | cr4_zeros_mask;
657		shadow_ident = VMCS_CR4_SHADOW;
658	}
659
660	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
661	if (error)
662		return (error);
663
664	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
665	if (error)
666		return (error);
667
668	return (0);
669}
670#define	vmx_setup_cr0_shadow(vmcs,init)	vmx_setup_cr_shadow(0, (vmcs), (init))
671#define	vmx_setup_cr4_shadow(vmcs,init)	vmx_setup_cr_shadow(4, (vmcs), (init))
672
673static void *
674vmx_vminit(struct vm *vm, pmap_t pmap)
675{
676	uint16_t vpid[VM_MAXCPU];
677	int i, error, guest_msr_count;
678	struct vmx *vmx;
679
680	vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
681	if ((uintptr_t)vmx & PAGE_MASK) {
682		panic("malloc of struct vmx not aligned on %d byte boundary",
683		      PAGE_SIZE);
684	}
685	vmx->vm = vm;
686
687	vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
688
689	/*
690	 * Clean up EPTP-tagged guest physical and combined mappings
691	 *
692	 * VMX transitions are not required to invalidate any guest physical
693	 * mappings. So, it may be possible for stale guest physical mappings
694	 * to be present in the processor TLBs.
695	 *
696	 * Combined mappings for this EP4TA are also invalidated for all VPIDs.
697	 */
698	ept_invalidate_mappings(vmx->eptp);
699
700	msr_bitmap_initialize(vmx->msr_bitmap);
701
702	/*
703	 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
704	 * The guest FSBASE and GSBASE are saved and restored during
705	 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
706	 * always restored from the vmcs host state area on vm-exit.
707	 *
708	 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
709	 * how they are saved/restored so they can be directly accessed by the
710	 * guest.
711	 *
712	 * Guest KGSBASE is saved and restored in the guest MSR save area.
713	 * Host KGSBASE is restored before returning to userland from the pcb.
714	 * There will be a window of time when we are executing in the host
715	 * kernel context with a value of KGSBASE from the guest. This is ok
716	 * because the value of KGSBASE is inconsequential in kernel context.
717	 *
718	 * MSR_EFER is saved and restored in the guest VMCS area on a
719	 * VM exit and entry respectively. It is also restored from the
720	 * host VMCS area on a VM exit.
721	 */
722	if (guest_msr_rw(vmx, MSR_GSBASE) ||
723	    guest_msr_rw(vmx, MSR_FSBASE) ||
724	    guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
725	    guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
726	    guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
727	    guest_msr_rw(vmx, MSR_KGSBASE) ||
728	    guest_msr_rw(vmx, MSR_EFER))
729		panic("vmx_vminit: error setting guest msr access");
730
731	/*
732	 * MSR_PAT is saved and restored in the guest VMCS area on a VM exit
733	 * and entry respectively. It is also restored from the host VMCS
734	 * area on a VM exit. However, if running on a system with no
735	 * MSR_PAT save/restore support, leave access disabled so accesses
736	 * will be trapped.
737	 */
738	if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT))
739		panic("vmx_vminit: error setting guest pat msr access");
740
741	vpid_alloc(vpid, VM_MAXCPU);
742
743	for (i = 0; i < VM_MAXCPU; i++) {
744		vmx->vmcs[i].identifier = vmx_revision();
745		error = vmclear(&vmx->vmcs[i]);
746		if (error != 0) {
747			panic("vmx_vminit: vmclear error %d on vcpu %d\n",
748			      error, i);
749		}
750
751		error = vmcs_set_defaults(&vmx->vmcs[i],
752					  (u_long)vmx_exit_guest,
753					  (u_long)&vmx->ctx[i],
754					  vmx->eptp,
755					  pinbased_ctls,
756					  procbased_ctls,
757					  procbased_ctls2,
758					  exit_ctls, entry_ctls,
759					  vtophys(vmx->msr_bitmap),
760					  vpid[i]);
761
762		if (error != 0)
763			panic("vmx_vminit: vmcs_set_defaults error %d", error);
764
765		vmx->cap[i].set = 0;
766		vmx->cap[i].proc_ctls = procbased_ctls;
767		vmx->cap[i].proc_ctls2 = procbased_ctls2;
768
769		vmx->state[i].lastcpu = -1;
770		vmx->state[i].vpid = vpid[i];
771
772		msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
773
774		error = vmcs_set_msr_save(&vmx->vmcs[i],
775					  vtophys(vmx->guest_msrs[i]),
776					  guest_msr_count);
777		if (error != 0)
778			panic("vmcs_set_msr_save error %d", error);
779
780		/*
781		 * Set up the CR0/4 shadows, and init the read shadow
782		 * to the power-on register value from the Intel Sys Arch.
783		 *  CR0 - 0x60000010
784		 *  CR4 - 0
785		 */
786		error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010);
787		if (error != 0)
788			panic("vmx_setup_cr0_shadow %d", error);
789
790		error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0);
791		if (error != 0)
792			panic("vmx_setup_cr4_shadow %d", error);
793
794		vmx->ctx[i].pmap = pmap;
795		vmx->ctx[i].eptp = vmx->eptp;
796	}
797
798	return (vmx);
799}
800
801static int
802vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
803{
804	int handled, func;
805
806	func = vmxctx->guest_rax;
807
808	handled = x86_emulate_cpuid(vm, vcpu,
809				    (uint32_t*)(&vmxctx->guest_rax),
810				    (uint32_t*)(&vmxctx->guest_rbx),
811				    (uint32_t*)(&vmxctx->guest_rcx),
812				    (uint32_t*)(&vmxctx->guest_rdx));
813	return (handled);
814}
815
816static __inline void
817vmx_run_trace(struct vmx *vmx, int vcpu)
818{
819#ifdef KTR
820	VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
821#endif
822}
823
824static __inline void
825vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
826	       int handled)
827{
828#ifdef KTR
829	VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
830		 handled ? "handled" : "unhandled",
831		 exit_reason_to_str(exit_reason), rip);
832#endif
833}
834
835static __inline void
836vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
837{
838#ifdef KTR
839	VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
840#endif
841}
842
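/*
 * Update host state in the VMCS and flush stale VPID-tagged mappings
 * when a vcpu is run on a host cpu other than the one it last ran on.
 */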
843static void
844vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
845{
846	int lastcpu;
847	struct vmxstate *vmxstate;
848	struct invvpid_desc invvpid_desc = { 0 };
849
850	vmxstate = &vmx->state[vcpu];
851	lastcpu = vmxstate->lastcpu;
852	vmxstate->lastcpu = curcpu;
853
854	if (lastcpu == curcpu)
855		return;
856
857	vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
858
859	vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
860	vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
861	vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
862
863	/*
864	 * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
865	 *
866	 * We do this because this vcpu was executing on a different host
867	 * cpu when it last ran. We do not track whether it invalidated
868	 * mappings associated with its 'vpid' during that run. So we must
869	 * assume that the mappings associated with 'vpid' on 'curcpu' are
870	 * stale and invalidate them.
871	 *
872	 * Note that we incur this penalty only when the scheduler chooses to
873	 * move the thread associated with this vcpu between host cpus.
874	 *
875	 * Note also that this will invalidate mappings tagged with 'vpid'
876	 * for "all" EP4TAs.
877	 */
878	if (vmxstate->vpid != 0) {
879		invvpid_desc.vpid = vmxstate->vpid;
880		invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
881	}
882}
883
884/*
885 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
886 */
887CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
888
889static void __inline
890vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
891{
892
893	vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
894	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
895}
896
897static void __inline
898vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
899{
900
901	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
902	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
903}
904
905static void __inline
906vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
907{
908
909	vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
910	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
911}
912
913static void __inline
914vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
915{
916
917	vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
918	vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
919}
920
921static int
922vmx_inject_nmi(struct vmx *vmx, int vcpu)
923{
924	uint64_t info, interruptibility;
925
926	/* Bail out if no NMI requested */
927	if (!vm_nmi_pending(vmx->vm, vcpu))
928		return (0);
929
930	interruptibility = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
931	if (interruptibility & nmi_blocking_bits)
932		goto nmiblocked;
933
934	/*
935	 * Inject the virtual NMI. The vector must be the NMI IDT entry
936	 * or the VMCS entry check will fail.
937	 */
938	info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID;
939	info |= IDT_NMI;
940	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
941
942	VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
943
944	/* Clear the request */
945	vm_nmi_clear(vmx->vm, vcpu);
946	return (1);
947
948nmiblocked:
949	/*
950	 * Set the NMI Window Exiting execution control so we can inject
951	 * the virtual NMI as soon as blocking condition goes away.
952	 * the virtual NMI as soon as the blocking condition goes away.
953	vmx_set_nmi_window_exiting(vmx, vcpu);
954
955	VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
956	return (1);
957}
958
959static void
960vmx_inject_interrupts(struct vmx *vmx, int vcpu)
961{
962	int vector;
963	uint64_t info, rflags, interruptibility;
964
965	const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING |
966				   VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING;
967
968	/*
969	 * If there is already an interrupt pending then just return.
970	 *
971	 * This could happen if an interrupt was injected on a prior
972	 * VM entry but the actual entry into guest mode was aborted
973	 * because of a pending AST.
974	 */
975	info = vmcs_read(VMCS_ENTRY_INTR_INFO);
976	if (info & VMCS_INTERRUPTION_INFO_VALID)
977		return;
978
979	/*
980	 * NMI injection has priority so deal with those first
981	 */
982	if (vmx_inject_nmi(vmx, vcpu))
983		return;
984
985	/* Ask the local apic for a vector to inject */
986	vector = lapic_pending_intr(vmx->vm, vcpu);
987	if (vector < 0)
988		return;
989
990	if (vector < 32 || vector > 255)
991		panic("vmx_inject_interrupts: invalid vector %d\n", vector);
992
993	/* Check RFLAGS.IF and the interruptibility state of the guest */
994	rflags = vmcs_read(VMCS_GUEST_RFLAGS);
995	if ((rflags & PSL_I) == 0)
996		goto cantinject;
997
998	interruptibility = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
999	if (interruptibility & HWINTR_BLOCKED)
1000		goto cantinject;
1001
1002	/* Inject the interrupt */
1003	info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID;
1004	info |= vector;
1005	vmcs_write(VMCS_ENTRY_INTR_INFO, info);
1006
1007	/* Update the Local APIC ISR */
1008	lapic_intr_accepted(vmx->vm, vcpu, vector);
1009
1010	VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
1011
1012	return;
1013
1014cantinject:
1015	/*
1016	 * Set the Interrupt Window Exiting execution control so we can inject
1017	 * the interrupt as soon as the blocking condition goes away.
1018	 */
1019	vmx_set_int_window_exiting(vmx, vcpu);
1020
1021	VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
1022}
1023
1024static int
1025vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
1026{
1027	int cr, vmcs_guest_cr, vmcs_shadow_cr;
1028	uint64_t crval, regval, ones_mask, zeros_mask;
1029	const struct vmxctx *vmxctx;
1030
1031	/* We only handle mov to %cr0 or %cr4 at this time */
1032	if ((exitqual & 0xf0) != 0x00)
1033		return (UNHANDLED);
1034
1035	cr = exitqual & 0xf;
1036	if (cr != 0 && cr != 4)
1037		return (UNHANDLED);
1038
1039	vmxctx = &vmx->ctx[vcpu];
1040
1041	/*
1042	 * We must use vmcs_write() directly here because vmcs_setreg() will
1043	 * call vmclear(vmcs) as a side-effect which we certainly don't want.
1044	 */
1045	switch ((exitqual >> 8) & 0xf) {
1046	case 0:
1047		regval = vmxctx->guest_rax;
1048		break;
1049	case 1:
1050		regval = vmxctx->guest_rcx;
1051		break;
1052	case 2:
1053		regval = vmxctx->guest_rdx;
1054		break;
1055	case 3:
1056		regval = vmxctx->guest_rbx;
1057		break;
1058	case 4:
1059		regval = vmcs_read(VMCS_GUEST_RSP);
1060		break;
1061	case 5:
1062		regval = vmxctx->guest_rbp;
1063		break;
1064	case 6:
1065		regval = vmxctx->guest_rsi;
1066		break;
1067	case 7:
1068		regval = vmxctx->guest_rdi;
1069		break;
1070	case 8:
1071		regval = vmxctx->guest_r8;
1072		break;
1073	case 9:
1074		regval = vmxctx->guest_r9;
1075		break;
1076	case 10:
1077		regval = vmxctx->guest_r10;
1078		break;
1079	case 11:
1080		regval = vmxctx->guest_r11;
1081		break;
1082	case 12:
1083		regval = vmxctx->guest_r12;
1084		break;
1085	case 13:
1086		regval = vmxctx->guest_r13;
1087		break;
1088	case 14:
1089		regval = vmxctx->guest_r14;
1090		break;
1091	case 15:
1092		regval = vmxctx->guest_r15;
1093		break;
1094	}
1095
1096	if (cr == 0) {
1097		ones_mask = cr0_ones_mask;
1098		zeros_mask = cr0_zeros_mask;
1099		vmcs_guest_cr = VMCS_GUEST_CR0;
1100		vmcs_shadow_cr = VMCS_CR0_SHADOW;
1101	} else {
1102		ones_mask = cr4_ones_mask;
1103		zeros_mask = cr4_zeros_mask;
1104		vmcs_guest_cr = VMCS_GUEST_CR4;
1105		vmcs_shadow_cr = VMCS_CR4_SHADOW;
1106	}
1107	vmcs_write(vmcs_shadow_cr, regval);
1108
1109	crval = regval | ones_mask;
1110	crval &= ~zeros_mask;
1111	vmcs_write(vmcs_guest_cr, crval);
1112
1113	if (cr == 0 && regval & CR0_PG) {
1114		uint64_t efer, entry_ctls;
1115
1116		/*
1117		 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
1118		 * the "IA-32e mode guest" bit in the VM-entry controls must be
1119		 * equal.
1120		 */
1121		efer = vmcs_read(VMCS_GUEST_IA32_EFER);
1122		if (efer & EFER_LME) {
1123			efer |= EFER_LMA;
1124			vmcs_write(VMCS_GUEST_IA32_EFER, efer);
1125			entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
1126			entry_ctls |= VM_ENTRY_GUEST_LMA;
1127			vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
1128		}
1129	}
1130
1131	return (HANDLED);
1132}
1133
1134static int
1135ept_fault_type(uint64_t ept_qual)
1136{
1137	int fault_type;
1138
1139	if (ept_qual & EPT_VIOLATION_DATA_WRITE)
1140		fault_type = VM_PROT_WRITE;
1141	else if (ept_qual & EPT_VIOLATION_INST_FETCH)
1142		fault_type = VM_PROT_EXECUTE;
1143	else
1144		fault_type = VM_PROT_READ;
1145
1146	return (fault_type);
1147}
1148
1149static boolean_t
1150ept_emulation_fault(uint64_t ept_qual)
1151{
1152	int read, write;
1153
1154	/* EPT fault on an instruction fetch doesn't make sense here */
1155	if (ept_qual & EPT_VIOLATION_INST_FETCH)
1156		return (FALSE);
1157
1158	/* EPT fault must be a read fault or a write fault */
1159	read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
1160	write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
1161	if ((read | write) == 0)
1162		return (FALSE);
1163
1164	/*
1165	 * The EPT violation must have been caused by accessing a
1166	 * guest-physical address that is a translation of a guest-linear
1167	 * address.
1168	 */
1169	if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
1170	    (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
1171		return (FALSE);
1172	}
1173
1174	return (TRUE);
1175}
1176
1177static int
1178vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1179{
1180	int error, handled;
1181	struct vmxctx *vmxctx;
1182	uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason;
1183	uint64_t qual, gpa;
1184	bool retu;
1185
1186	handled = 0;
1187	vmxctx = &vmx->ctx[vcpu];
1188
1189	qual = vmexit->u.vmx.exit_qualification;
1190	reason = vmexit->u.vmx.exit_reason;
1191	vmexit->exitcode = VM_EXITCODE_BOGUS;
1192
1193	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
1194
1195	/*
1196	 * VM exits that could be triggered during event injection on the
1197	 * previous VM entry need to be handled specially by re-injecting
1198	 * the event.
1199	 *
1200	 * See "Information for VM Exits During Event Delivery" in Intel SDM
1201	 * for details.
1202	 */
1203	switch (reason) {
1204	case EXIT_REASON_EPT_FAULT:
1205	case EXIT_REASON_EPT_MISCONFIG:
1206	case EXIT_REASON_APIC:
1207	case EXIT_REASON_TASK_SWITCH:
1208	case EXIT_REASON_EXCEPTION:
1209		idtvec_info = vmcs_idt_vectoring_info();
1210		if (idtvec_info & VMCS_IDT_VEC_VALID) {
1211			idtvec_info &= ~(1 << 12); /* clear undefined bit */
1212			vmcs_write(VMCS_ENTRY_INTR_INFO, idtvec_info);
1213			if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
1214				idtvec_err = vmcs_idt_vectoring_err();
1215				vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR,
1216				    idtvec_err);
1217			}
1218			vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
1219		}
1220	default:
1221		break;
1222	}
1223
1224	switch (reason) {
1225	case EXIT_REASON_CR_ACCESS:
1226		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
1227		handled = vmx_emulate_cr_access(vmx, vcpu, qual);
1228		break;
1229	case EXIT_REASON_RDMSR:
1230		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
1231		retu = false;
1232		ecx = vmxctx->guest_rcx;
1233		error = emulate_rdmsr(vmx->vm, vcpu, ecx, &retu);
1234		if (error) {
1235			vmexit->exitcode = VM_EXITCODE_RDMSR;
1236			vmexit->u.msr.code = ecx;
1237		} else if (!retu) {
1238			handled = 1;
1239		} else {
1240			/* Return to userspace with a valid exitcode */
1241			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1242			    ("emulate_rdmsr retu with bogus exitcode"));
1243		}
1244		break;
1245	case EXIT_REASON_WRMSR:
1246		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
1247		retu = false;
1248		eax = vmxctx->guest_rax;
1249		ecx = vmxctx->guest_rcx;
1250		edx = vmxctx->guest_rdx;
1251		error = emulate_wrmsr(vmx->vm, vcpu, ecx,
1252		    (uint64_t)edx << 32 | eax, &retu);
1253		if (error) {
1254			vmexit->exitcode = VM_EXITCODE_WRMSR;
1255			vmexit->u.msr.code = ecx;
1256			vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
1257		} else if (!retu) {
1258			handled = 1;
1259		} else {
1260			/* Return to userspace with a valid exitcode */
1261			KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
1262			    ("emulate_wrmsr retu with bogus exitcode"));
1263		}
1264		break;
1265	case EXIT_REASON_HLT:
1266		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
1267		vmexit->exitcode = VM_EXITCODE_HLT;
1268		vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
1269		break;
1270	case EXIT_REASON_MTF:
1271		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
1272		vmexit->exitcode = VM_EXITCODE_MTRAP;
1273		break;
1274	case EXIT_REASON_PAUSE:
1275		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
1276		vmexit->exitcode = VM_EXITCODE_PAUSE;
1277		break;
1278	case EXIT_REASON_INTR_WINDOW:
1279		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
1280		vmx_clear_int_window_exiting(vmx, vcpu);
1281		VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
1282		return (1);
1283	case EXIT_REASON_EXT_INTR:
1284		/*
1285		 * External interrupts serve only to cause VM exits and allow
1286		 * the host interrupt handler to run.
1287		 *
1288		 * If this external interrupt triggers a virtual interrupt
1289		 * to a VM, then that state will be recorded by the
1290		 * host interrupt handler in the VM's softc. We will inject
1291		 * this virtual interrupt during the subsequent VM enter.
1292		 */
1293
1294		/*
1295		 * This is special. We want to treat this as a 'handled'
1296		 * VM-exit but not increment the instruction pointer.
1297		 */
1298		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
1299		return (1);
1300	case EXIT_REASON_NMI_WINDOW:
1301		/* Exit to allow the pending virtual NMI to be injected */
1302		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
1303		vmx_clear_nmi_window_exiting(vmx, vcpu);
1304		VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
1305		return (1);
1306	case EXIT_REASON_INOUT:
1307		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
1308		vmexit->exitcode = VM_EXITCODE_INOUT;
1309		vmexit->u.inout.bytes = (qual & 0x7) + 1;
1310		vmexit->u.inout.in = (qual & 0x8) ? 1 : 0;
1311		vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
1312		vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
1313		vmexit->u.inout.port = (uint16_t)(qual >> 16);
1314		vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
1315		break;
1316	case EXIT_REASON_CPUID:
1317		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
1318		handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
1319		break;
1320	case EXIT_REASON_EPT_FAULT:
1321		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1);
1322		/*
1323		 * If 'gpa' lies within the address space allocated to
1324		 * memory then this must be a nested page fault otherwise
1325		 * this must be an instruction that accesses MMIO space.
1326		 */
1327		gpa = vmcs_gpa();
1328		if (vm_mem_allocated(vmx->vm, gpa)) {
1329			vmexit->exitcode = VM_EXITCODE_PAGING;
1330			vmexit->u.paging.gpa = gpa;
1331			vmexit->u.paging.fault_type = ept_fault_type(qual);
1332		} else if (ept_emulation_fault(qual)) {
1333			vmexit->exitcode = VM_EXITCODE_INST_EMUL;
1334			vmexit->u.inst_emul.gpa = gpa;
1335			vmexit->u.inst_emul.gla = vmcs_gla();
1336			vmexit->u.inst_emul.cr3 = vmcs_guest_cr3();
1337		}
1338		break;
1339	default:
1340		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
1341		break;
1342	}
1343
1344	if (handled) {
1345		/*
1346		 * It is possible that control is returned to userland
1347		 * even though we were able to handle the VM exit in the
1348		 * kernel.
1349		 *
1350		 * In such a case we want to make sure that the userland
1351		 * restarts guest execution at the instruction *after*
1352		 * the one we just processed. Therefore we update the
1353		 * guest rip in the VMCS and in 'vmexit'.
1354		 */
1355		vmexit->rip += vmexit->inst_length;
1356		vmexit->inst_length = 0;
1357		vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
1358	} else {
1359		if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
1360			/*
1361			 * If this VM exit was not claimed by anybody then
1362			 * treat it as a generic VMX exit.
1363			 */
1364			vmexit->exitcode = VM_EXITCODE_VMX;
1365			vmexit->u.vmx.status = VM_SUCCESS;
1366		} else {
1367			/*
1368			 * The exitcode and collateral have been populated.
1369			 * The VM exit will be processed further in userland.
1370			 */
1371		}
1372	}
1373	return (handled);
1374}
1375
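/*
 * The VM entry was aborted because of a pending AST. Record the guest
 * %rip with a zero instruction length and a BOGUS exitcode so the guest
 * can be resumed at the same instruction once the AST has been handled.
 */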
1376static __inline int
1377vmx_exit_astpending(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
1378{
1379
1380	vmexit->rip = vmcs_guest_rip();
1381	vmexit->inst_length = 0;
1382	vmexit->exitcode = VM_EXITCODE_BOGUS;
1383	vmx_astpending_trace(vmx, vcpu, vmexit->rip);
1384	vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
1385
1386	return (HANDLED);
1387}
1388
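/*
 * vmx_enter_guest() failed to launch or resume the guest. Convert the
 * failure into a VM_EXITCODE_VMX exit that carries the VM-instruction
 * error number back to userspace.
 */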
1389static __inline int
1390vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
1391{
1392
1393	KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
1394	    ("vmx_exit_inst_error: invalid inst_fail_status %d",
1395	    vmxctx->inst_fail_status));
1396
1397	vmexit->inst_length = 0;
1398	vmexit->exitcode = VM_EXITCODE_VMX;
1399	vmexit->u.vmx.status = vmxctx->inst_fail_status;
1400	vmexit->u.vmx.inst_error = vmcs_instruction_error();
1401	vmexit->u.vmx.exit_reason = ~0;
1402	vmexit->u.vmx.exit_qualification = ~0;
1403
1404	switch (rc) {
1405	case VMX_VMRESUME_ERROR:
1406	case VMX_VMLAUNCH_ERROR:
1407	case VMX_INVEPT_ERROR:
1408		vmexit->u.vmx.inst_type = rc;
1409		break;
1410	default:
1411		panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc);
1412	}
1413
1414	return (UNHANDLED);
1415}
1416
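/*
 * Enter the guest and keep running it as long as VM exits can be handled
 * entirely in the kernel; return when a VM exit must be completed in
 * userspace.
 */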
1417static int
1418vmx_run(void *arg, int vcpu, register_t startrip, pmap_t pmap)
1419{
1420	int rc, handled, launched;
1421	struct vmx *vmx;
1422	struct vmxctx *vmxctx;
1423	struct vmcs *vmcs;
1424	struct vm_exit *vmexit;
1425	uint64_t rip;
1426	uint32_t exit_reason;
1427
1428	vmx = arg;
1429	vmcs = &vmx->vmcs[vcpu];
1430	vmxctx = &vmx->ctx[vcpu];
1431	vmexit = vm_exitinfo(vmx->vm, vcpu);
1432	launched = 0;
1433
1434	KASSERT(vmxctx->pmap == pmap,
1435	    ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
1436	KASSERT(vmxctx->eptp == vmx->eptp,
1437	    ("eptp %#lx different than ctx eptp %#lx", vmx->eptp, vmxctx->eptp));
1438
1439	VMPTRLD(vmcs);
1440
1441	/*
1442	 * XXX
1443	 * We do this every time because we may setup the virtual machine
1444	 * from a different process than the one that actually runs it.
1445	 *
1446	 * If the life of a virtual machine was spent entirely in the context
1447	 * of a single process we could do this once in vmcs_set_defaults().
1448	 */
1449	vmcs_write(VMCS_HOST_CR3, rcr3());
1450
1451	vmcs_write(VMCS_GUEST_RIP, startrip);
1452	vmx_set_pcpu_defaults(vmx, vcpu);
1453	do {
1454		/*
1455		 * Interrupts are disabled from this point on until the
1456		 * guest starts executing. This is done for the following
1457		 * reasons:
1458		 *
1459		 * If an AST is asserted on this thread after the check below,
1460		 * then the IPI_AST notification will not be lost, because it
1461		 * will cause a VM exit due to external interrupt as soon as
1462		 * the guest state is loaded.
1463		 *
1464		 * A posted interrupt after 'vmx_inject_interrupts()' will
1465		 * not be "lost" because it will be held pending in the host
1466		 * APIC because interrupts are disabled. The pending interrupt
1467		 * will be recognized as soon as the guest state is loaded.
1468		 *
1469		 * The same reasoning applies to the IPI generated by
1470		 * pmap_invalidate_ept().
1471		 */
1472		disable_intr();
1473		if (curthread->td_flags & (TDF_ASTPENDING | TDF_NEEDRESCHED)) {
1474			enable_intr();
1475			handled = vmx_exit_astpending(vmx, vcpu, vmexit);
1476			break;
1477		}
1478
1479		vmx_inject_interrupts(vmx, vcpu);
1480		vmx_run_trace(vmx, vcpu);
1481		rc = vmx_enter_guest(vmxctx, launched);
1482
1483		enable_intr();
1484
1485		/* Collect some information for VM exit processing */
1486		vmexit->rip = rip = vmcs_guest_rip();
1487		vmexit->inst_length = vmexit_instruction_length();
1488		vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
1489		vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
1490
1491		if (rc == VMX_GUEST_VMEXIT) {
1492			launched = 1;
1493			handled = vmx_exit_process(vmx, vcpu, vmexit);
1494		} else {
1495			handled = vmx_exit_inst_error(vmxctx, rc, vmexit);
1496		}
1497
1498		vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
1499	} while (handled);
1500
1501	/*
1502	 * If a VM exit has been handled then the exitcode must be BOGUS.
1503	 * If a VM exit is not handled then the exitcode must not be BOGUS.
1504	 */
1505	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
1506	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
1507		panic("Mismatch between handled (%d) and exitcode (%d)",
1508		      handled, vmexit->exitcode);
1509	}
1510
1511	if (!handled)
1512		vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);
1513
1514	VCPU_CTR1(vmx->vm, vcpu, "returning from vmx_run: exitcode %d",
1515	    vmexit->exitcode);
1516
1517	VMCLEAR(vmcs);
1518	return (0);
1519}
1520
1521static void
1522vmx_vmcleanup(void *arg)
1523{
1524	int i, error;
1525	struct vmx *vmx = arg;
1526
1527	for (i = 0; i < VM_MAXCPU; i++)
1528		vpid_free(vmx->state[i].vpid);
1529
1530	/*
1531	 * XXXSMP we also need to clear the VMCS active on the other vcpus.
1532	 */
1533	error = vmclear(&vmx->vmcs[0]);
1534	if (error != 0)
1535		panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
1536
1537	free(vmx, M_VMX);
1538
1539	return;
1540}
1541
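/*
 * Return a pointer to the software-saved copy of a guest general purpose
 * register, or NULL for registers (e.g. %rip, %rsp, %cr0) that are kept
 * in the VMCS and must be accessed via vmcs_getreg()/vmcs_setreg().
 */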
1542static register_t *
1543vmxctx_regptr(struct vmxctx *vmxctx, int reg)
1544{
1545
1546	switch (reg) {
1547	case VM_REG_GUEST_RAX:
1548		return (&vmxctx->guest_rax);
1549	case VM_REG_GUEST_RBX:
1550		return (&vmxctx->guest_rbx);
1551	case VM_REG_GUEST_RCX:
1552		return (&vmxctx->guest_rcx);
1553	case VM_REG_GUEST_RDX:
1554		return (&vmxctx->guest_rdx);
1555	case VM_REG_GUEST_RSI:
1556		return (&vmxctx->guest_rsi);
1557	case VM_REG_GUEST_RDI:
1558		return (&vmxctx->guest_rdi);
1559	case VM_REG_GUEST_RBP:
1560		return (&vmxctx->guest_rbp);
1561	case VM_REG_GUEST_R8:
1562		return (&vmxctx->guest_r8);
1563	case VM_REG_GUEST_R9:
1564		return (&vmxctx->guest_r9);
1565	case VM_REG_GUEST_R10:
1566		return (&vmxctx->guest_r10);
1567	case VM_REG_GUEST_R11:
1568		return (&vmxctx->guest_r11);
1569	case VM_REG_GUEST_R12:
1570		return (&vmxctx->guest_r12);
1571	case VM_REG_GUEST_R13:
1572		return (&vmxctx->guest_r13);
1573	case VM_REG_GUEST_R14:
1574		return (&vmxctx->guest_r14);
1575	case VM_REG_GUEST_R15:
1576		return (&vmxctx->guest_r15);
1577	default:
1578		break;
1579	}
1580	return (NULL);
1581}
1582
1583static int
1584vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
1585{
1586	register_t *regp;
1587
1588	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
1589		*retval = *regp;
1590		return (0);
1591	} else
1592		return (EINVAL);
1593}
1594
1595static int
1596vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
1597{
1598	register_t *regp;
1599
1600	if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
1601		*regp = val;
1602		return (0);
1603	} else
1604		return (EINVAL);
1605}
1606
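/*
 * Return the VMCS read shadow identifier for %cr0 or %cr4, or -1 if the
 * register has no shadow.
 */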
1607static int
1608vmx_shadow_reg(int reg)
1609{
1610	int shreg;
1611
1612	shreg = -1;
1613
1614	switch (reg) {
1615	case VM_REG_GUEST_CR0:
1616		shreg = VMCS_CR0_SHADOW;
1617		break;
1618	case VM_REG_GUEST_CR4:
1619		shreg = VMCS_CR4_SHADOW;
1620		break;
1621	default:
1622		break;
1623	}
1624
1625	return (shreg);
1626}
1627
1628static int
1629vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
1630{
1631	int running, hostcpu;
1632	struct vmx *vmx = arg;
1633
1634	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
1635	if (running && hostcpu != curcpu)
1636		panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
1637
1638	if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
1639		return (0);
1640
1641	return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
1642}
1643
1644static int
1645vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
1646{
1647	int error, hostcpu, running, shadow;
1648	uint64_t ctls;
1649	struct vmx *vmx = arg;
1650
1651	running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
1652	if (running && hostcpu != curcpu)
1653		panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
1654
1655	if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
1656		return (0);
1657
1658	error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
1659
1660	if (error == 0) {
1661		/*
1662		 * If the "load EFER" VM-entry control is 1 then the
1663		 * value of EFER.LMA must be identical to the "IA-32e mode guest"
1664		 * bit in the VM-entry control.
1665		 */
1666		if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
1667		    (reg == VM_REG_GUEST_EFER)) {
1668			vmcs_getreg(&vmx->vmcs[vcpu], running,
1669				    VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
1670			if (val & EFER_LMA)
1671				ctls |= VM_ENTRY_GUEST_LMA;
1672			else
1673				ctls &= ~VM_ENTRY_GUEST_LMA;
1674			vmcs_setreg(&vmx->vmcs[vcpu], running,
1675				    VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
1676		}
1677
1678		shadow = vmx_shadow_reg(reg);
1679		if (shadow > 0) {
1680			/*
1681			 * Store the unmodified value in the shadow
1682			 */
1683			error = vmcs_setreg(&vmx->vmcs[vcpu], running,
1684				    VMCS_IDENT(shadow), val);
1685		}
1686	}
1687
1688	return (error);
1689}
1690
1691static int
1692vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
1693{
1694	struct vmx *vmx = arg;
1695
1696	return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc));
1697}
1698
1699static int
1700vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
1701{
1702	struct vmx *vmx = arg;
1703
1704	return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc));
1705}
1706
1707static int
1708vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code,
1709	   int code_valid)
1710{
1711	int error;
1712	uint64_t info;
1713	struct vmx *vmx = arg;
1714	struct vmcs *vmcs = &vmx->vmcs[vcpu];
1715
1716	static uint32_t type_map[VM_EVENT_MAX] = {
1717		0x1,		/* VM_EVENT_NONE */
1718		0x0,		/* VM_HW_INTR */
1719		0x2,		/* VM_NMI */
1720		0x3,		/* VM_HW_EXCEPTION */
1721		0x4,		/* VM_SW_INTR */
1722		0x5,		/* VM_PRIV_SW_EXCEPTION */
1723		0x6,		/* VM_SW_EXCEPTION */
1724	};
1725
1726	/*
1727	 * If there is already an exception pending to be delivered to the
1728	 * vcpu then just return.
1729	 */
1730	error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info);
1731	if (error)
1732		return (error);
1733
1734	if (info & VMCS_INTERRUPTION_INFO_VALID)
1735		return (EAGAIN);
1736
1737	info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0);
1738	info |= VMCS_INTERRUPTION_INFO_VALID;
1739	error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info);
1740	if (error != 0)
1741		return (error);
1742
1743	if (code_valid) {
1744		error = vmcs_setreg(vmcs, 0,
1745				    VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR),
1746				    code);
1747	}
1748	return (error);
1749}
1750
1751static int
1752vmx_getcap(void *arg, int vcpu, int type, int *retval)
1753{
1754	struct vmx *vmx = arg;
1755	int vcap;
1756	int ret;
1757
1758	ret = ENOENT;
1759
1760	vcap = vmx->cap[vcpu].set;
1761
1762	switch (type) {
1763	case VM_CAP_HALT_EXIT:
1764		if (cap_halt_exit)
1765			ret = 0;
1766		break;
1767	case VM_CAP_PAUSE_EXIT:
1768		if (cap_pause_exit)
1769			ret = 0;
1770		break;
1771	case VM_CAP_MTRAP_EXIT:
1772		if (cap_monitor_trap)
1773			ret = 0;
1774		break;
1775	case VM_CAP_UNRESTRICTED_GUEST:
1776		if (cap_unrestricted_guest)
1777			ret = 0;
1778		break;
1779	case VM_CAP_ENABLE_INVPCID:
1780		if (cap_invpcid)
1781			ret = 0;
1782		break;
1783	default:
1784		break;
1785	}
1786
1787	if (ret == 0)
1788		*retval = (vcap & (1 << type)) ? 1 : 0;
1789
1790	return (ret);
1791}
1792
1793static int
1794vmx_setcap(void *arg, int vcpu, int type, int val)
1795{
1796	struct vmx *vmx = arg;
1797	struct vmcs *vmcs = &vmx->vmcs[vcpu];
1798	uint32_t baseval;
1799	uint32_t *pptr;
1800	int error;
1801	int flag;
1802	int reg;
1803	int retval;
1804
1805	retval = ENOENT;
1806	pptr = NULL;
1807
1808	switch (type) {
1809	case VM_CAP_HALT_EXIT:
1810		if (cap_halt_exit) {
1811			retval = 0;
1812			pptr = &vmx->cap[vcpu].proc_ctls;
1813			baseval = *pptr;
1814			flag = PROCBASED_HLT_EXITING;
1815			reg = VMCS_PRI_PROC_BASED_CTLS;
1816		}
1817		break;
1818	case VM_CAP_MTRAP_EXIT:
1819		if (cap_monitor_trap) {
1820			retval = 0;
1821			pptr = &vmx->cap[vcpu].proc_ctls;
1822			baseval = *pptr;
1823			flag = PROCBASED_MTF;
1824			reg = VMCS_PRI_PROC_BASED_CTLS;
1825		}
1826		break;
1827	case VM_CAP_PAUSE_EXIT:
1828		if (cap_pause_exit) {
1829			retval = 0;
1830			pptr = &vmx->cap[vcpu].proc_ctls;
1831			baseval = *pptr;
1832			flag = PROCBASED_PAUSE_EXITING;
1833			reg = VMCS_PRI_PROC_BASED_CTLS;
1834		}
1835		break;
1836	case VM_CAP_UNRESTRICTED_GUEST:
1837		if (cap_unrestricted_guest) {
1838			retval = 0;
1839			pptr = &vmx->cap[vcpu].proc_ctls2;
1840			baseval = *pptr;
1841			flag = PROCBASED2_UNRESTRICTED_GUEST;
1842			reg = VMCS_SEC_PROC_BASED_CTLS;
1843		}
1844		break;
1845	case VM_CAP_ENABLE_INVPCID:
1846		if (cap_invpcid) {
1847			retval = 0;
1848			pptr = &vmx->cap[vcpu].proc_ctls2;
1849			baseval = *pptr;
1850			flag = PROCBASED2_ENABLE_INVPCID;
1851			reg = VMCS_SEC_PROC_BASED_CTLS;
1852		}
1853		break;
1854	default:
1855		break;
1856	}
1857
1858	if (retval == 0) {
1859		if (val) {
1860			baseval |= flag;
1861		} else {
1862			baseval &= ~flag;
1863		}
1864		VMPTRLD(vmcs);
1865		error = vmwrite(reg, baseval);
1866		VMCLEAR(vmcs);
1867
1868		if (error) {
1869			retval = error;
1870		} else {
1871			/*
1872			 * Update optional stored flags, and record
1873			 * setting
1874			 */
1875			if (pptr != NULL) {
1876				*pptr = baseval;
1877			}
1878
1879			if (val) {
1880				vmx->cap[vcpu].set |= (1 << type);
1881			} else {
1882				vmx->cap[vcpu].set &= ~(1 << type);
1883			}
1884		}
1885	}
1886
1887	return (retval);
1888}
1889
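/*
 * Table of VMX callbacks exported to the machine-independent vmm code
 * through 'struct vmm_ops'.
 */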
1890struct vmm_ops vmm_ops_intel = {
1891	vmx_init,
1892	vmx_cleanup,
1893	vmx_restore,
1894	vmx_vminit,
1895	vmx_run,
1896	vmx_vmcleanup,
1897	vmx_getreg,
1898	vmx_setreg,
1899	vmx_getdesc,
1900	vmx_setdesc,
1901	vmx_inject,
1902	vmx_getcap,
1903	vmx_setcap,
1904	ept_vmspace_alloc,
1905	ept_vmspace_free,
1906};
1907