vmx.c revision 261275
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 261275 2014-01-29 21:23:37Z jhb $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/intel/vmx.c 261275 2014-01-29 21:23:37Z jhb $"); 31 32#include <sys/param.h> 33#include <sys/systm.h> 34#include <sys/smp.h> 35#include <sys/kernel.h> 36#include <sys/malloc.h> 37#include <sys/pcpu.h> 38#include <sys/proc.h> 39#include <sys/sysctl.h> 40 41#include <vm/vm.h> 42#include <vm/pmap.h> 43 44#include <machine/psl.h> 45#include <machine/cpufunc.h> 46#include <machine/md_var.h> 47#include <machine/pmap.h> 48#include <machine/segments.h> 49#include <machine/specialreg.h> 50#include <machine/vmparam.h> 51 52#include <machine/vmm.h> 53#include "vmm_host.h" 54#include "vmm_lapic.h" 55#include "vmm_msr.h" 56#include "vmm_ktr.h" 57#include "vmm_stat.h" 58 59#include "vmx_msr.h" 60#include "ept.h" 61#include "vmx_cpufunc.h" 62#include "vmx.h" 63#include "x86.h" 64#include "vmx_controls.h" 65 66#define PINBASED_CTLS_ONE_SETTING \ 67 (PINBASED_EXTINT_EXITING | \ 68 PINBASED_NMI_EXITING | \ 69 PINBASED_VIRTUAL_NMI) 70#define PINBASED_CTLS_ZERO_SETTING 0 71 72#define PROCBASED_CTLS_WINDOW_SETTING \ 73 (PROCBASED_INT_WINDOW_EXITING | \ 74 PROCBASED_NMI_WINDOW_EXITING) 75 76#define PROCBASED_CTLS_ONE_SETTING \ 77 (PROCBASED_SECONDARY_CONTROLS | \ 78 PROCBASED_IO_EXITING | \ 79 PROCBASED_MSR_BITMAPS | \ 80 PROCBASED_CTLS_WINDOW_SETTING) 81#define PROCBASED_CTLS_ZERO_SETTING \ 82 (PROCBASED_CR3_LOAD_EXITING | \ 83 PROCBASED_CR3_STORE_EXITING | \ 84 PROCBASED_IO_BITMAPS) 85 86#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT 87#define PROCBASED_CTLS2_ZERO_SETTING 0 88 89#define VM_EXIT_CTLS_ONE_SETTING_NO_PAT \ 90 (VM_EXIT_HOST_LMA | \ 91 VM_EXIT_SAVE_EFER | \ 92 VM_EXIT_LOAD_EFER) 93 94#define VM_EXIT_CTLS_ONE_SETTING \ 95 (VM_EXIT_CTLS_ONE_SETTING_NO_PAT | \ 96 VM_EXIT_SAVE_PAT | \ 97 VM_EXIT_LOAD_PAT) 98#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS 99 100#define VM_ENTRY_CTLS_ONE_SETTING_NO_PAT VM_ENTRY_LOAD_EFER 101 102#define VM_ENTRY_CTLS_ONE_SETTING \ 103 (VM_ENTRY_CTLS_ONE_SETTING_NO_PAT | \ 104 VM_ENTRY_LOAD_PAT) 105#define VM_ENTRY_CTLS_ZERO_SETTING \ 106 (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ 107 VM_ENTRY_INTO_SMM | \ 108 VM_ENTRY_DEACTIVATE_DUAL_MONITOR) 109 110#define guest_msr_rw(vmx, msr) \ 111 msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW) 112 113#define HANDLED 1 114#define UNHANDLED 0 115 116MALLOC_DEFINE(M_VMX, "vmx", "vmx"); 117 118SYSCTL_DECL(_hw_vmm); 119SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); 120 121int vmxon_enabled[MAXCPU]; 122static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); 123 124static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; 125static uint32_t exit_ctls, entry_ctls; 126 127static uint64_t cr0_ones_mask, cr0_zeros_mask; 128SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, 129 &cr0_ones_mask, 0, NULL); 130SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, 131 &cr0_zeros_mask, 0, NULL); 132 133static uint64_t cr4_ones_mask, cr4_zeros_mask; 134SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, 135 &cr4_ones_mask, 0, NULL); 136SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, 137 &cr4_zeros_mask, 0, NULL); 138 139static int vmx_no_patmsr; 140 141static int vmx_initialized; 142SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, 143 &vmx_initialized, 0, "Intel VMX initialized"); 144 145/* 146 * Virtual NMI blocking conditions. 147 * 148 * Some processor implementations also require NMI to be blocked if 149 * the STI_BLOCKING bit is set. It is possible to detect this at runtime 150 * based on the (exit_reason,exit_qual) tuple being set to 151 * (EXIT_REASON_INVAL_VMCS, EXIT_QUAL_NMI_WHILE_STI_BLOCKING). 152 * 153 * We take the easy way out and also include STI_BLOCKING as one of the 154 * gating items for vNMI injection. 155 */ 156static uint64_t nmi_blocking_bits = VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING | 157 VMCS_INTERRUPTIBILITY_NMI_BLOCKING | 158 VMCS_INTERRUPTIBILITY_STI_BLOCKING; 159 160/* 161 * Optional capabilities 162 */ 163static int cap_halt_exit; 164static int cap_pause_exit; 165static int cap_unrestricted_guest; 166static int cap_monitor_trap; 167static int cap_invpcid; 168 169static struct unrhdr *vpid_unr; 170static u_int vpid_alloc_failed; 171SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, 172 &vpid_alloc_failed, 0, NULL); 173 174#ifdef KTR 175static const char * 176exit_reason_to_str(int reason) 177{ 178 static char reasonbuf[32]; 179 180 switch (reason) { 181 case EXIT_REASON_EXCEPTION: 182 return "exception"; 183 case EXIT_REASON_EXT_INTR: 184 return "extint"; 185 case EXIT_REASON_TRIPLE_FAULT: 186 return "triplefault"; 187 case EXIT_REASON_INIT: 188 return "init"; 189 case EXIT_REASON_SIPI: 190 return "sipi"; 191 case EXIT_REASON_IO_SMI: 192 return "iosmi"; 193 case EXIT_REASON_SMI: 194 return "smi"; 195 case EXIT_REASON_INTR_WINDOW: 196 return "intrwindow"; 197 case EXIT_REASON_NMI_WINDOW: 198 return "nmiwindow"; 199 case EXIT_REASON_TASK_SWITCH: 200 return "taskswitch"; 201 case EXIT_REASON_CPUID: 202 return "cpuid"; 203 case EXIT_REASON_GETSEC: 204 return "getsec"; 205 case EXIT_REASON_HLT: 206 return "hlt"; 207 case EXIT_REASON_INVD: 208 return "invd"; 209 case EXIT_REASON_INVLPG: 210 return "invlpg"; 211 case EXIT_REASON_RDPMC: 212 return "rdpmc"; 213 case EXIT_REASON_RDTSC: 214 return "rdtsc"; 215 case EXIT_REASON_RSM: 216 return "rsm"; 217 case EXIT_REASON_VMCALL: 218 return "vmcall"; 219 case EXIT_REASON_VMCLEAR: 220 return "vmclear"; 221 case EXIT_REASON_VMLAUNCH: 222 return "vmlaunch"; 223 case EXIT_REASON_VMPTRLD: 224 return "vmptrld"; 225 case EXIT_REASON_VMPTRST: 226 return "vmptrst"; 227 case EXIT_REASON_VMREAD: 228 return "vmread"; 229 case EXIT_REASON_VMRESUME: 230 return "vmresume"; 231 case EXIT_REASON_VMWRITE: 232 return "vmwrite"; 233 case EXIT_REASON_VMXOFF: 234 return "vmxoff"; 235 case EXIT_REASON_VMXON: 236 return "vmxon"; 237 case EXIT_REASON_CR_ACCESS: 238 return "craccess"; 239 case EXIT_REASON_DR_ACCESS: 240 return "draccess"; 241 case EXIT_REASON_INOUT: 242 return "inout"; 243 case EXIT_REASON_RDMSR: 244 return "rdmsr"; 245 case EXIT_REASON_WRMSR: 246 return "wrmsr"; 247 case EXIT_REASON_INVAL_VMCS: 248 return "invalvmcs"; 249 case EXIT_REASON_INVAL_MSR: 250 return "invalmsr"; 251 case EXIT_REASON_MWAIT: 252 return "mwait"; 253 case EXIT_REASON_MTF: 254 return "mtf"; 255 case EXIT_REASON_MONITOR: 256 return "monitor"; 257 case EXIT_REASON_PAUSE: 258 return "pause"; 259 case EXIT_REASON_MCE: 260 return "mce"; 261 case EXIT_REASON_TPR: 262 return "tpr"; 263 case EXIT_REASON_APIC: 264 return "apic"; 265 case EXIT_REASON_GDTR_IDTR: 266 return "gdtridtr"; 267 case EXIT_REASON_LDTR_TR: 268 return "ldtrtr"; 269 case EXIT_REASON_EPT_FAULT: 270 return "eptfault"; 271 case EXIT_REASON_EPT_MISCONFIG: 272 return "eptmisconfig"; 273 case EXIT_REASON_INVEPT: 274 return "invept"; 275 case EXIT_REASON_RDTSCP: 276 return "rdtscp"; 277 case EXIT_REASON_VMX_PREEMPT: 278 return "vmxpreempt"; 279 case EXIT_REASON_INVVPID: 280 return "invvpid"; 281 case EXIT_REASON_WBINVD: 282 return "wbinvd"; 283 case EXIT_REASON_XSETBV: 284 return "xsetbv"; 285 default: 286 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); 287 return (reasonbuf); 288 } 289} 290 291#ifdef SETJMP_TRACE 292static const char * 293vmx_setjmp_rc2str(int rc) 294{ 295 switch (rc) { 296 case VMX_RETURN_DIRECT: 297 return "direct"; 298 case VMX_RETURN_LONGJMP: 299 return "longjmp"; 300 case VMX_RETURN_VMRESUME: 301 return "vmresume"; 302 case VMX_RETURN_VMLAUNCH: 303 return "vmlaunch"; 304 case VMX_RETURN_AST: 305 return "ast"; 306 default: 307 return "unknown"; 308 } 309} 310 311#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ 312 VCPU_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ 313 (vmxctx)->regname) 314 315static void 316vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 317{ 318 uint64_t host_rip, host_rsp; 319 320 if (vmxctx != &vmx->ctx[vcpu]) 321 panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", 322 vmxctx, &vmx->ctx[vcpu]); 323 324 VCPU_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); 325 VCPU_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", 326 vmx_setjmp_rc2str(rc), rc); 327 328 host_rsp = host_rip = ~0; 329 vmread(VMCS_HOST_RIP, &host_rip); 330 vmread(VMCS_HOST_RSP, &host_rsp); 331 VCPU_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp %#lx", 332 host_rip, host_rsp); 333 334 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); 335 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); 336 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); 337 SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); 338 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); 339 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); 340 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); 341 SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); 342 343 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); 344 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); 345 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); 346 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); 347 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); 348 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); 349 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); 350 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); 351 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); 352 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); 353 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); 354 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); 355 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); 356 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); 357 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); 358 SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); 359} 360#endif 361#else 362static void __inline 363vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) 364{ 365 return; 366} 367#endif /* KTR */ 368 369u_long 370vmx_fix_cr0(u_long cr0) 371{ 372 373 return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); 374} 375 376u_long 377vmx_fix_cr4(u_long cr4) 378{ 379 380 return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); 381} 382 383static void 384vpid_free(int vpid) 385{ 386 if (vpid < 0 || vpid > 0xffff) 387 panic("vpid_free: invalid vpid %d", vpid); 388 389 /* 390 * VPIDs [0,VM_MAXCPU] are special and are not allocated from 391 * the unit number allocator. 392 */ 393 394 if (vpid > VM_MAXCPU) 395 free_unr(vpid_unr, vpid); 396} 397 398static void 399vpid_alloc(uint16_t *vpid, int num) 400{ 401 int i, x; 402 403 if (num <= 0 || num > VM_MAXCPU) 404 panic("invalid number of vpids requested: %d", num); 405 406 /* 407 * If the "enable vpid" execution control is not enabled then the 408 * VPID is required to be 0 for all vcpus. 409 */ 410 if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { 411 for (i = 0; i < num; i++) 412 vpid[i] = 0; 413 return; 414 } 415 416 /* 417 * Allocate a unique VPID for each vcpu from the unit number allocator. 418 */ 419 for (i = 0; i < num; i++) { 420 x = alloc_unr(vpid_unr); 421 if (x == -1) 422 break; 423 else 424 vpid[i] = x; 425 } 426 427 if (i < num) { 428 atomic_add_int(&vpid_alloc_failed, 1); 429 430 /* 431 * If the unit number allocator does not have enough unique 432 * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. 433 * 434 * These VPIDs are not be unique across VMs but this does not 435 * affect correctness because the combined mappings are also 436 * tagged with the EP4TA which is unique for each VM. 437 * 438 * It is still sub-optimal because the invvpid will invalidate 439 * combined mappings for a particular VPID across all EP4TAs. 440 */ 441 while (i-- > 0) 442 vpid_free(vpid[i]); 443 444 for (i = 0; i < num; i++) 445 vpid[i] = i + 1; 446 } 447} 448 449static void 450vpid_init(void) 451{ 452 /* 453 * VPID 0 is required when the "enable VPID" execution control is 454 * disabled. 455 * 456 * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the 457 * unit number allocator does not have sufficient unique VPIDs to 458 * satisfy the allocation. 459 * 460 * The remaining VPIDs are managed by the unit number allocator. 461 */ 462 vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); 463} 464 465static void 466msr_save_area_init(struct msr_entry *g_area, int *g_count) 467{ 468 int cnt; 469 470 static struct msr_entry guest_msrs[] = { 471 { MSR_KGSBASE, 0, 0 }, 472 }; 473 474 cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); 475 if (cnt > GUEST_MSR_MAX_ENTRIES) 476 panic("guest msr save area overrun"); 477 bcopy(guest_msrs, g_area, sizeof(guest_msrs)); 478 *g_count = cnt; 479} 480 481static void 482vmx_disable(void *arg __unused) 483{ 484 struct invvpid_desc invvpid_desc = { 0 }; 485 struct invept_desc invept_desc = { 0 }; 486 487 if (vmxon_enabled[curcpu]) { 488 /* 489 * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. 490 * 491 * VMXON or VMXOFF are not required to invalidate any TLB 492 * caching structures. This prevents potential retention of 493 * cached information in the TLB between distinct VMX episodes. 494 */ 495 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); 496 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); 497 vmxoff(); 498 } 499 load_cr4(rcr4() & ~CR4_VMXE); 500} 501 502static int 503vmx_cleanup(void) 504{ 505 506 if (vpid_unr != NULL) { 507 delete_unrhdr(vpid_unr); 508 vpid_unr = NULL; 509 } 510 511 smp_rendezvous(NULL, vmx_disable, NULL, NULL); 512 513 return (0); 514} 515 516static void 517vmx_enable(void *arg __unused) 518{ 519 int error; 520 521 load_cr4(rcr4() | CR4_VMXE); 522 523 *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); 524 error = vmxon(vmxon_region[curcpu]); 525 if (error == 0) 526 vmxon_enabled[curcpu] = 1; 527} 528 529static void 530vmx_restore(void) 531{ 532 533 if (vmxon_enabled[curcpu]) 534 vmxon(vmxon_region[curcpu]); 535} 536 537static int 538vmx_init(void) 539{ 540 int error; 541 uint64_t fixed0, fixed1, feature_control; 542 uint32_t tmp; 543 544 /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ 545 if (!(cpu_feature2 & CPUID2_VMX)) { 546 printf("vmx_init: processor does not support VMX operation\n"); 547 return (ENXIO); 548 } 549 550 /* 551 * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits 552 * are set (bits 0 and 2 respectively). 553 */ 554 feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); 555 if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || 556 (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { 557 printf("vmx_init: VMX operation disabled by BIOS\n"); 558 return (ENXIO); 559 } 560 561 /* Check support for primary processor-based VM-execution controls */ 562 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 563 MSR_VMX_TRUE_PROCBASED_CTLS, 564 PROCBASED_CTLS_ONE_SETTING, 565 PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); 566 if (error) { 567 printf("vmx_init: processor does not support desired primary " 568 "processor-based controls\n"); 569 return (error); 570 } 571 572 /* Clear the processor-based ctl bits that are set on demand */ 573 procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; 574 575 /* Check support for secondary processor-based VM-execution controls */ 576 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 577 MSR_VMX_PROCBASED_CTLS2, 578 PROCBASED_CTLS2_ONE_SETTING, 579 PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); 580 if (error) { 581 printf("vmx_init: processor does not support desired secondary " 582 "processor-based controls\n"); 583 return (error); 584 } 585 586 /* Check support for VPID */ 587 error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, 588 PROCBASED2_ENABLE_VPID, 0, &tmp); 589 if (error == 0) 590 procbased_ctls2 |= PROCBASED2_ENABLE_VPID; 591 592 /* Check support for pin-based VM-execution controls */ 593 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, 594 MSR_VMX_TRUE_PINBASED_CTLS, 595 PINBASED_CTLS_ONE_SETTING, 596 PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); 597 if (error) { 598 printf("vmx_init: processor does not support desired " 599 "pin-based controls\n"); 600 return (error); 601 } 602 603 /* Check support for VM-exit controls */ 604 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, 605 VM_EXIT_CTLS_ONE_SETTING, 606 VM_EXIT_CTLS_ZERO_SETTING, 607 &exit_ctls); 608 if (error) { 609 /* Try again without the PAT MSR bits */ 610 error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, 611 MSR_VMX_TRUE_EXIT_CTLS, 612 VM_EXIT_CTLS_ONE_SETTING_NO_PAT, 613 VM_EXIT_CTLS_ZERO_SETTING, 614 &exit_ctls); 615 if (error) { 616 printf("vmx_init: processor does not support desired " 617 "exit controls\n"); 618 return (error); 619 } else { 620 if (bootverbose) 621 printf("vmm: PAT MSR access not supported\n"); 622 guest_msr_valid(MSR_PAT); 623 vmx_no_patmsr = 1; 624 } 625 } 626 627 /* Check support for VM-entry controls */ 628 if (!vmx_no_patmsr) { 629 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 630 MSR_VMX_TRUE_ENTRY_CTLS, 631 VM_ENTRY_CTLS_ONE_SETTING, 632 VM_ENTRY_CTLS_ZERO_SETTING, 633 &entry_ctls); 634 } else { 635 error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, 636 MSR_VMX_TRUE_ENTRY_CTLS, 637 VM_ENTRY_CTLS_ONE_SETTING_NO_PAT, 638 VM_ENTRY_CTLS_ZERO_SETTING, 639 &entry_ctls); 640 } 641 642 if (error) { 643 printf("vmx_init: processor does not support desired " 644 "entry controls\n"); 645 return (error); 646 } 647 648 /* 649 * Check support for optional features by testing them 650 * as individual bits 651 */ 652 cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 653 MSR_VMX_TRUE_PROCBASED_CTLS, 654 PROCBASED_HLT_EXITING, 0, 655 &tmp) == 0); 656 657 cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 658 MSR_VMX_PROCBASED_CTLS, 659 PROCBASED_MTF, 0, 660 &tmp) == 0); 661 662 cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, 663 MSR_VMX_TRUE_PROCBASED_CTLS, 664 PROCBASED_PAUSE_EXITING, 0, 665 &tmp) == 0); 666 667 cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 668 MSR_VMX_PROCBASED_CTLS2, 669 PROCBASED2_UNRESTRICTED_GUEST, 0, 670 &tmp) == 0); 671 672 cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, 673 MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, 674 &tmp) == 0); 675 676 677 /* Initialize EPT */ 678 error = ept_init(); 679 if (error) { 680 printf("vmx_init: ept initialization failed (%d)\n", error); 681 return (error); 682 } 683 684 /* 685 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 686 */ 687 fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); 688 fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); 689 cr0_ones_mask = fixed0 & fixed1; 690 cr0_zeros_mask = ~fixed0 & ~fixed1; 691 692 /* 693 * CR0_PE and CR0_PG can be set to zero in VMX non-root operation 694 * if unrestricted guest execution is allowed. 695 */ 696 if (cap_unrestricted_guest) 697 cr0_ones_mask &= ~(CR0_PG | CR0_PE); 698 699 /* 700 * Do not allow the guest to set CR0_NW or CR0_CD. 701 */ 702 cr0_zeros_mask |= (CR0_NW | CR0_CD); 703 704 fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); 705 fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); 706 cr4_ones_mask = fixed0 & fixed1; 707 cr4_zeros_mask = ~fixed0 & ~fixed1; 708 709 vpid_init(); 710 711 /* enable VMX operation */ 712 smp_rendezvous(NULL, vmx_enable, NULL, NULL); 713 714 vmx_initialized = 1; 715 716 return (0); 717} 718 719static int 720vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) 721{ 722 int error, mask_ident, shadow_ident; 723 uint64_t mask_value; 724 725 if (which != 0 && which != 4) 726 panic("vmx_setup_cr_shadow: unknown cr%d", which); 727 728 if (which == 0) { 729 mask_ident = VMCS_CR0_MASK; 730 mask_value = cr0_ones_mask | cr0_zeros_mask; 731 shadow_ident = VMCS_CR0_SHADOW; 732 } else { 733 mask_ident = VMCS_CR4_MASK; 734 mask_value = cr4_ones_mask | cr4_zeros_mask; 735 shadow_ident = VMCS_CR4_SHADOW; 736 } 737 738 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); 739 if (error) 740 return (error); 741 742 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); 743 if (error) 744 return (error); 745 746 return (0); 747} 748#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) 749#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) 750 751static void * 752vmx_vminit(struct vm *vm, pmap_t pmap) 753{ 754 uint16_t vpid[VM_MAXCPU]; 755 int i, error, guest_msr_count; 756 struct vmx *vmx; 757 758 vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); 759 if ((uintptr_t)vmx & PAGE_MASK) { 760 panic("malloc of struct vmx not aligned on %d byte boundary", 761 PAGE_SIZE); 762 } 763 vmx->vm = vm; 764 765 vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); 766 767 /* 768 * Clean up EPTP-tagged guest physical and combined mappings 769 * 770 * VMX transitions are not required to invalidate any guest physical 771 * mappings. So, it may be possible for stale guest physical mappings 772 * to be present in the processor TLBs. 773 * 774 * Combined mappings for this EP4TA are also invalidated for all VPIDs. 775 */ 776 ept_invalidate_mappings(vmx->eptp); 777 778 msr_bitmap_initialize(vmx->msr_bitmap); 779 780 /* 781 * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. 782 * The guest FSBASE and GSBASE are saved and restored during 783 * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are 784 * always restored from the vmcs host state area on vm-exit. 785 * 786 * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in 787 * how they are saved/restored so can be directly accessed by the 788 * guest. 789 * 790 * Guest KGSBASE is saved and restored in the guest MSR save area. 791 * Host KGSBASE is restored before returning to userland from the pcb. 792 * There will be a window of time when we are executing in the host 793 * kernel context with a value of KGSBASE from the guest. This is ok 794 * because the value of KGSBASE is inconsequential in kernel context. 795 * 796 * MSR_EFER is saved and restored in the guest VMCS area on a 797 * VM exit and entry respectively. It is also restored from the 798 * host VMCS area on a VM exit. 799 */ 800 if (guest_msr_rw(vmx, MSR_GSBASE) || 801 guest_msr_rw(vmx, MSR_FSBASE) || 802 guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || 803 guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || 804 guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || 805 guest_msr_rw(vmx, MSR_KGSBASE) || 806 guest_msr_rw(vmx, MSR_EFER)) 807 panic("vmx_vminit: error setting guest msr access"); 808 809 /* 810 * MSR_PAT is saved and restored in the guest VMCS are on a VM exit 811 * and entry respectively. It is also restored from the host VMCS 812 * area on a VM exit. However, if running on a system with no 813 * MSR_PAT save/restore support, leave access disabled so accesses 814 * will be trapped. 815 */ 816 if (!vmx_no_patmsr && guest_msr_rw(vmx, MSR_PAT)) 817 panic("vmx_vminit: error setting guest pat msr access"); 818 819 vpid_alloc(vpid, VM_MAXCPU); 820 821 for (i = 0; i < VM_MAXCPU; i++) { 822 vmx->vmcs[i].identifier = vmx_revision(); 823 error = vmclear(&vmx->vmcs[i]); 824 if (error != 0) { 825 panic("vmx_vminit: vmclear error %d on vcpu %d\n", 826 error, i); 827 } 828 829 error = vmcs_set_defaults(&vmx->vmcs[i], 830 (u_long)vmx_longjmp, 831 (u_long)&vmx->ctx[i], 832 vmx->eptp, 833 pinbased_ctls, 834 procbased_ctls, 835 procbased_ctls2, 836 exit_ctls, entry_ctls, 837 vtophys(vmx->msr_bitmap), 838 vpid[i]); 839 840 if (error != 0) 841 panic("vmx_vminit: vmcs_set_defaults error %d", error); 842 843 vmx->cap[i].set = 0; 844 vmx->cap[i].proc_ctls = procbased_ctls; 845 vmx->cap[i].proc_ctls2 = procbased_ctls2; 846 847 vmx->state[i].lastcpu = -1; 848 vmx->state[i].vpid = vpid[i]; 849 850 msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); 851 852 error = vmcs_set_msr_save(&vmx->vmcs[i], 853 vtophys(vmx->guest_msrs[i]), 854 guest_msr_count); 855 if (error != 0) 856 panic("vmcs_set_msr_save error %d", error); 857 858 /* 859 * Set up the CR0/4 shadows, and init the read shadow 860 * to the power-on register value from the Intel Sys Arch. 861 * CR0 - 0x60000010 862 * CR4 - 0 863 */ 864 error = vmx_setup_cr0_shadow(&vmx->vmcs[i], 0x60000010); 865 if (error != 0) 866 panic("vmx_setup_cr0_shadow %d", error); 867 868 error = vmx_setup_cr4_shadow(&vmx->vmcs[i], 0); 869 if (error != 0) 870 panic("vmx_setup_cr4_shadow %d", error); 871 872 vmx->ctx[i].pmap = pmap; 873 vmx->ctx[i].eptp = vmx->eptp; 874 } 875 876 return (vmx); 877} 878 879static int 880vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) 881{ 882 int handled, func; 883 884 func = vmxctx->guest_rax; 885 886 handled = x86_emulate_cpuid(vm, vcpu, 887 (uint32_t*)(&vmxctx->guest_rax), 888 (uint32_t*)(&vmxctx->guest_rbx), 889 (uint32_t*)(&vmxctx->guest_rcx), 890 (uint32_t*)(&vmxctx->guest_rdx)); 891 return (handled); 892} 893 894static __inline void 895vmx_run_trace(struct vmx *vmx, int vcpu) 896{ 897#ifdef KTR 898 VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); 899#endif 900} 901 902static __inline void 903vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, 904 int handled) 905{ 906#ifdef KTR 907 VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", 908 handled ? "handled" : "unhandled", 909 exit_reason_to_str(exit_reason), rip); 910#endif 911} 912 913static __inline void 914vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) 915{ 916#ifdef KTR 917 VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); 918#endif 919} 920 921static int 922vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) 923{ 924 int error, lastcpu; 925 struct vmxstate *vmxstate; 926 struct invvpid_desc invvpid_desc = { 0 }; 927 928 vmxstate = &vmx->state[vcpu]; 929 lastcpu = vmxstate->lastcpu; 930 vmxstate->lastcpu = curcpu; 931 932 if (lastcpu == curcpu) { 933 error = 0; 934 goto done; 935 } 936 937 vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); 938 939 error = vmwrite(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); 940 if (error != 0) 941 goto done; 942 943 error = vmwrite(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); 944 if (error != 0) 945 goto done; 946 947 error = vmwrite(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); 948 if (error != 0) 949 goto done; 950 951 /* 952 * If we are using VPIDs then invalidate all mappings tagged with 'vpid' 953 * 954 * We do this because this vcpu was executing on a different host 955 * cpu when it last ran. We do not track whether it invalidated 956 * mappings associated with its 'vpid' during that run. So we must 957 * assume that the mappings associated with 'vpid' on 'curcpu' are 958 * stale and invalidate them. 959 * 960 * Note that we incur this penalty only when the scheduler chooses to 961 * move the thread associated with this vcpu between host cpus. 962 * 963 * Note also that this will invalidate mappings tagged with 'vpid' 964 * for "all" EP4TAs. 965 */ 966 if (vmxstate->vpid != 0) { 967 invvpid_desc.vpid = vmxstate->vpid; 968 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); 969 } 970done: 971 return (error); 972} 973 974static void 975vm_exit_update_rip(struct vm_exit *vmexit) 976{ 977 int error; 978 979 error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); 980 if (error) 981 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 982} 983 984/* 985 * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. 986 */ 987CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); 988 989static void __inline 990vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) 991{ 992 int error; 993 994 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; 995 996 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 997 if (error) 998 panic("vmx_set_int_window_exiting: vmwrite error %d", error); 999} 1000 1001static void __inline 1002vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) 1003{ 1004 int error; 1005 1006 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; 1007 1008 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1009 if (error) 1010 panic("vmx_clear_int_window_exiting: vmwrite error %d", error); 1011} 1012 1013static void __inline 1014vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) 1015{ 1016 int error; 1017 1018 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; 1019 1020 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1021 if (error) 1022 panic("vmx_set_nmi_window_exiting: vmwrite error %d", error); 1023} 1024 1025static void __inline 1026vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) 1027{ 1028 int error; 1029 1030 vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; 1031 1032 error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); 1033 if (error) 1034 panic("vmx_clear_nmi_window_exiting: vmwrite error %d", error); 1035} 1036 1037static int 1038vmx_inject_nmi(struct vmx *vmx, int vcpu) 1039{ 1040 int error; 1041 uint64_t info, interruptibility; 1042 1043 /* Bail out if no NMI requested */ 1044 if (!vm_nmi_pending(vmx->vm, vcpu)) 1045 return (0); 1046 1047 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 1048 if (error) { 1049 panic("vmx_inject_nmi: vmread(interruptibility) %d", 1050 error); 1051 } 1052 if (interruptibility & nmi_blocking_bits) 1053 goto nmiblocked; 1054 1055 /* 1056 * Inject the virtual NMI. The vector must be the NMI IDT entry 1057 * or the VMCS entry check will fail. 1058 */ 1059 info = VMCS_INTERRUPTION_INFO_NMI | VMCS_INTERRUPTION_INFO_VALID; 1060 info |= IDT_NMI; 1061 1062 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 1063 if (error) 1064 panic("vmx_inject_nmi: vmwrite(intrinfo) %d", error); 1065 1066 VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); 1067 1068 /* Clear the request */ 1069 vm_nmi_clear(vmx->vm, vcpu); 1070 return (1); 1071 1072nmiblocked: 1073 /* 1074 * Set the NMI Window Exiting execution control so we can inject 1075 * the virtual NMI as soon as blocking condition goes away. 1076 */ 1077 vmx_set_nmi_window_exiting(vmx, vcpu); 1078 1079 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); 1080 return (1); 1081} 1082 1083static void 1084vmx_inject_interrupts(struct vmx *vmx, int vcpu) 1085{ 1086 int error, vector; 1087 uint64_t info, rflags, interruptibility; 1088 1089 const int HWINTR_BLOCKED = VMCS_INTERRUPTIBILITY_STI_BLOCKING | 1090 VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING; 1091 1092 /* 1093 * If there is already an interrupt pending then just return. 1094 * 1095 * This could happen if an interrupt was injected on a prior 1096 * VM entry but the actual entry into guest mode was aborted 1097 * because of a pending AST. 1098 */ 1099 error = vmread(VMCS_ENTRY_INTR_INFO, &info); 1100 if (error) 1101 panic("vmx_inject_interrupts: vmread(intrinfo) %d", error); 1102 if (info & VMCS_INTERRUPTION_INFO_VALID) 1103 return; 1104 1105 /* 1106 * NMI injection has priority so deal with those first 1107 */ 1108 if (vmx_inject_nmi(vmx, vcpu)) 1109 return; 1110 1111 /* Ask the local apic for a vector to inject */ 1112 vector = lapic_pending_intr(vmx->vm, vcpu); 1113 if (vector < 0) 1114 return; 1115 1116 if (vector < 32 || vector > 255) 1117 panic("vmx_inject_interrupts: invalid vector %d\n", vector); 1118 1119 /* Check RFLAGS.IF and the interruptibility state of the guest */ 1120 error = vmread(VMCS_GUEST_RFLAGS, &rflags); 1121 if (error) 1122 panic("vmx_inject_interrupts: vmread(rflags) %d", error); 1123 1124 if ((rflags & PSL_I) == 0) 1125 goto cantinject; 1126 1127 error = vmread(VMCS_GUEST_INTERRUPTIBILITY, &interruptibility); 1128 if (error) { 1129 panic("vmx_inject_interrupts: vmread(interruptibility) %d", 1130 error); 1131 } 1132 if (interruptibility & HWINTR_BLOCKED) 1133 goto cantinject; 1134 1135 /* Inject the interrupt */ 1136 info = VMCS_INTERRUPTION_INFO_HW_INTR | VMCS_INTERRUPTION_INFO_VALID; 1137 info |= vector; 1138 error = vmwrite(VMCS_ENTRY_INTR_INFO, info); 1139 if (error) 1140 panic("vmx_inject_interrupts: vmwrite(intrinfo) %d", error); 1141 1142 /* Update the Local APIC ISR */ 1143 lapic_intr_accepted(vmx->vm, vcpu, vector); 1144 1145 VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); 1146 1147 return; 1148 1149cantinject: 1150 /* 1151 * Set the Interrupt Window Exiting execution control so we can inject 1152 * the interrupt as soon as blocking condition goes away. 1153 */ 1154 vmx_set_int_window_exiting(vmx, vcpu); 1155 1156 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); 1157} 1158 1159static int 1160vmx_emulate_cr_access(struct vmx *vmx, int vcpu, uint64_t exitqual) 1161{ 1162 int error, cr, vmcs_guest_cr, vmcs_shadow_cr; 1163 uint64_t crval, regval, ones_mask, zeros_mask; 1164 const struct vmxctx *vmxctx; 1165 1166 /* We only handle mov to %cr0 or %cr4 at this time */ 1167 if ((exitqual & 0xf0) != 0x00) 1168 return (UNHANDLED); 1169 1170 cr = exitqual & 0xf; 1171 if (cr != 0 && cr != 4) 1172 return (UNHANDLED); 1173 1174 vmxctx = &vmx->ctx[vcpu]; 1175 1176 /* 1177 * We must use vmwrite() directly here because vmcs_setreg() will 1178 * call vmclear(vmcs) as a side-effect which we certainly don't want. 1179 */ 1180 switch ((exitqual >> 8) & 0xf) { 1181 case 0: 1182 regval = vmxctx->guest_rax; 1183 break; 1184 case 1: 1185 regval = vmxctx->guest_rcx; 1186 break; 1187 case 2: 1188 regval = vmxctx->guest_rdx; 1189 break; 1190 case 3: 1191 regval = vmxctx->guest_rbx; 1192 break; 1193 case 4: 1194 error = vmread(VMCS_GUEST_RSP, ®val); 1195 if (error) { 1196 panic("vmx_emulate_cr_access: " 1197 "error %d reading guest rsp", error); 1198 } 1199 break; 1200 case 5: 1201 regval = vmxctx->guest_rbp; 1202 break; 1203 case 6: 1204 regval = vmxctx->guest_rsi; 1205 break; 1206 case 7: 1207 regval = vmxctx->guest_rdi; 1208 break; 1209 case 8: 1210 regval = vmxctx->guest_r8; 1211 break; 1212 case 9: 1213 regval = vmxctx->guest_r9; 1214 break; 1215 case 10: 1216 regval = vmxctx->guest_r10; 1217 break; 1218 case 11: 1219 regval = vmxctx->guest_r11; 1220 break; 1221 case 12: 1222 regval = vmxctx->guest_r12; 1223 break; 1224 case 13: 1225 regval = vmxctx->guest_r13; 1226 break; 1227 case 14: 1228 regval = vmxctx->guest_r14; 1229 break; 1230 case 15: 1231 regval = vmxctx->guest_r15; 1232 break; 1233 } 1234 1235 if (cr == 0) { 1236 ones_mask = cr0_ones_mask; 1237 zeros_mask = cr0_zeros_mask; 1238 vmcs_guest_cr = VMCS_GUEST_CR0; 1239 vmcs_shadow_cr = VMCS_CR0_SHADOW; 1240 } else { 1241 ones_mask = cr4_ones_mask; 1242 zeros_mask = cr4_zeros_mask; 1243 vmcs_guest_cr = VMCS_GUEST_CR4; 1244 vmcs_shadow_cr = VMCS_CR4_SHADOW; 1245 } 1246 1247 error = vmwrite(vmcs_shadow_cr, regval); 1248 if (error) { 1249 panic("vmx_emulate_cr_access: error %d writing cr%d shadow", 1250 error, cr); 1251 } 1252 1253 crval = regval | ones_mask; 1254 crval &= ~zeros_mask; 1255 error = vmwrite(vmcs_guest_cr, crval); 1256 if (error) { 1257 panic("vmx_emulate_cr_access: error %d writing cr%d", 1258 error, cr); 1259 } 1260 1261 if (cr == 0 && regval & CR0_PG) { 1262 uint64_t efer, entry_ctls; 1263 1264 /* 1265 * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and 1266 * the "IA-32e mode guest" bit in VM-entry control must be 1267 * equal. 1268 */ 1269 error = vmread(VMCS_GUEST_IA32_EFER, &efer); 1270 if (error) { 1271 panic("vmx_emulate_cr_access: error %d efer read", 1272 error); 1273 } 1274 if (efer & EFER_LME) { 1275 efer |= EFER_LMA; 1276 error = vmwrite(VMCS_GUEST_IA32_EFER, efer); 1277 if (error) { 1278 panic("vmx_emulate_cr_access: error %d" 1279 " efer write", error); 1280 } 1281 error = vmread(VMCS_ENTRY_CTLS, &entry_ctls); 1282 if (error) { 1283 panic("vmx_emulate_cr_access: error %d" 1284 " entry ctls read", error); 1285 } 1286 entry_ctls |= VM_ENTRY_GUEST_LMA; 1287 error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls); 1288 if (error) { 1289 panic("vmx_emulate_cr_access: error %d" 1290 " entry ctls write", error); 1291 } 1292 } 1293 } 1294 1295 return (HANDLED); 1296} 1297 1298static int 1299ept_fault_type(uint64_t ept_qual) 1300{ 1301 int fault_type; 1302 1303 if (ept_qual & EPT_VIOLATION_DATA_WRITE) 1304 fault_type = VM_PROT_WRITE; 1305 else if (ept_qual & EPT_VIOLATION_INST_FETCH) 1306 fault_type = VM_PROT_EXECUTE; 1307 else 1308 fault_type= VM_PROT_READ; 1309 1310 return (fault_type); 1311} 1312 1313static int 1314ept_protection(uint64_t ept_qual) 1315{ 1316 int prot = 0; 1317 1318 if (ept_qual & EPT_VIOLATION_GPA_READABLE) 1319 prot |= VM_PROT_READ; 1320 if (ept_qual & EPT_VIOLATION_GPA_WRITEABLE) 1321 prot |= VM_PROT_WRITE; 1322 if (ept_qual & EPT_VIOLATION_GPA_EXECUTABLE) 1323 prot |= VM_PROT_EXECUTE; 1324 1325 return (prot); 1326} 1327 1328static boolean_t 1329ept_emulation_fault(uint64_t ept_qual) 1330{ 1331 int read, write; 1332 1333 /* EPT fault on an instruction fetch doesn't make sense here */ 1334 if (ept_qual & EPT_VIOLATION_INST_FETCH) 1335 return (FALSE); 1336 1337 /* EPT fault must be a read fault or a write fault */ 1338 read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; 1339 write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; 1340 if ((read | write) == 0) 1341 return (FALSE); 1342 1343 /* 1344 * The EPT violation must have been caused by accessing a 1345 * guest-physical address that is a translation of a guest-linear 1346 * address. 1347 */ 1348 if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || 1349 (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { 1350 return (FALSE); 1351 } 1352 1353 return (TRUE); 1354} 1355 1356static int 1357vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) 1358{ 1359 int error, handled; 1360 struct vmcs *vmcs; 1361 struct vmxctx *vmxctx; 1362 uint32_t eax, ecx, edx, idtvec_info, idtvec_err, reason; 1363 uint64_t qual, gpa; 1364 1365 handled = 0; 1366 vmcs = &vmx->vmcs[vcpu]; 1367 vmxctx = &vmx->ctx[vcpu]; 1368 qual = vmexit->u.vmx.exit_qualification; 1369 reason = vmexit->u.vmx.exit_reason; 1370 vmexit->exitcode = VM_EXITCODE_BOGUS; 1371 1372 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); 1373 1374 /* 1375 * VM exits that could be triggered during event injection on the 1376 * previous VM entry need to be handled specially by re-injecting 1377 * the event. 1378 * 1379 * See "Information for VM Exits During Event Delivery" in Intel SDM 1380 * for details. 1381 */ 1382 switch (reason) { 1383 case EXIT_REASON_EPT_FAULT: 1384 case EXIT_REASON_EPT_MISCONFIG: 1385 case EXIT_REASON_APIC: 1386 case EXIT_REASON_TASK_SWITCH: 1387 case EXIT_REASON_EXCEPTION: 1388 idtvec_info = vmcs_idt_vectoring_info(); 1389 if (idtvec_info & VMCS_IDT_VEC_VALID) { 1390 idtvec_info &= ~(1 << 12); /* clear undefined bit */ 1391 vmwrite(VMCS_ENTRY_INTR_INFO, idtvec_info); 1392 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { 1393 idtvec_err = vmcs_idt_vectoring_err(); 1394 vmwrite(VMCS_ENTRY_EXCEPTION_ERROR, idtvec_err); 1395 } 1396 vmwrite(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); 1397 } 1398 default: 1399 break; 1400 } 1401 1402 switch (reason) { 1403 case EXIT_REASON_CR_ACCESS: 1404 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); 1405 handled = vmx_emulate_cr_access(vmx, vcpu, qual); 1406 break; 1407 case EXIT_REASON_RDMSR: 1408 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); 1409 ecx = vmxctx->guest_rcx; 1410 error = emulate_rdmsr(vmx->vm, vcpu, ecx); 1411 if (error) { 1412 vmexit->exitcode = VM_EXITCODE_RDMSR; 1413 vmexit->u.msr.code = ecx; 1414 } else 1415 handled = 1; 1416 break; 1417 case EXIT_REASON_WRMSR: 1418 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); 1419 eax = vmxctx->guest_rax; 1420 ecx = vmxctx->guest_rcx; 1421 edx = vmxctx->guest_rdx; 1422 error = emulate_wrmsr(vmx->vm, vcpu, ecx, 1423 (uint64_t)edx << 32 | eax); 1424 if (error) { 1425 vmexit->exitcode = VM_EXITCODE_WRMSR; 1426 vmexit->u.msr.code = ecx; 1427 vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; 1428 } else 1429 handled = 1; 1430 break; 1431 case EXIT_REASON_HLT: 1432 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); 1433 vmexit->exitcode = VM_EXITCODE_HLT; 1434 break; 1435 case EXIT_REASON_MTF: 1436 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); 1437 vmexit->exitcode = VM_EXITCODE_MTRAP; 1438 break; 1439 case EXIT_REASON_PAUSE: 1440 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); 1441 vmexit->exitcode = VM_EXITCODE_PAUSE; 1442 break; 1443 case EXIT_REASON_INTR_WINDOW: 1444 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); 1445 vmx_clear_int_window_exiting(vmx, vcpu); 1446 VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); 1447 return (1); 1448 case EXIT_REASON_EXT_INTR: 1449 /* 1450 * External interrupts serve only to cause VM exits and allow 1451 * the host interrupt handler to run. 1452 * 1453 * If this external interrupt triggers a virtual interrupt 1454 * to a VM, then that state will be recorded by the 1455 * host interrupt handler in the VM's softc. We will inject 1456 * this virtual interrupt during the subsequent VM enter. 1457 */ 1458 1459 /* 1460 * This is special. We want to treat this as an 'handled' 1461 * VM-exit but not increment the instruction pointer. 1462 */ 1463 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); 1464 return (1); 1465 case EXIT_REASON_NMI_WINDOW: 1466 /* Exit to allow the pending virtual NMI to be injected */ 1467 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); 1468 vmx_clear_nmi_window_exiting(vmx, vcpu); 1469 VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); 1470 return (1); 1471 case EXIT_REASON_INOUT: 1472 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); 1473 vmexit->exitcode = VM_EXITCODE_INOUT; 1474 vmexit->u.inout.bytes = (qual & 0x7) + 1; 1475 vmexit->u.inout.in = (qual & 0x8) ? 1 : 0; 1476 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; 1477 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; 1478 vmexit->u.inout.port = (uint16_t)(qual >> 16); 1479 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); 1480 break; 1481 case EXIT_REASON_CPUID: 1482 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); 1483 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); 1484 break; 1485 case EXIT_REASON_EPT_FAULT: 1486 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EPT_FAULT, 1); 1487 /* 1488 * If 'gpa' lies within the address space allocated to 1489 * memory then this must be a nested page fault otherwise 1490 * this must be an instruction that accesses MMIO space. 1491 */ 1492 gpa = vmcs_gpa(); 1493 if (vm_mem_allocated(vmx->vm, gpa)) { 1494 vmexit->exitcode = VM_EXITCODE_PAGING; 1495 vmexit->u.paging.gpa = gpa; 1496 vmexit->u.paging.fault_type = ept_fault_type(qual); 1497 vmexit->u.paging.protection = ept_protection(qual); 1498 } else if (ept_emulation_fault(qual)) { 1499 vmexit->exitcode = VM_EXITCODE_INST_EMUL; 1500 vmexit->u.inst_emul.gpa = gpa; 1501 vmexit->u.inst_emul.gla = vmcs_gla(); 1502 vmexit->u.inst_emul.cr3 = vmcs_guest_cr3(); 1503 } 1504 break; 1505 default: 1506 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); 1507 break; 1508 } 1509 1510 if (handled) { 1511 /* 1512 * It is possible that control is returned to userland 1513 * even though we were able to handle the VM exit in the 1514 * kernel. 1515 * 1516 * In such a case we want to make sure that the userland 1517 * restarts guest execution at the instruction *after* 1518 * the one we just processed. Therefore we update the 1519 * guest rip in the VMCS and in 'vmexit'. 1520 */ 1521 vm_exit_update_rip(vmexit); 1522 vmexit->rip += vmexit->inst_length; 1523 vmexit->inst_length = 0; 1524 } else { 1525 if (vmexit->exitcode == VM_EXITCODE_BOGUS) { 1526 /* 1527 * If this VM exit was not claimed by anybody then 1528 * treat it as a generic VMX exit. 1529 */ 1530 vmexit->exitcode = VM_EXITCODE_VMX; 1531 vmexit->u.vmx.error = 0; 1532 } else { 1533 /* 1534 * The exitcode and collateral have been populated. 1535 * The VM exit will be processed further in userland. 1536 */ 1537 } 1538 } 1539 return (handled); 1540} 1541 1542static int 1543vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap) 1544{ 1545 int error, vie, rc, handled, astpending; 1546 uint32_t exit_reason; 1547 struct vmx *vmx; 1548 struct vmxctx *vmxctx; 1549 struct vmcs *vmcs; 1550 struct vm_exit *vmexit; 1551 1552 vmx = arg; 1553 vmcs = &vmx->vmcs[vcpu]; 1554 vmxctx = &vmx->ctx[vcpu]; 1555 vmxctx->launched = 0; 1556 1557 astpending = 0; 1558 vmexit = vm_exitinfo(vmx->vm, vcpu); 1559 1560 KASSERT(vmxctx->pmap == pmap, 1561 ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); 1562 KASSERT(vmxctx->eptp == vmx->eptp, 1563 ("eptp %p different than ctx eptp %#lx", eptp, vmxctx->eptp)); 1564 1565 /* 1566 * XXX Can we avoid doing this every time we do a vm run? 1567 */ 1568 VMPTRLD(vmcs); 1569 1570 /* 1571 * XXX 1572 * We do this every time because we may setup the virtual machine 1573 * from a different process than the one that actually runs it. 1574 * 1575 * If the life of a virtual machine was spent entirely in the context 1576 * of a single process we could do this once in vmcs_set_defaults(). 1577 */ 1578 if ((error = vmwrite(VMCS_HOST_CR3, rcr3())) != 0) 1579 panic("vmx_run: error %d writing to VMCS_HOST_CR3", error); 1580 1581 if ((error = vmwrite(VMCS_GUEST_RIP, rip)) != 0) 1582 panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); 1583 1584 if ((error = vmx_set_pcpu_defaults(vmx, vcpu)) != 0) 1585 panic("vmx_run: error %d setting up pcpu defaults", error); 1586 1587 do { 1588 lapic_timer_tick(vmx->vm, vcpu); 1589 vmx_inject_interrupts(vmx, vcpu); 1590 vmx_run_trace(vmx, vcpu); 1591 rc = vmx_setjmp(vmxctx); 1592#ifdef SETJMP_TRACE 1593 vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); 1594#endif 1595 switch (rc) { 1596 case VMX_RETURN_DIRECT: 1597 if (vmxctx->launched == 0) { 1598 vmxctx->launched = 1; 1599 vmx_launch(vmxctx); 1600 } else 1601 vmx_resume(vmxctx); 1602 panic("vmx_launch/resume should not return"); 1603 break; 1604 case VMX_RETURN_LONGJMP: 1605 break; /* vm exit */ 1606 case VMX_RETURN_AST: 1607 astpending = 1; 1608 break; 1609 case VMX_RETURN_VMRESUME: 1610 vie = vmcs_instruction_error(); 1611 if (vmxctx->launch_error == VM_FAIL_INVALID || 1612 vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { 1613 printf("vmresume error %d vmcs inst error %d\n", 1614 vmxctx->launch_error, vie); 1615 goto err_exit; 1616 } 1617 vmx_launch(vmxctx); /* try to launch the guest */ 1618 panic("vmx_launch should not return"); 1619 break; 1620 case VMX_RETURN_VMLAUNCH: 1621 vie = vmcs_instruction_error(); 1622#if 1 1623 printf("vmlaunch error %d vmcs inst error %d\n", 1624 vmxctx->launch_error, vie); 1625#endif 1626 goto err_exit; 1627 case VMX_RETURN_INVEPT: 1628 panic("vm %s:%d invept error %d", 1629 vm_name(vmx->vm), vcpu, vmxctx->launch_error); 1630 default: 1631 panic("vmx_setjmp returned %d", rc); 1632 } 1633 1634 /* enable interrupts */ 1635 enable_intr(); 1636 1637 /* collect some basic information for VM exit processing */ 1638 vmexit->rip = rip = vmcs_guest_rip(); 1639 vmexit->inst_length = vmexit_instruction_length(); 1640 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); 1641 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); 1642 1643 if (astpending) { 1644 handled = 1; 1645 vmexit->inst_length = 0; 1646 vmexit->exitcode = VM_EXITCODE_BOGUS; 1647 vmx_astpending_trace(vmx, vcpu, rip); 1648 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); 1649 break; 1650 } 1651 1652 handled = vmx_exit_process(vmx, vcpu, vmexit); 1653 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); 1654 1655 } while (handled); 1656 1657 /* 1658 * If a VM exit has been handled then the exitcode must be BOGUS 1659 * If a VM exit is not handled then the exitcode must not be BOGUS 1660 */ 1661 if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || 1662 (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { 1663 panic("Mismatch between handled (%d) and exitcode (%d)", 1664 handled, vmexit->exitcode); 1665 } 1666 1667 if (!handled) 1668 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); 1669 1670 VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",vmexit->exitcode); 1671 1672 /* 1673 * XXX 1674 * We need to do this to ensure that any VMCS state cached by the 1675 * processor is flushed to memory. We need to do this in case the 1676 * VM moves to a different cpu the next time it runs. 1677 * 1678 * Can we avoid doing this? 1679 */ 1680 VMCLEAR(vmcs); 1681 return (0); 1682 1683err_exit: 1684 vmexit->exitcode = VM_EXITCODE_VMX; 1685 vmexit->u.vmx.exit_reason = (uint32_t)-1; 1686 vmexit->u.vmx.exit_qualification = (uint32_t)-1; 1687 vmexit->u.vmx.error = vie; 1688 VMCLEAR(vmcs); 1689 return (ENOEXEC); 1690} 1691 1692static void 1693vmx_vmcleanup(void *arg) 1694{ 1695 int i, error; 1696 struct vmx *vmx = arg; 1697 1698 for (i = 0; i < VM_MAXCPU; i++) 1699 vpid_free(vmx->state[i].vpid); 1700 1701 /* 1702 * XXXSMP we also need to clear the VMCS active on the other vcpus. 1703 */ 1704 error = vmclear(&vmx->vmcs[0]); 1705 if (error != 0) 1706 panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); 1707 1708 free(vmx, M_VMX); 1709 1710 return; 1711} 1712 1713static register_t * 1714vmxctx_regptr(struct vmxctx *vmxctx, int reg) 1715{ 1716 1717 switch (reg) { 1718 case VM_REG_GUEST_RAX: 1719 return (&vmxctx->guest_rax); 1720 case VM_REG_GUEST_RBX: 1721 return (&vmxctx->guest_rbx); 1722 case VM_REG_GUEST_RCX: 1723 return (&vmxctx->guest_rcx); 1724 case VM_REG_GUEST_RDX: 1725 return (&vmxctx->guest_rdx); 1726 case VM_REG_GUEST_RSI: 1727 return (&vmxctx->guest_rsi); 1728 case VM_REG_GUEST_RDI: 1729 return (&vmxctx->guest_rdi); 1730 case VM_REG_GUEST_RBP: 1731 return (&vmxctx->guest_rbp); 1732 case VM_REG_GUEST_R8: 1733 return (&vmxctx->guest_r8); 1734 case VM_REG_GUEST_R9: 1735 return (&vmxctx->guest_r9); 1736 case VM_REG_GUEST_R10: 1737 return (&vmxctx->guest_r10); 1738 case VM_REG_GUEST_R11: 1739 return (&vmxctx->guest_r11); 1740 case VM_REG_GUEST_R12: 1741 return (&vmxctx->guest_r12); 1742 case VM_REG_GUEST_R13: 1743 return (&vmxctx->guest_r13); 1744 case VM_REG_GUEST_R14: 1745 return (&vmxctx->guest_r14); 1746 case VM_REG_GUEST_R15: 1747 return (&vmxctx->guest_r15); 1748 default: 1749 break; 1750 } 1751 return (NULL); 1752} 1753 1754static int 1755vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) 1756{ 1757 register_t *regp; 1758 1759 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1760 *retval = *regp; 1761 return (0); 1762 } else 1763 return (EINVAL); 1764} 1765 1766static int 1767vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) 1768{ 1769 register_t *regp; 1770 1771 if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { 1772 *regp = val; 1773 return (0); 1774 } else 1775 return (EINVAL); 1776} 1777 1778static int 1779vmx_shadow_reg(int reg) 1780{ 1781 int shreg; 1782 1783 shreg = -1; 1784 1785 switch (reg) { 1786 case VM_REG_GUEST_CR0: 1787 shreg = VMCS_CR0_SHADOW; 1788 break; 1789 case VM_REG_GUEST_CR4: 1790 shreg = VMCS_CR4_SHADOW; 1791 break; 1792 default: 1793 break; 1794 } 1795 1796 return (shreg); 1797} 1798 1799static int 1800vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) 1801{ 1802 int running, hostcpu; 1803 struct vmx *vmx = arg; 1804 1805 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 1806 if (running && hostcpu != curcpu) 1807 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); 1808 1809 if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) 1810 return (0); 1811 1812 return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); 1813} 1814 1815static int 1816vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) 1817{ 1818 int error, hostcpu, running, shadow; 1819 uint64_t ctls; 1820 struct vmx *vmx = arg; 1821 1822 running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); 1823 if (running && hostcpu != curcpu) 1824 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); 1825 1826 if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) 1827 return (0); 1828 1829 error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); 1830 1831 if (error == 0) { 1832 /* 1833 * If the "load EFER" VM-entry control is 1 then the 1834 * value of EFER.LMA must be identical to "IA-32e mode guest" 1835 * bit in the VM-entry control. 1836 */ 1837 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && 1838 (reg == VM_REG_GUEST_EFER)) { 1839 vmcs_getreg(&vmx->vmcs[vcpu], running, 1840 VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); 1841 if (val & EFER_LMA) 1842 ctls |= VM_ENTRY_GUEST_LMA; 1843 else 1844 ctls &= ~VM_ENTRY_GUEST_LMA; 1845 vmcs_setreg(&vmx->vmcs[vcpu], running, 1846 VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); 1847 } 1848 1849 shadow = vmx_shadow_reg(reg); 1850 if (shadow > 0) { 1851 /* 1852 * Store the unmodified value in the shadow 1853 */ 1854 error = vmcs_setreg(&vmx->vmcs[vcpu], running, 1855 VMCS_IDENT(shadow), val); 1856 } 1857 } 1858 1859 return (error); 1860} 1861 1862static int 1863vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1864{ 1865 struct vmx *vmx = arg; 1866 1867 return (vmcs_getdesc(&vmx->vmcs[vcpu], reg, desc)); 1868} 1869 1870static int 1871vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) 1872{ 1873 struct vmx *vmx = arg; 1874 1875 return (vmcs_setdesc(&vmx->vmcs[vcpu], reg, desc)); 1876} 1877 1878static int 1879vmx_inject(void *arg, int vcpu, int type, int vector, uint32_t code, 1880 int code_valid) 1881{ 1882 int error; 1883 uint64_t info; 1884 struct vmx *vmx = arg; 1885 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1886 1887 static uint32_t type_map[VM_EVENT_MAX] = { 1888 0x1, /* VM_EVENT_NONE */ 1889 0x0, /* VM_HW_INTR */ 1890 0x2, /* VM_NMI */ 1891 0x3, /* VM_HW_EXCEPTION */ 1892 0x4, /* VM_SW_INTR */ 1893 0x5, /* VM_PRIV_SW_EXCEPTION */ 1894 0x6, /* VM_SW_EXCEPTION */ 1895 }; 1896 1897 /* 1898 * If there is already an exception pending to be delivered to the 1899 * vcpu then just return. 1900 */ 1901 error = vmcs_getreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), &info); 1902 if (error) 1903 return (error); 1904 1905 if (info & VMCS_INTERRUPTION_INFO_VALID) 1906 return (EAGAIN); 1907 1908 info = vector | (type_map[type] << 8) | (code_valid ? 1 << 11 : 0); 1909 info |= VMCS_INTERRUPTION_INFO_VALID; 1910 error = vmcs_setreg(vmcs, 0, VMCS_IDENT(VMCS_ENTRY_INTR_INFO), info); 1911 if (error != 0) 1912 return (error); 1913 1914 if (code_valid) { 1915 error = vmcs_setreg(vmcs, 0, 1916 VMCS_IDENT(VMCS_ENTRY_EXCEPTION_ERROR), 1917 code); 1918 } 1919 return (error); 1920} 1921 1922static int 1923vmx_getcap(void *arg, int vcpu, int type, int *retval) 1924{ 1925 struct vmx *vmx = arg; 1926 int vcap; 1927 int ret; 1928 1929 ret = ENOENT; 1930 1931 vcap = vmx->cap[vcpu].set; 1932 1933 switch (type) { 1934 case VM_CAP_HALT_EXIT: 1935 if (cap_halt_exit) 1936 ret = 0; 1937 break; 1938 case VM_CAP_PAUSE_EXIT: 1939 if (cap_pause_exit) 1940 ret = 0; 1941 break; 1942 case VM_CAP_MTRAP_EXIT: 1943 if (cap_monitor_trap) 1944 ret = 0; 1945 break; 1946 case VM_CAP_UNRESTRICTED_GUEST: 1947 if (cap_unrestricted_guest) 1948 ret = 0; 1949 break; 1950 case VM_CAP_ENABLE_INVPCID: 1951 if (cap_invpcid) 1952 ret = 0; 1953 break; 1954 default: 1955 break; 1956 } 1957 1958 if (ret == 0) 1959 *retval = (vcap & (1 << type)) ? 1 : 0; 1960 1961 return (ret); 1962} 1963 1964static int 1965vmx_setcap(void *arg, int vcpu, int type, int val) 1966{ 1967 struct vmx *vmx = arg; 1968 struct vmcs *vmcs = &vmx->vmcs[vcpu]; 1969 uint32_t baseval; 1970 uint32_t *pptr; 1971 int error; 1972 int flag; 1973 int reg; 1974 int retval; 1975 1976 retval = ENOENT; 1977 pptr = NULL; 1978 1979 switch (type) { 1980 case VM_CAP_HALT_EXIT: 1981 if (cap_halt_exit) { 1982 retval = 0; 1983 pptr = &vmx->cap[vcpu].proc_ctls; 1984 baseval = *pptr; 1985 flag = PROCBASED_HLT_EXITING; 1986 reg = VMCS_PRI_PROC_BASED_CTLS; 1987 } 1988 break; 1989 case VM_CAP_MTRAP_EXIT: 1990 if (cap_monitor_trap) { 1991 retval = 0; 1992 pptr = &vmx->cap[vcpu].proc_ctls; 1993 baseval = *pptr; 1994 flag = PROCBASED_MTF; 1995 reg = VMCS_PRI_PROC_BASED_CTLS; 1996 } 1997 break; 1998 case VM_CAP_PAUSE_EXIT: 1999 if (cap_pause_exit) { 2000 retval = 0; 2001 pptr = &vmx->cap[vcpu].proc_ctls; 2002 baseval = *pptr; 2003 flag = PROCBASED_PAUSE_EXITING; 2004 reg = VMCS_PRI_PROC_BASED_CTLS; 2005 } 2006 break; 2007 case VM_CAP_UNRESTRICTED_GUEST: 2008 if (cap_unrestricted_guest) { 2009 retval = 0; 2010 pptr = &vmx->cap[vcpu].proc_ctls2; 2011 baseval = *pptr; 2012 flag = PROCBASED2_UNRESTRICTED_GUEST; 2013 reg = VMCS_SEC_PROC_BASED_CTLS; 2014 } 2015 break; 2016 case VM_CAP_ENABLE_INVPCID: 2017 if (cap_invpcid) { 2018 retval = 0; 2019 pptr = &vmx->cap[vcpu].proc_ctls2; 2020 baseval = *pptr; 2021 flag = PROCBASED2_ENABLE_INVPCID; 2022 reg = VMCS_SEC_PROC_BASED_CTLS; 2023 } 2024 break; 2025 default: 2026 break; 2027 } 2028 2029 if (retval == 0) { 2030 if (val) { 2031 baseval |= flag; 2032 } else { 2033 baseval &= ~flag; 2034 } 2035 VMPTRLD(vmcs); 2036 error = vmwrite(reg, baseval); 2037 VMCLEAR(vmcs); 2038 2039 if (error) { 2040 retval = error; 2041 } else { 2042 /* 2043 * Update optional stored flags, and record 2044 * setting 2045 */ 2046 if (pptr != NULL) { 2047 *pptr = baseval; 2048 } 2049 2050 if (val) { 2051 vmx->cap[vcpu].set |= (1 << type); 2052 } else { 2053 vmx->cap[vcpu].set &= ~(1 << type); 2054 } 2055 } 2056 } 2057 2058 return (retval); 2059} 2060 2061struct vmm_ops vmm_ops_intel = { 2062 vmx_init, 2063 vmx_cleanup, 2064 vmx_restore, 2065 vmx_vminit, 2066 vmx_run, 2067 vmx_vmcleanup, 2068 vmx_getreg, 2069 vmx_setreg, 2070 vmx_getdesc, 2071 vmx_setdesc, 2072 vmx_inject, 2073 vmx_getcap, 2074 vmx_setcap, 2075 ept_vmspace_alloc, 2076 ept_vmspace_free, 2077}; 2078