vmm.c revision 284900
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 284900 2015-06-28 03:22:26Z neel $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 284900 2015-06-28 03:22:26Z neel $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/cpu.h>
#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/psl.h>
#include <x86/apicreg.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_instruction_emul.h>

#include "vmm_ioport.h"
#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vatpic.h"
#include "vatpit.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vpmtmr.h"
#include "vrtc.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

/*
 * Initialization:
 * (a) allocated when vcpu is created
 * (i) initialized when vcpu is created and when it is reinitialized
 * (o) initialized the first time the vcpu is created
 * (x) initialized before use
 */
struct vcpu {
	struct mtx	mtx;		/* (o) protects 'state' and 'hostcpu' */
	enum vcpu_state	state;		/* (o) vcpu state */
	int		hostcpu;	/* (o) vcpu's host cpu */
	int		reqidle;	/* (i) request vcpu to idle */
	struct vlapic	*vlapic;	/* (i) APIC device model */
	enum x2apic_state x2apic_state;	/* (i) APIC mode */
	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
	int		nmi_pending;	/* (i) NMI pending */
	int		extint_pending;	/* (i) INTR pending */
	int		exception_pending; /* (i) exception pending */
	int		exc_vector;	/* (x) exception collateral */
	int		exc_errcode_valid;
	uint32_t	exc_errcode;
	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
	void		*stats;		/* (a,i) statistics */
	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
	uint64_t	nextrip;	/* (x) next instruction to execute */
};
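/*
 * For example, 'guestfpu' is tagged (a,i): the FPU save area is allocated
 * once in vcpu_init() when the vcpu is first created and reset again by
 * fpu_save_area_reset() on every reinitialization.  'exitinfo' is tagged
 * (x) because it is filled in on every VM exit before it is read.
 */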
#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

/*
 * Initialization:
 * (o) initialized the first time the VM is created
 * (i) initialized when VM is created and when it is reinitialized
 * (x) initialized before use
 */
struct vm {
	void		*cookie;		/* (i) cpu-specific data */
	void		*iommu;			/* (x) iommu-specific data */
	struct vhpet	*vhpet;			/* (i) virtual HPET */
	struct vioapic	*vioapic;		/* (i) virtual ioapic */
	struct vatpic	*vatpic;		/* (i) virtual atpic */
	struct vatpit	*vatpit;		/* (i) virtual atpit */
	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
	struct vrtc	*vrtc;			/* (o) virtual RTC */
	volatile cpuset_t active_cpus;		/* (i) active vcpus */
	int		suspend;		/* (i) stop VM execution */
	volatile cpuset_t suspended_cpus;	/* (i) suspended vcpus */
	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
	vm_rendezvous_func_t rendezvous_func;
	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
	int		num_mem_segs;		/* (o) guest memory segments */
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	struct vmspace	*vmspace;		/* (o) guest's address space */
	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, evinfo) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc) \
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval) \
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val) \
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu) \
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic) \
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
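/*
 * The macros above dispatch through the hypervisor-specific 'ops' table
 * that vmm_init() points at either vmm_ops_intel (VT-x) or vmm_ops_amd
 * (SVM).  For example, on an Intel host VMRUN(vm->cookie, vcpuid, rip,
 * pmap, &evinfo) expands to (*vmm_ops_intel.vmrun)(...).  If module
 * initialization failed, 'ops' stays NULL and the macros fail safely
 * with ENXIO, 0 or NULL.
 */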
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/*
 * Halt the guest if all vcpus are executing a HLT instruction with
 * interrupts disabled.
 */
static int halt_detection_enabled = 1;
TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
    &halt_detection_enabled, 0,
    "Halt VM if all vcpus execute HLT with interrupts disabled");

static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

static int trace_guest_exceptions;
SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
    &trace_guest_exceptions, 0,
    "Trap into hypervisor on all guest exceptions and reflect them back");

static int vmm_force_iommu = 0;
TUNABLE_INT("hw.vmm.force_iommu", &vmm_force_iommu);
SYSCTL_INT(_hw_vmm, OID_AUTO, force_iommu, CTLFLAG_RDTUN, &vmm_force_iommu, 0,
    "Force use of I/O MMU even if no passthrough devices were found.");

static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);

#ifdef KTR
static const char *
vcpu_state2str(enum vcpu_state state)
{

	switch (state) {
	case VCPU_IDLE:
		return ("idle");
	case VCPU_FROZEN:
		return ("frozen");
	case VCPU_RUNNING:
		return ("running");
	case VCPU_SLEEPING:
		return ("sleeping");
	default:
		return ("unknown");
	}
}
#endif

static void
vcpu_cleanup(struct vm *vm, int i, bool destroy)
{
	struct vcpu *vcpu = &vm->vcpu[i];

	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
	if (destroy) {
		vmm_stat_free(vcpu->stats);
		fpu_save_area_free(vcpu->guestfpu);
	}
}

static void
vcpu_init(struct vm *vm, int vcpu_id, bool create)
{
	struct vcpu *vcpu;

	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
	    ("vcpu_init: invalid vcpu %d", vcpu_id));

	vcpu = &vm->vcpu[vcpu_id];

	if (create) {
		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
		    "initialized", vcpu_id));
		vcpu_lock_init(vcpu);
		vcpu->state = VCPU_IDLE;
		vcpu->hostcpu = NOCPU;
		vcpu->guestfpu = fpu_save_area_alloc();
		vcpu->stats = vmm_stat_alloc();
	}

	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
	vcpu->reqidle = 0;
	vcpu->exitintinfo = 0;
	vcpu->nmi_pending = 0;
	vcpu->extint_pending = 0;
	vcpu->exception_pending = 0;
	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
	fpu_save_area_reset(vcpu->guestfpu);
	vmm_stat_init(vcpu->stats);
}

int
vcpu_trace_exceptions(struct vm *vm, int vcpuid)
{

	return (trace_guest_exceptions);
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);
	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

static void
vmm_resume(void)
{
	VMM_RESUME();
}

static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();

	vmm_ipinum = vmm_ipi_alloc();
	if (vmm_ipinum == 0)
		vmm_ipinum = IPI_AST;

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_resume_p = vmm_resume;

	return (VMM_INIT(vmm_ipinum));
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		if (vmm_force_iommu || ppt_avail_devices() > 0)
			iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			if (vmm_ipinum != IPI_AST)
				vmm_ipi_free(vmm_ipinum);
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

static void
vm_init(struct vm *vm, bool create)
{
	int i;

	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
	vm->iommu = NULL;
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);
	vm->vatpic = vatpic_init(vm);
	vm->vatpit = vatpit_init(vm);
	vm->vpmtmr = vpmtmr_init(vm);
	if (create)
		vm->vrtc = vrtc_init(vm);

	CPU_ZERO(&vm->active_cpus);

	vm->suspend = 0;
	CPU_ZERO(&vm->suspended_cpus);

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_init(vm, i, create);
}
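/*
 * VM lifecycle, in brief: vm_create() allocates the vmspace and the
 * 'struct vm' and calls vm_init(vm, true); vm_reinit() reuses the
 * allocations via vm_cleanup(vm, false) followed by vm_init(vm, false);
 * vm_destroy() tears everything down with vm_cleanup(vm, true).  The
 * create/destroy flags are what distinguish one-time (o) state from
 * per-boot (i) state.
 */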
int
vm_create(const char *name, struct vm **retvm)
{
	struct vm *vm;
	struct vmspace *vmspace;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->num_mem_segs = 0;
	vm->vmspace = vmspace;
	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);

	vm_init(vm, true);

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

static void
vm_cleanup(struct vm *vm, bool destroy)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	if (destroy)
		vrtc_cleanup(vm->vrtc);
	else
		vrtc_reset(vm->vrtc);
	vpmtmr_cleanup(vm->vpmtmr);
	vatpit_cleanup(vm->vatpit);
	vhpet_cleanup(vm->vhpet);
	vatpic_cleanup(vm->vatpic);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(vm, i, destroy);

	VMCLEANUP(vm->cookie);

	if (destroy) {
		for (i = 0; i < vm->num_mem_segs; i++)
			vm_free_mem_seg(vm, &vm->mem_segs[i]);

		vm->num_mem_segs = 0;

		VMSPACE_FREE(vm->vmspace);
		vm->vmspace = NULL;
	}
}

void
vm_destroy(struct vm *vm)
{
	vm_cleanup(vm, true);
	free(vm, M_VM);
}

int
vm_reinit(struct vm *vm)
{
	int error;

	/*
	 * A virtual machine can be reset only if all vcpus are suspended.
	 */
	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
		vm_cleanup(vm, false);
		vm_init(vm, false);
		error = 0;
	} else {
		error = EBUSY;
	}

	return (error);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}
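/*
 * vm_malloc() below populates one of the (at most VM_MAX_MEMORY_SEGMENTS)
 * guest memory segments.  The allocation is all-or-nothing for the given
 * range: a request that partially overlaps an existing segment fails with
 * EINVAL, while a request for an already fully allocated range is a no-op.
 * A typical guest is backed by two segments, e.g. one below and one above
 * the legacy PCI hole at 4GB.
 */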
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

static vm_paddr_t
vm_maxmem(struct vm *vm)
{
	int i;
	vm_paddr_t gpa, maxmem;

	maxmem = 0;
	for (i = 0; i < vm->num_mem_segs; i++) {
		gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
		if (gpa > maxmem)
			maxmem = gpa;
	}
	return (maxmem);
}

static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
		    seg->gpa, seg->gpa + seg->len,
		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}
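/*
 * vm_iommu_modify() walks every page of every wired memory segment and,
 * when mapping, installs the page's gpa->hpa translation in the VM's
 * iommu domain while removing the identity (hpa->hpa) mapping from the
 * host domain; unmapping reverses both operations.  This is what lets a
 * passthrough device DMA directly into guest memory using guest physical
 * addresses.
 */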
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
			    &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_assigned_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_assigned_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vm_maxmem(vm);
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
    struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
    vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}
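/*
 * Note that vm_set_register() below special-cases %rip: writing
 * VM_REG_GUEST_RIP also resets 'nextrip', so that, for example,
 * vm_set_register(vm, 0, VM_REG_GUEST_RIP, 0xfff0) guarantees the next
 * VMRUN() resumes vcpu 0 at the reset vector rather than at a stale
 * 'nextrip' computed from the last VM exit.
 */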
int
vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
{
	struct vcpu *vcpu;
	int error;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	error = VMSETREG(vm->cookie, vcpuid, reg, val);
	if (error || reg != VM_REG_GUEST_RIP)
		return (error);

	/* Set 'nextrip' to match the value of %rip */
	VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
	vcpu = &vm->vcpu[vcpuid];
	vcpu->nextrip = val;
	return (0);
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
    struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/* restore guest XCR0 if XSAVE is enabled in the host */
	if (rcr4() & CR4_XSAVE)
		load_xcr(0, vcpu->guest_xcr0);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest XCR0 and restore host XCR0 */
	if (rcr4() & CR4_XSAVE) {
		vcpu->guest_xcr0 = rxcr(0);
		load_xcr(0, vmm_get_host_xcr0());
	}

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
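/*
 * The FPU handoff above relies on the CR0.TS trick: restore_guest_fpustate()
 * loads the guest state and then sets CR0.TS, so any stray host FPU use
 * while the guest context is live faults with #NM instead of silently
 * corrupting guest state.  save_guest_fpustate() asserts that TS is still
 * set before saving; the two calls bracket every VMRUN() in vm_run().
 */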
static int
vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	struct vcpu *vcpu;
	int error;

	vcpu = &vm->vcpu[vcpuid];
	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE) {
			vcpu->reqidle = 1;
			vcpu_notify_event_locked(vcpu, false);
			VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
			    "idle requested", vcpu_state2str(vcpu->state));
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
		}
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}
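/*
 * Putting the transitions together, a typical vm_run ioctl moves a vcpu
 * IDLE -> FROZEN (with from_idle=true, which serializes against other
 * ioctls), then FROZEN -> RUNNING around VMRUN(), back to FROZEN on the
 * VM exit, and finally FROZEN -> IDLE when the ioctl returns to
 * userspace.  A vcpu emulating HLT detours FROZEN -> SLEEPING -> FROZEN.
 */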
static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{

	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

	/*
	 * Update 'rendezvous_func' and execute a write memory barrier to
	 * ensure that it is visible across all host cpus. This is not needed
	 * for correctness but it does ensure that all the vcpus will notice
	 * that the rendezvous is requested immediately.
	 */
	vm->rendezvous_func = func;
	wmb();
}

#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
	do {								\
		if (vcpuid >= 0)					\
			VCPU_CTR0(vm, vcpuid, fmt);			\
		else							\
			VM_CTR0(vm, fmt);				\
	} while (0)

static void
vm_handle_rendezvous(struct vm *vm, int vcpuid)
{

	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));

	mtx_lock(&vm->rendezvous_mtx);
	while (vm->rendezvous_func != NULL) {
		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);

		if (vcpuid != -1 &&
		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
		}
		if (CPU_CMP(&vm->rendezvous_req_cpus,
		    &vm->rendezvous_done_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
			vm_set_rendezvous_func(vm, NULL);
			wakeup(&vm->rendezvous_func);
			break;
		}
		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
		    "vmrndv", 0);
	}
	mtx_unlock(&vm->rendezvous_mtx);
}
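/*
 * The rendezvous protocol in a nutshell: the initiator
 * (vm_smp_rendezvous() below) publishes 'rendezvous_req_cpus' and
 * 'rendezvous_func' and then kicks the target vcpus.  Each target exits
 * guest mode, calls vm_handle_rendezvous(), runs the callback exactly
 * once and marks itself in 'rendezvous_done_cpus'.  Whichever vcpu
 * completes the set clears the function pointer and wakes everyone
 * sleeping on &vm->rendezvous_func.
 */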
/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
	struct vcpu *vcpu;
	const char *wmesg;
	int t, vcpu_halted, vm_halted;

	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));

	vcpu = &vm->vcpu[vcpuid];
	vcpu_halted = 0;
	vm_halted = 0;

	vcpu_lock(vcpu);
	while (1) {
		/*
		 * Do a final check for pending NMI or interrupts before
		 * really putting this thread to sleep. Also check for
		 * software events that would cause this vcpu to wakeup.
		 *
		 * These interrupts/events could have happened after the
		 * vcpu returned from VMRUN() and before it acquired the
		 * vcpu lock above.
		 */
		if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
			break;
		if (vm_nmi_pending(vm, vcpuid))
			break;
		if (!intr_disabled) {
			if (vm_extint_pending(vm, vcpuid) ||
			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
				break;
			}
		}

		/* Don't go to sleep if the vcpu thread needs to yield */
		if (vcpu_should_yield(vm, vcpuid))
			break;

		/*
		 * Some Linux guests implement "halt" by having all vcpus
		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
		 * track of the vcpus that have entered this state. When all
		 * vcpus enter the halted state the virtual machine is halted.
		 */
		if (intr_disabled) {
			wmesg = "vmhalt";
			VCPU_CTR0(vm, vcpuid, "Halted");
			if (!vcpu_halted && halt_detection_enabled) {
				vcpu_halted = 1;
				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
			}
			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
				vm_halted = 1;
				break;
			}
		} else {
			wmesg = "vmidle";
		}

		t = ticks;
		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
		/*
		 * XXX msleep_spin() cannot be interrupted by signals so
		 * wake up periodically to check pending signals.
		 */
		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}

	if (vcpu_halted)
		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);

	vcpu_unlock(vcpu);

	if (vm_halted)
		vm_suspend(vm, VM_SUSPEND_HALT);

	return (0);
}

static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0) {
			VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
			    ftype == VM_PROT_READ ? "accessed" : "dirty",
			    vme->u.paging.gpa);
			goto done;
		}
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	return (0);
}
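/*
 * For example, a nested page fault on a guest write to a page that is
 * resident but not yet marked dirty is resolved entirely by
 * pmap_emulate_accessed_dirty(); only if that fails (e.g. the page was
 * paged out) does the fault fall through to vm_fault() on the guest
 * vmspace.  Either way the guest re-executes the faulting instruction,
 * which is why the inst_length == 0 assertion above must hold.
 */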
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	uint64_t gla, gpa, cs_base;
	struct vm_guest_paging *paging;
	mem_region_read_t mread;
	mem_region_write_t mwrite;
	enum vm_cpu_mode cpu_mode;
	int cs_d, error, fault;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
	    __func__, vme->inst_length));

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cs_base = vme->u.inst_emul.cs_base;
	cs_d = vme->u.inst_emul.cs_d;
	vie = &vme->u.inst_emul.vie;
	paging = &vme->u.inst_emul.paging;
	cpu_mode = paging->cpu_mode;

	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);

	/* Fetch, decode and emulate the faulting instruction */
	if (vie->num_valid == 0) {
		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
		    cs_base, VIE_INST_SIZE, vie, &fault);
	} else {
		/*
		 * The instruction bytes have already been copied into 'vie'
		 */
		error = fault = 0;
	}
	if (error || fault)
		return (error);

	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
		    vme->rip + cs_base);
		*retu = true;	/* dump instruction bytes in userspace */
		return (0);
	}

	/*
	 * Update 'nextrip' based on the length of the emulated instruction.
	 */
	vme->inst_length = vie->num_processed;
	vcpu->nextrip += vie->num_processed;
	VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
	    "decoding", vcpu->nextrip);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
	    mread, mwrite, retu);

	return (error);
}

static int
vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
{
	int i, done;
	struct vcpu *vcpu;

	done = 0;
	vcpu = &vm->vcpu[vcpuid];

	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);

	/*
	 * Wait until all 'active_cpus' have suspended themselves.
	 *
	 * Since a VM may be suspended at any time including when one or
	 * more vcpus are doing a rendezvous we need to call the rendezvous
	 * handler while we are waiting to prevent a deadlock.
	 */
	vcpu_lock(vcpu);
	while (1) {
		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
			break;
		}

		if (vm->rendezvous_func == NULL) {
			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
			vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
			vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
		} else {
			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
			vcpu_unlock(vcpu);
			vm_handle_rendezvous(vm, vcpuid);
			vcpu_lock(vcpu);
		}
	}
	vcpu_unlock(vcpu);

	/*
	 * Wakeup the other sleeping vcpus and return to userspace.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &vm->suspended_cpus)) {
			vcpu_notify_event(vm, i, false);
		}
	}

	*retu = true;
	return (0);
}

static int
vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
	vcpu->reqidle = 0;
	vcpu_unlock(vcpu);
	*retu = true;
	return (0);
}
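/*
 * vm_suspend() below is the initiator side of the shutdown handshake: it
 * atomically latches the suspend reason in vm->suspend and notifies every
 * active vcpu.  Each vcpu observes the flag through evinfo.sptr in
 * VMRUN(), exits with VM_EXITCODE_SUSPENDED and parks in
 * vm_handle_suspend() above until all of 'active_cpus' have arrived in
 * 'suspended_cpus'.
 */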
int
vm_suspend(struct vm *vm, enum vm_suspend_how how)
{
	int i;

	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
		return (EINVAL);

	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
		VM_CTR2(vm, "virtual machine already suspended %d/%d",
		    vm->suspend, how);
		return (EALREADY);
	}

	VM_CTR1(vm, "virtual machine successfully suspended %d", how);

	/*
	 * Notify all active vcpus that they are now suspended.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &vm->active_cpus))
			vcpu_notify_event(vm, i, false);
	}

	return (0);
}

void
vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
	vmexit->u.suspended.how = vm->suspend;
}

void
vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
}

void
vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_REQIDLE;
	vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
}

void
vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
{
	struct vm_exit *vmexit;

	vmexit = vm_exitinfo(vm, vcpuid);
	vmexit->rip = rip;
	vmexit->inst_length = 0;
	vmexit->exitcode = VM_EXITCODE_BOGUS;
	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
}
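/*
 * vm_run() below is the heart of the vcpu thread: it enters the guest
 * inside a critical section, and when VMRUN() returns it dispatches on
 * the exit code.  Exits that the kernel can satisfy (paging, HLT,
 * in-kernel device emulation, rendezvous) loop straight back into the
 * guest via 'restart'; everything else sets 'retu' and bounces the exit
 * up to the userspace process (e.g. bhyve) that issued the ioctl.
 */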
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	struct vm_eventinfo evinfo;
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	evinfo.rptr = &vm->rendezvous_func;
	evinfo.sptr = &vm->suspend;
	evinfo.iptr = &vcpu->reqidle;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = false;
		vcpu->nextrip = vme->rip + vme->inst_length;
		switch (vme->exitcode) {
		case VM_EXITCODE_REQIDLE:
			error = vm_handle_reqidle(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_SUSPENDED:
			error = vm_handle_suspend(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_IOAPIC_EOI:
			vioapic_process_eoi(vm, vcpuid,
			    vme->u.ioapic_eoi.vector);
			break;
		case VM_EXITCODE_RENDEZVOUS:
			vm_handle_rendezvous(vm, vcpuid);
			error = 0;
			break;
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INOUT:
		case VM_EXITCODE_INOUT_STR:
			error = vm_handle_inout(vm, vcpuid, vme, &retu);
			break;
		case VM_EXITCODE_MONITOR:
		case VM_EXITCODE_MWAIT:
			vm_inject_ud(vm, vcpuid);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == false)
		goto restart;

	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}
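/*
 * Every pass through the loop above recomputes the resume point as
 * nextrip = rip + inst_length, i.e. the instruction after the one that
 * caused the exit.  vm_restart_instruction() below is the knob for
 * undoing that: for a RUNNING vcpu it zeroes inst_length in 'exitinfo',
 * and for a FROZEN vcpu it rewinds 'nextrip' to the current %rip, so the
 * faulting instruction is executed again on the next entry.
 */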
int
vm_restart_instruction(void *arg, int vcpuid)
{
	struct vm *vm;
	struct vcpu *vcpu;
	enum vcpu_state state;
	uint64_t rip;
	int error;

	vm = arg;
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	state = vcpu_get_state(vm, vcpuid, NULL);
	if (state == VCPU_RUNNING) {
		/*
		 * When a vcpu is "running" the next instruction is determined
		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
		 * Thus setting 'inst_length' to zero will cause the current
		 * instruction to be restarted.
		 */
		vcpu->exitinfo.inst_length = 0;
		VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
		    "setting inst_length to zero", vcpu->exitinfo.rip);
	} else if (state == VCPU_FROZEN) {
		/*
		 * When a vcpu is "frozen" it is outside the critical section
		 * around VMRUN() and 'nextrip' points to the next instruction.
		 * Thus instruction restart is achieved by setting 'nextrip'
		 * to the vcpu's %rip.
		 */
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
		VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
		    "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
		vcpu->nextrip = rip;
	} else {
		panic("%s: invalid state %d", __func__, state);
	}
	return (0);
}

int
vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
{
	struct vcpu *vcpu;
	int type, vector;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (info & VM_INTINFO_VALID) {
		type = info & VM_INTINFO_TYPE;
		vector = info & 0xff;
		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
			return (EINVAL);
		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
			return (EINVAL);
		if (info & VM_INTINFO_RSVD)
			return (EINVAL);
	} else {
		info = 0;
	}
	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
	vcpu->exitintinfo = info;
	return (0);
}

enum exc_class {
	EXC_BENIGN,
	EXC_CONTRIBUTORY,
	EXC_PAGEFAULT
};

#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */

static enum exc_class
exception_class(uint64_t info)
{
	int type, vector;

	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
	type = info & VM_INTINFO_TYPE;
	vector = info & 0xff;

	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
	switch (type) {
	case VM_INTINFO_HWINTR:
	case VM_INTINFO_SWINTR:
	case VM_INTINFO_NMI:
		return (EXC_BENIGN);
	default:
		/*
		 * Hardware exception.
		 *
		 * SVM and VT-x use identical type values to represent NMI,
		 * hardware interrupt and software interrupt.
		 *
		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
		 * for exceptions except #BP and #OF. #BP and #OF use a type
		 * value of '5' or '6'. Therefore we don't check for explicit
		 * values of 'type' to classify 'intinfo' into a hardware
		 * exception.
		 */
		break;
	}

	switch (vector) {
	case IDT_PF:
	case IDT_VE:
		return (EXC_PAGEFAULT);
	case IDT_DE:
	case IDT_TS:
	case IDT_NP:
	case IDT_SS:
	case IDT_GP:
		return (EXC_CONTRIBUTORY);
	default:
		return (EXC_BENIGN);
	}
}
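/*
 * Worked examples of the classification used by nested_fault() below: a
 * #GP raised while delivering a #NP (contributory on contributory) and a
 * #GP raised while delivering a #PF (anything non-benign on a page fault)
 * both escalate to #DF with a zero error code; a #PF raised while
 * delivering a hardware interrupt (benign) is simply delivered next.  A
 * fault while delivering #DF itself is a triple fault, which suspends the
 * VM with VM_SUSPEND_TRIPLEFAULT.
 */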
static int
nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
    uint64_t *retinfo)
{
	enum exc_class exc1, exc2;
	int type1, vector1;

	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));

	/*
	 * If an exception occurs while attempting to call the double-fault
	 * handler the processor enters shutdown mode (aka triple fault).
	 */
	type1 = info1 & VM_INTINFO_TYPE;
	vector1 = info1 & 0xff;
	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
		    info1, info2);
		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
		*retinfo = 0;
		return (0);
	}

	/*
	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
	 */
	exc1 = exception_class(info1);
	exc2 = exception_class(info2);
	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
		/* Convert nested fault into a double fault. */
		*retinfo = IDT_DF;
		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		*retinfo |= VM_INTINFO_DEL_ERRCODE;
	} else {
		/* Handle exceptions serially */
		*retinfo = info2;
	}
	return (1);
}

static uint64_t
vcpu_exception_intinfo(struct vcpu *vcpu)
{
	uint64_t info = 0;

	if (vcpu->exception_pending) {
		info = vcpu->exc_vector & 0xff;
		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
		if (vcpu->exc_errcode_valid) {
			info |= VM_INTINFO_DEL_ERRCODE;
			info |= (uint64_t)vcpu->exc_errcode << 32;
		}
	}
	return (info);
}

int
vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
{
	struct vcpu *vcpu;
	uint64_t info1, info2;
	int valid;

	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));

	vcpu = &vm->vcpu[vcpuid];

	info1 = vcpu->exitintinfo;
	vcpu->exitintinfo = 0;

	info2 = 0;
	if (vcpu->exception_pending) {
		info2 = vcpu_exception_intinfo(vcpu);
		vcpu->exception_pending = 0;
		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
		    vcpu->exc_vector, info2);
	}

	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
	} else if (info1 & VM_INTINFO_VALID) {
		*retinfo = info1;
		valid = 1;
	} else if (info2 & VM_INTINFO_VALID) {
		*retinfo = info2;
		valid = 1;
	} else {
		valid = 0;
	}

	if (valid) {
		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
	}

	return (valid);
}

int
vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];
	*info1 = vcpu->exitintinfo;
	*info2 = vcpu_exception_intinfo(vcpu);
	return (0);
}
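/*
 * As vcpu_exception_intinfo() above shows, the 64-bit 'intinfo' encoding
 * follows the hardware event-injection format: the vector lives in bits
 * 7:0, the event type and valid/error-code flags in the low word, and the
 * error code itself in the upper 32 bits.  For example, a pending #GP
 * with error code 0x10 is encoded as
 *
 *	((uint64_t)0x10 << 32) | VM_INTINFO_VALID |
 *	    VM_INTINFO_HWEXCEPTION | VM_INTINFO_DEL_ERRCODE | IDT_GP
 */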
int
vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
    uint32_t errcode, int restart_instruction)
{
	struct vcpu *vcpu;
	uint64_t regval;
	int error;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (vector < 0 || vector >= 32)
		return (EINVAL);

	/*
	 * A double fault exception should never be injected directly into
	 * the guest. It is a derived exception that results from specific
	 * combinations of nested faults.
	 */
	if (vector == IDT_DF)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->exception_pending) {
		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
		    "pending exception %d", vector, vcpu->exc_vector);
		return (EBUSY);
	}

	if (errcode_valid) {
		/*
		 * Exceptions don't deliver an error code in real mode.
		 */
		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
		KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
		if (!(regval & CR0_PE))
			errcode_valid = 0;
	}

	/*
	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
	 *
	 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
	 * one instruction or incurs an exception.
	 */
	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
	    __func__, error));

	if (restart_instruction)
		vm_restart_instruction(vm, vcpuid);

	vcpu->exception_pending = 1;
	vcpu->exc_vector = vector;
	vcpu->exc_errcode = errcode;
	vcpu->exc_errcode_valid = errcode_valid;
	VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
	return (0);
}

void
vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
    int errcode)
{
	struct vm *vm;
	int error, restart_instruction;

	vm = vmarg;
	restart_instruction = 1;

	error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
	    errcode, restart_instruction);
	KASSERT(error == 0, ("vm_inject_exception error %d", error));
}

void
vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
{
	struct vm *vm;
	int error;

	vm = vmarg;
	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
	    error_code, cr2);

	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));

	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");

int
vm_inject_extint(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;
	vcpu_notify_event(vm, vcpuid, false);
	return (0);
}
int
vm_extint_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->extint_pending);
}

void
vm_extint_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->extint_pending == 0)
		panic("vm_extint_clear: inconsistent extint_pending state");

	vcpu->extint_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
    bool from_idle)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

int
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (CPU_ISSET(vcpuid, &vm->active_cpus))
		return (EBUSY);

	VCPU_CTR0(vm, vcpuid, "activated");
	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
	return (0);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

cpuset_t
vm_suspended_cpus(struct vm *vm)
{

	return (vm->suspended_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}
/*
 * This function is called to ensure that a vcpu "sees" a pending event
 * as soon as possible:
 * - If the vcpu thread is sleeping then it is woken up.
 * - If the vcpu is running on a different host_cpu then an IPI will be
 *   directed to the host_cpu to cause the vcpu to trap into the hypervisor.
 */
static void
vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
{
	int hostcpu;

	hostcpu = vcpu->hostcpu;
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
		if (hostcpu != curcpu) {
			if (lapic_intr) {
				vlapic_post_intr(vcpu->vlapic, hostcpu,
				    vmm_ipinum);
			} else {
				ipi_cpu(hostcpu, vmm_ipinum);
			}
		} else {
			/*
			 * If the 'vcpu' is running on 'curcpu' then it must
			 * be sending a notification to itself (e.g. SELF_IPI).
			 * The pending event will be picked up when the vcpu
			 * transitions back to guest context.
			 */
		}
	} else {
		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
		    "with hostcpu %d", vcpu->state, hostcpu));
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	}
}

void
vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	vcpu_notify_event_locked(vcpu, lapic_intr);
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}
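/*
 * Editor's sketch (not part of the original file): the notification
 * idiom used by the event-injection paths earlier in this file (e.g.
 * the ExtINT path whose tail appears above): publish the pending event
 * first, then call vcpu_notify_event() so a sleeping vcpu is woken and
 * a running one is IPI'd out of guest context to re-evaluate it.
 */
#if 0
static void
example_post_extint(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu = &vm->vcpu[vcpuid];

	vcpu->extint_pending = 1;		/* make the event visible */
	vcpu_notify_event(vm, vcpuid, false);	/* wake up or IPI the vcpu */
}
#endif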
void
vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg)
{
	int i;

	/*
	 * Enforce that this function is called without any locks
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));

restart:
	mtx_lock(&vm->rendezvous_mtx);
	if (vm->rendezvous_func != NULL) {
		/*
		 * If a rendezvous is already in progress then we need to
		 * call the rendezvous handler in case this 'vcpuid' is one
		 * of the targets of the rendezvous.
		 */
		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
		mtx_unlock(&vm->rendezvous_mtx);
		vm_handle_rendezvous(vm, vcpuid);
		goto restart;
	}
	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
	    "rendezvous is still in progress"));

	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
	vm->rendezvous_req_cpus = dest;
	CPU_ZERO(&vm->rendezvous_done_cpus);
	vm->rendezvous_arg = arg;
	vm_set_rendezvous_func(vm, func);
	mtx_unlock(&vm->rendezvous_mtx);

	/*
	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
	 * vcpus so they handle the rendezvous as soon as possible.
	 */
	for (i = 0; i < VM_MAXCPU; i++) {
		if (CPU_ISSET(i, &dest))
			vcpu_notify_event(vm, i, false);
	}

	vm_handle_rendezvous(vm, vcpuid);
}

struct vatpic *
vm_atpic(struct vm *vm)
{
	return (vm->vatpic);
}

struct vatpit *
vm_atpit(struct vm *vm)
{
	return (vm->vatpit);
}

struct vpmtmr *
vm_pmtmr(struct vm *vm)
{

	return (vm->vpmtmr);
}

struct vrtc *
vm_rtc(struct vm *vm)
{

	return (vm->vrtc);
}

enum vm_reg_name
vm_segment_name(int seg)
{
	static enum vm_reg_name seg_names[] = {
		VM_REG_GUEST_ES,
		VM_REG_GUEST_CS,
		VM_REG_GUEST_SS,
		VM_REG_GUEST_DS,
		VM_REG_GUEST_FS,
		VM_REG_GUEST_GS
	};

	KASSERT(seg >= 0 && seg < nitems(seg_names),
	    ("%s: invalid segment encoding %d", __func__, seg));
	return (seg_names[seg]);
}

void
vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
    int num_copyinfo)
{
	int idx;

	for (idx = 0; idx < num_copyinfo; idx++) {
		if (copyinfo[idx].cookie != NULL)
			vm_gpa_release(copyinfo[idx].cookie);
	}
	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
}

int
vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
    int num_copyinfo, int *fault)
{
	int error, idx, nused;
	size_t n, off, remaining;
	void *hva, *cookie;
	uint64_t gpa;

	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);

	nused = 0;
	remaining = len;
	while (remaining > 0) {
		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);
		off = gpa & PAGE_MASK;
		n = min(remaining, PAGE_SIZE - off);
		copyinfo[nused].gpa = gpa;
		copyinfo[nused].len = n;
		remaining -= n;
		gla += n;
		nused++;
	}

	for (idx = 0; idx < nused; idx++) {
		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
		    prot, &cookie);
		if (hva == NULL)
			break;
		copyinfo[idx].hva = hva;
		copyinfo[idx].cookie = cookie;
	}

	if (idx != nused) {
		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
		return (EFAULT);
	} else {
		*fault = 0;
		return (0);
	}
}

void
vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
    size_t len)
{
	char *dst;
	int idx;

	dst = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		dst += copyinfo[idx].len;
		idx++;
	}
}

void
vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
    struct vm_copyinfo *copyinfo, size_t len)
{
	const char *src;
	int idx;

	src = kaddr;
	idx = 0;
	while (len > 0) {
		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
		len -= copyinfo[idx].len;
		src += copyinfo[idx].len;
		idx++;
	}
}
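/*
 * Editor's sketch (not part of the original file): how an emulation
 * path typically drives vm_copy_setup()/vm_copyin()/vm_copy_teardown()
 * to read a guest buffer that may straddle a page boundary.  The helper
 * name is hypothetical; VM_PROT_READ comes from <vm/vm.h>.  Two
 * vm_copyinfo entries are enough for any span of at most one page.
 */
#if 0
static int
example_read_guest(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len)
{
	struct vm_copyinfo copyinfo[2];
	int error, fault;

	error = vm_copy_setup(vm, vcpuid, paging, gla, len, VM_PROT_READ,
	    copyinfo, nitems(copyinfo), &fault);
	if (error || fault) {
		/*
		 * On 'fault' an exception was already injected into the
		 * guest; the caller simply resumes the vcpu.
		 */
		return (error);
	}

	vm_copyin(vm, vcpuid, copyinfo, buf, len);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	return (0);
}
#endif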
/*
 * Return the amount of in-use and wired memory for the VM.  Since
 * these are global stats, only return the values for vcpu 0.
 */
VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
VMM_STAT_DECLARE(VMM_MEM_WIRED);

static void
vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
		    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
	}
}

static void
vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
{

	if (vcpu == 0) {
		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
		    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
	}
}

VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
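/*
 * Editor's sketch (not part of the original file): the two stat idioms
 * used in this file.  A plain counter is declared with VMM_STAT() and
 * bumped with vmm_stat_incr() (as VCPU_EXTINT_COUNT is above), whereas
 * a computed value such as VMM_MEM_RESIDENT uses VMM_STAT_FUNC() with a
 * callback that refreshes the value whenever the vcpu's stats are read.
 * The names below are hypothetical.
 */
#if 0
VMM_STAT(EXAMPLE_EVENT_COUNT, "example events");

static void
example_count_event(struct vm *vm, int vcpuid)
{
	vmm_stat_incr(vm, vcpuid, EXAMPLE_EVENT_COUNT, 1);
}
#endif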