vmm.c revision 270071
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 270071 2014-08-17 01:00:42Z grehan $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 270071 2014-08-17 01:00:42Z grehan $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/systm.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65#include <machine/vmm_instruction_emul.h>
66
67#include "vmm_ioport.h"
68#include "vmm_ktr.h"
69#include "vmm_host.h"
70#include "vmm_mem.h"
71#include "vmm_util.h"
72#include "vatpic.h"
73#include "vatpit.h"
74#include "vhpet.h"
75#include "vioapic.h"
76#include "vlapic.h"
77#include "vmm_msr.h"
78#include "vmm_ipi.h"
79#include "vmm_stat.h"
80#include "vmm_lapic.h"
81
82#include "io/ppt.h"
83#include "io/iommu.h"
84
85struct vlapic;
86
87/*
88 * Initialization:
89 * (a) allocated when vcpu is created
90 * (i) initialized when vcpu is created and when it is reinitialized
91 * (o) initialized the first time the vcpu is created
92 * (x) initialized before use
93 */
94struct vcpu {
95	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
96	enum vcpu_state	state;		/* (o) vcpu state */
97	int		hostcpu;	/* (o) vcpu's host cpu */
98	struct vlapic	*vlapic;	/* (i) APIC device model */
99	enum x2apic_state x2apic_state;	/* (i) APIC mode */
100	int		nmi_pending;	/* (i) NMI pending */
101	int		extint_pending;	/* (i) INTR pending */
102	struct vm_exception exception;	/* (x) exception collateral */
103	int	exception_pending;	/* (i) exception pending */
104	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
105	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
106	void		*stats;		/* (a,i) statistics */
107	uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
108	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
109};
110
111#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
112#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
113#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
114#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
115#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
116
117struct mem_seg {
118	vm_paddr_t	gpa;
119	size_t		len;
120	boolean_t	wired;
121	vm_object_t	object;
122};
123#define	VM_MAX_MEMORY_SEGMENTS	2
124
125/*
126 * Initialization:
127 * (o) initialized the first time the VM is created
128 * (i) initialized when VM is created and when it is reinitialized
129 * (x) initialized before use
130 */
131struct vm {
132	void		*cookie;		/* (i) cpu-specific data */
133	void		*iommu;			/* (x) iommu-specific data */
134	struct vhpet	*vhpet;			/* (i) virtual HPET */
135	struct vioapic	*vioapic;		/* (i) virtual ioapic */
136	struct vatpic	*vatpic;		/* (i) virtual atpic */
137	struct vatpit	*vatpit;		/* (i) virtual atpit */
138	volatile cpuset_t active_cpus;		/* (i) active vcpus */
139	int		suspend;		/* (i) stop VM execution */
140	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
141	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
142	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
143	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
144	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
145	vm_rendezvous_func_t rendezvous_func;
146	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
147	int		num_mem_segs;		/* (o) guest memory segments */
148	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
149	struct vmspace	*vmspace;		/* (o) guest's address space */
150	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
151	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
152};
153
154static int vmm_initialized;
155
156static struct vmm_ops *ops;
157#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
158#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
159#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
160
161#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
162#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
163	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
164#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
165#define	VMSPACE_ALLOC(min, max) \
166	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
167#define	VMSPACE_FREE(vmspace) \
168	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
169#define	VMGETREG(vmi, vcpu, num, retval)		\
170	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
171#define	VMSETREG(vmi, vcpu, num, val)		\
172	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
173#define	VMGETDESC(vmi, vcpu, num, desc)		\
174	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
175#define	VMSETDESC(vmi, vcpu, num, desc)		\
176	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
177#define	VMGETCAP(vmi, vcpu, num, retval)	\
178	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
179#define	VMSETCAP(vmi, vcpu, num, val)		\
180	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
181#define	VLAPIC_INIT(vmi, vcpu)			\
182	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
183#define	VLAPIC_CLEANUP(vmi, vlapic)		\
184	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
185
186#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
187#define	fpu_stop_emulating()	clts()
188
189static MALLOC_DEFINE(M_VM, "vm", "vm");
190CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
191
192/* statistics */
193static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
194
195SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
196
197/*
198 * Halt the guest if all vcpus are executing a HLT instruction with
199 * interrupts disabled.
200 */
201static int halt_detection_enabled = 1;
202TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
203SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
204    &halt_detection_enabled, 0,
205    "Halt VM if all vcpus execute HLT with interrupts disabled");
206
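/*
 * Illustrative note, not part of the original file: since the sysctl is
 * CTLFLAG_RDTUN the knob is set as a boot-time tunable, e.g.
 * hw.vmm.halt_detection="0" in loader.conf(5) to disable whole-VM halt
 * detection.
 */
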
207static int vmm_ipinum;
208SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
209    "IPI vector used for vcpu notifications");
210
211static void
212vcpu_cleanup(struct vm *vm, int i, bool destroy)
213{
214	struct vcpu *vcpu = &vm->vcpu[i];
215
216	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
217	if (destroy) {
218		vmm_stat_free(vcpu->stats);
219		fpu_save_area_free(vcpu->guestfpu);
220	}
221}
222
223static void
224vcpu_init(struct vm *vm, int vcpu_id, bool create)
225{
226	struct vcpu *vcpu;
227
228	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
229	    ("vcpu_init: invalid vcpu %d", vcpu_id));
230
231	vcpu = &vm->vcpu[vcpu_id];
232
233	if (create) {
234		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
235		    "initialized", vcpu_id));
236		vcpu_lock_init(vcpu);
237		vcpu->state = VCPU_IDLE;
238		vcpu->hostcpu = NOCPU;
239		vcpu->guestfpu = fpu_save_area_alloc();
240		vcpu->stats = vmm_stat_alloc();
241	}
242
243	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
244	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
245	vcpu->nmi_pending = 0;
246	vcpu->extint_pending = 0;
247	vcpu->exception_pending = 0;
248	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
249	fpu_save_area_reset(vcpu->guestfpu);
250	vmm_stat_init(vcpu->stats);
251	guest_msrs_init(vm, vcpu_id);
252}
253
254struct vm_exit *
255vm_exitinfo(struct vm *vm, int cpuid)
256{
257	struct vcpu *vcpu;
258
259	if (cpuid < 0 || cpuid >= VM_MAXCPU)
260		panic("vm_exitinfo: invalid cpuid %d", cpuid);
261
262	vcpu = &vm->vcpu[cpuid];
263
264	return (&vcpu->exitinfo);
265}
266
267static void
268vmm_resume(void)
269{
270	VMM_RESUME();
271}
272
273static int
274vmm_init(void)
275{
276	int error;
277
278	vmm_host_state_init();
279
280	vmm_ipinum = vmm_ipi_alloc();
281	if (vmm_ipinum == 0)
282		vmm_ipinum = IPI_AST;
283
284	error = vmm_mem_init();
285	if (error)
286		return (error);
287
288	if (vmm_is_intel())
289		ops = &vmm_ops_intel;
290	else if (vmm_is_amd())
291		ops = &vmm_ops_amd;
292	else
293		return (ENXIO);
294
295	vmm_msr_init();
296	vmm_resume_p = vmm_resume;
297
298	return (VMM_INIT(vmm_ipinum));
299}
300
301static int
302vmm_handler(module_t mod, int what, void *arg)
303{
304	int error;
305
306	switch (what) {
307	case MOD_LOAD:
308		vmmdev_init();
309		if (ppt_avail_devices() > 0)
310			iommu_init();
311		error = vmm_init();
312		if (error == 0)
313			vmm_initialized = 1;
314		break;
315	case MOD_UNLOAD:
316		error = vmmdev_cleanup();
317		if (error == 0) {
318			vmm_resume_p = NULL;
319			iommu_cleanup();
320			if (vmm_ipinum != IPI_AST)
321				vmm_ipi_free(vmm_ipinum);
322			error = VMM_CLEANUP();
323			/*
324			 * Something bad happened - prevent new
325			 * VMs from being created
326			 */
327			if (error)
328				vmm_initialized = 0;
329		}
330		break;
331	default:
332		error = 0;
333		break;
334	}
335	return (error);
336}
337
338static moduledata_t vmm_kmod = {
339	"vmm",
340	vmm_handler,
341	NULL
342};
343
344/*
345 * vmm initialization has the following dependencies:
346 *
347 * - iommu initialization must happen after the pci passthru driver has had
348 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
349 *
350 * - VT-x initialization requires smp_rendezvous() and therefore must happen
351 *   after SMP is fully functional (after SI_SUB_SMP).
352 */
353DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
354MODULE_VERSION(vmm, 1);
355
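/*
 * Illustrative usage, not part of the original file: the module is
 * normally loaded with "kldload vmm", or at boot by adding
 * vmm_load="YES" to loader.conf(5).
 */
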
356static void
357vm_init(struct vm *vm, bool create)
358{
359	int i;
360
361	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
362	vm->iommu = NULL;
363	vm->vioapic = vioapic_init(vm);
364	vm->vhpet = vhpet_init(vm);
365	vm->vatpic = vatpic_init(vm);
366	vm->vatpit = vatpit_init(vm);
367
368	CPU_ZERO(&vm->active_cpus);
369
370	vm->suspend = 0;
371	CPU_ZERO(&vm->suspended_cpus);
372
373	for (i = 0; i < VM_MAXCPU; i++)
374		vcpu_init(vm, i, create);
375}
376
377int
378vm_create(const char *name, struct vm **retvm)
379{
380	struct vm *vm;
381	struct vmspace *vmspace;
382
383	/*
384	 * If vmm.ko could not be successfully initialized then don't attempt
385	 * to create the virtual machine.
386	 */
387	if (!vmm_initialized)
388		return (ENXIO);
389
390	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
391		return (EINVAL);
392
393	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
394	if (vmspace == NULL)
395		return (ENOMEM);
396
397	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
398	strcpy(vm->name, name);
399	vm->num_mem_segs = 0;
400	vm->vmspace = vmspace;
401	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
402
403	vm_init(vm, true);
404
405	*retvm = vm;
406	return (0);
407}
408
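/*
 * Illustrative sketch, not part of the original file: roughly how a
 * consumer (vmmdev_ioctl() acting on behalf of bhyve(8)) drives the
 * interfaces in this file.  The VM name, memory size and entry point are
 * placeholders and error handling is omitted.
 */
#if 0
static void
vm_lifecycle_sketch(void)
{
	struct vm *vm;

	(void) vm_create("examplevm", &vm);		/* hypothetical name */
	(void) vm_malloc(vm, 0, 256 * 1024 * 1024);	/* 256MB at gpa 0 */
	(void) vm_activate_cpu(vm, 0);			/* bring up the BSP */
	/* ... vm_run() loop, see the sketch following vm_run() below ... */
	vm_destroy(vm);
}
#endif
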
409static void
410vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
411{
412
413	if (seg->object != NULL)
414		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
415
416	bzero(seg, sizeof(*seg));
417}
418
419static void
420vm_cleanup(struct vm *vm, bool destroy)
421{
422	int i;
423
424	ppt_unassign_all(vm);
425
426	if (vm->iommu != NULL)
427		iommu_destroy_domain(vm->iommu);
428
429	vatpit_cleanup(vm->vatpit);
430	vhpet_cleanup(vm->vhpet);
431	vatpic_cleanup(vm->vatpic);
432	vioapic_cleanup(vm->vioapic);
433
434	for (i = 0; i < VM_MAXCPU; i++)
435		vcpu_cleanup(vm, i, destroy);
436
437	VMCLEANUP(vm->cookie);
438
439	if (destroy) {
440		for (i = 0; i < vm->num_mem_segs; i++)
441			vm_free_mem_seg(vm, &vm->mem_segs[i]);
442
443		vm->num_mem_segs = 0;
444
445		VMSPACE_FREE(vm->vmspace);
446		vm->vmspace = NULL;
447	}
448}
449
450void
451vm_destroy(struct vm *vm)
452{
453	vm_cleanup(vm, true);
454	free(vm, M_VM);
455}
456
457int
458vm_reinit(struct vm *vm)
459{
460	int error;
461
462	/*
463	 * A virtual machine can be reset only if all vcpus are suspended.
464	 */
465	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
466		vm_cleanup(vm, false);
467		vm_init(vm, false);
468		error = 0;
469	} else {
470		error = EBUSY;
471	}
472
473	return (error);
474}
475
476const char *
477vm_name(struct vm *vm)
478{
479	return (vm->name);
480}
481
482int
483vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
484{
485	vm_object_t obj;
486
487	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
488		return (ENOMEM);
489	else
490		return (0);
491}
492
493int
494vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
495{
496
497	vmm_mmio_free(vm->vmspace, gpa, len);
498	return (0);
499}
500
501boolean_t
502vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
503{
504	int i;
505	vm_paddr_t gpabase, gpalimit;
506
507	for (i = 0; i < vm->num_mem_segs; i++) {
508		gpabase = vm->mem_segs[i].gpa;
509		gpalimit = gpabase + vm->mem_segs[i].len;
510		if (gpa >= gpabase && gpa < gpalimit)
511			return (TRUE);		/* 'gpa' is regular memory */
512	}
513
514	if (ppt_is_mmio(vm, gpa))
515		return (TRUE);			/* 'gpa' is pci passthru mmio */
516
517	return (FALSE);
518}
519
520int
521vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
522{
523	int available, allocated;
524	struct mem_seg *seg;
525	vm_object_t object;
526	vm_paddr_t g;
527
528	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
529		return (EINVAL);
530
531	available = allocated = 0;
532	g = gpa;
533	while (g < gpa + len) {
534		if (vm_mem_allocated(vm, g))
535			allocated++;
536		else
537			available++;
538
539		g += PAGE_SIZE;
540	}
541
542	/*
543	 * If there are some allocated and some available pages in the address
544	 * range then it is an error.
545	 */
546	if (allocated && available)
547		return (EINVAL);
548
549	/*
550	 * If the entire address range being requested has already been
551	 * allocated then there isn't anything more to do.
552	 */
553	if (allocated && available == 0)
554		return (0);
555
556	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
557		return (E2BIG);
558
559	seg = &vm->mem_segs[vm->num_mem_segs];
560
561	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
562		return (ENOMEM);
563
564	seg->gpa = gpa;
565	seg->len = len;
566	seg->object = object;
567	seg->wired = FALSE;
568
569	vm->num_mem_segs++;
570
571	return (0);
572}
573
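/*
 * Illustrative sketch (an assumption about how userland typically splits
 * guest RAM, and why VM_MAX_MEMORY_SEGMENTS is only 2): one segment below
 * the legacy PCI hole and one starting at 4GB.  Sizes are examples only.
 */
#if 0
static void
vm_memseg_sketch(struct vm *vm)
{
	size_t lowmem = 3UL * 1024 * 1024 * 1024;	/* example: 3GB */
	size_t highmem = 1UL * 1024 * 1024 * 1024;	/* example: 1GB */

	(void) vm_malloc(vm, 0, lowmem);			  /* segment 0 */
	(void) vm_malloc(vm, 4UL * 1024 * 1024 * 1024, highmem); /* segment 1 */
}
#endif
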
574static void
575vm_gpa_unwire(struct vm *vm)
576{
577	int i, rv;
578	struct mem_seg *seg;
579
580	for (i = 0; i < vm->num_mem_segs; i++) {
581		seg = &vm->mem_segs[i];
582		if (!seg->wired)
583			continue;
584
585		rv = vm_map_unwire(&vm->vmspace->vm_map,
586				   seg->gpa, seg->gpa + seg->len,
587				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
588		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
589		    "%#lx/%ld could not be unwired: %d",
590		    vm_name(vm), seg->gpa, seg->len, rv));
591
592		seg->wired = FALSE;
593	}
594}
595
596static int
597vm_gpa_wire(struct vm *vm)
598{
599	int i, rv;
600	struct mem_seg *seg;
601
602	for (i = 0; i < vm->num_mem_segs; i++) {
603		seg = &vm->mem_segs[i];
604		if (seg->wired)
605			continue;
606
607		/* XXX rlimits? */
608		rv = vm_map_wire(&vm->vmspace->vm_map,
609				 seg->gpa, seg->gpa + seg->len,
610				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
611		if (rv != KERN_SUCCESS)
612			break;
613
614		seg->wired = TRUE;
615	}
616
617	if (i < vm->num_mem_segs) {
618		/*
619		 * Undo the wiring before returning an error.
620		 */
621		vm_gpa_unwire(vm);
622		return (EAGAIN);
623	}
624
625	return (0);
626}
627
628static void
629vm_iommu_modify(struct vm *vm, boolean_t map)
630{
631	int i, sz;
632	vm_paddr_t gpa, hpa;
633	struct mem_seg *seg;
634	void *vp, *cookie, *host_domain;
635
636	sz = PAGE_SIZE;
637	host_domain = iommu_host_domain();
638
639	for (i = 0; i < vm->num_mem_segs; i++) {
640		seg = &vm->mem_segs[i];
641		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
642		    vm_name(vm), seg->gpa, seg->len));
643
644		gpa = seg->gpa;
645		while (gpa < seg->gpa + seg->len) {
646			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
647					 &cookie);
648			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
649			    vm_name(vm), gpa));
650
651			vm_gpa_release(cookie);
652
653			hpa = DMAP_TO_PHYS((uintptr_t)vp);
654			if (map) {
655				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
656				iommu_remove_mapping(host_domain, hpa, sz);
657			} else {
658				iommu_remove_mapping(vm->iommu, gpa, sz);
659				iommu_create_mapping(host_domain, hpa, hpa, sz);
660			}
661
662			gpa += PAGE_SIZE;
663		}
664	}
665
666	/*
667	 * Invalidate the cached translations associated with the domain
668	 * from which pages were removed.
669	 */
670	if (map)
671		iommu_invalidate_tlb(host_domain);
672	else
673		iommu_invalidate_tlb(vm->iommu);
674}
675
676#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
677#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
678
679int
680vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
681{
682	int error;
683
684	error = ppt_unassign_device(vm, bus, slot, func);
685	if (error)
686		return (error);
687
688	if (ppt_assigned_devices(vm) == 0) {
689		vm_iommu_unmap(vm);
690		vm_gpa_unwire(vm);
691	}
692	return (0);
693}
694
695int
696vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
697{
698	int error;
699	vm_paddr_t maxaddr;
700
701	/*
702	 * Virtual machines with pci passthru devices get special treatment:
703	 * - the guest physical memory is wired
704	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
705	 *
706	 * We need to do this before the first pci passthru device is attached.
707	 */
708	if (ppt_assigned_devices(vm) == 0) {
709		KASSERT(vm->iommu == NULL,
710		    ("vm_assign_pptdev: iommu must be NULL"));
711		maxaddr = vmm_mem_maxaddr();
712		vm->iommu = iommu_create_domain(maxaddr);
713
714		error = vm_gpa_wire(vm);
715		if (error)
716			return (error);
717
718		vm_iommu_map(vm);
719	}
720
721	error = ppt_assign_device(vm, bus, slot, func);
722	return (error);
723}
724
725void *
726vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
727	    void **cookie)
728{
729	int count, pageoff;
730	vm_page_t m;
731
732	pageoff = gpa & PAGE_MASK;
733	if (len > PAGE_SIZE - pageoff)
734		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
735
736	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
737	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
738
739	if (count == 1) {
740		*cookie = m;
741		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
742	} else {
743		*cookie = NULL;
744		return (NULL);
745	}
746}
747
748void
749vm_gpa_release(void *cookie)
750{
751	vm_page_t m = cookie;
752
753	vm_page_lock(m);
754	vm_page_unhold(m);
755	vm_page_unlock(m);
756}
757
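/*
 * Illustrative usage of the hold/release pair above (a sketch only; the
 * real callers are the instruction emulation and iommu paths): hold one
 * guest page, copy a few bytes out of it via the direct map, then drop
 * the hold.  'gpa' must not cross a page boundary for the given length.
 */
#if 0
static uint32_t
vm_read_u32_sketch(struct vm *vm, vm_paddr_t gpa)
{
	uint32_t val = 0;
	void *cookie, *hva;

	hva = vm_gpa_hold(vm, gpa, sizeof(val), VM_PROT_READ, &cookie);
	if (hva != NULL) {
		bcopy(hva, &val, sizeof(val));
		vm_gpa_release(cookie);
	}
	return (val);
}
#endif
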
758int
759vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
760		  struct vm_memory_segment *seg)
761{
762	int i;
763
764	for (i = 0; i < vm->num_mem_segs; i++) {
765		if (gpabase == vm->mem_segs[i].gpa) {
766			seg->gpa = vm->mem_segs[i].gpa;
767			seg->len = vm->mem_segs[i].len;
768			seg->wired = vm->mem_segs[i].wired;
769			return (0);
770		}
771	}
772	return (-1);
773}
774
775int
776vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
777	      vm_offset_t *offset, struct vm_object **object)
778{
779	int i;
780	size_t seg_len;
781	vm_paddr_t seg_gpa;
782	vm_object_t seg_obj;
783
784	for (i = 0; i < vm->num_mem_segs; i++) {
785		if ((seg_obj = vm->mem_segs[i].object) == NULL)
786			continue;
787
788		seg_gpa = vm->mem_segs[i].gpa;
789		seg_len = vm->mem_segs[i].len;
790
791		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
792			*offset = gpa - seg_gpa;
793			*object = seg_obj;
794			vm_object_reference(seg_obj);
795			return (0);
796		}
797	}
798
799	return (EINVAL);
800}
801
802int
803vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
804{
805
806	if (vcpu < 0 || vcpu >= VM_MAXCPU)
807		return (EINVAL);
808
809	if (reg >= VM_REG_LAST)
810		return (EINVAL);
811
812	return (VMGETREG(vm->cookie, vcpu, reg, retval));
813}
814
815int
816vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
817{
818
819	if (vcpu < 0 || vcpu >= VM_MAXCPU)
820		return (EINVAL);
821
822	if (reg >= VM_REG_LAST)
823		return (EINVAL);
824
825	return (VMSETREG(vm->cookie, vcpu, reg, val));
826}
827
828static boolean_t
829is_descriptor_table(int reg)
830{
831
832	switch (reg) {
833	case VM_REG_GUEST_IDTR:
834	case VM_REG_GUEST_GDTR:
835		return (TRUE);
836	default:
837		return (FALSE);
838	}
839}
840
841static boolean_t
842is_segment_register(int reg)
843{
844
845	switch (reg) {
846	case VM_REG_GUEST_ES:
847	case VM_REG_GUEST_CS:
848	case VM_REG_GUEST_SS:
849	case VM_REG_GUEST_DS:
850	case VM_REG_GUEST_FS:
851	case VM_REG_GUEST_GS:
852	case VM_REG_GUEST_TR:
853	case VM_REG_GUEST_LDTR:
854		return (TRUE);
855	default:
856		return (FALSE);
857	}
858}
859
860int
861vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
862		struct seg_desc *desc)
863{
864
865	if (vcpu < 0 || vcpu >= VM_MAXCPU)
866		return (EINVAL);
867
868	if (!is_segment_register(reg) && !is_descriptor_table(reg))
869		return (EINVAL);
870
871	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
872}
873
874int
875vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
876		struct seg_desc *desc)
877{
878	if (vcpu < 0 || vcpu >= VM_MAXCPU)
879		return (EINVAL);
880
881	if (!is_segment_register(reg) && !is_descriptor_table(reg))
882		return (EINVAL);
883
884	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
885}
886
887static void
888restore_guest_fpustate(struct vcpu *vcpu)
889{
890
891	/* flush host state to the pcb */
892	fpuexit(curthread);
893
894	/* restore guest FPU state */
895	fpu_stop_emulating();
896	fpurestore(vcpu->guestfpu);
897
898	/* restore guest XCR0 if XSAVE is enabled in the host */
899	if (rcr4() & CR4_XSAVE)
900		load_xcr(0, vcpu->guest_xcr0);
901
902	/*
903	 * The FPU is now "dirty" with the guest's state so turn on emulation
904	 * to trap any access to the FPU by the host.
905	 */
906	fpu_start_emulating();
907}
908
909static void
910save_guest_fpustate(struct vcpu *vcpu)
911{
912
913	if ((rcr0() & CR0_TS) == 0)
914		panic("fpu emulation not enabled in host!");
915
916	/* save guest XCR0 and restore host XCR0 */
917	if (rcr4() & CR4_XSAVE) {
918		vcpu->guest_xcr0 = rxcr(0);
919		load_xcr(0, vmm_get_host_xcr0());
920	}
921
922	/* save guest FPU state */
923	fpu_stop_emulating();
924	fpusave(vcpu->guestfpu);
925	fpu_start_emulating();
926}
927
928static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
929
930static int
931vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
932    bool from_idle)
933{
934	int error;
935
936	vcpu_assert_locked(vcpu);
937
938	/*
939	 * State transitions from the vmmdev_ioctl() must always begin from
940	 * the VCPU_IDLE state. This guarantees that there is only a single
941	 * ioctl() operating on a vcpu at any point.
942	 */
943	if (from_idle) {
944		while (vcpu->state != VCPU_IDLE)
945			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
946	} else {
947		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
948		    "vcpu idle state"));
949	}
950
951	if (vcpu->state == VCPU_RUNNING) {
952		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
953		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
954	} else {
955		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
956		    "vcpu that is not running", vcpu->hostcpu));
957	}
958
959	/*
960	 * The following state transitions are allowed:
961	 * IDLE -> FROZEN -> IDLE
962	 * FROZEN -> RUNNING -> FROZEN
963	 * FROZEN -> SLEEPING -> FROZEN
964	 */
965	switch (vcpu->state) {
966	case VCPU_IDLE:
967	case VCPU_RUNNING:
968	case VCPU_SLEEPING:
969		error = (newstate != VCPU_FROZEN);
970		break;
971	case VCPU_FROZEN:
972		error = (newstate == VCPU_FROZEN);
973		break;
974	default:
975		error = 1;
976		break;
977	}
978
979	if (error)
980		return (EBUSY);
981
982	vcpu->state = newstate;
983	if (newstate == VCPU_RUNNING)
984		vcpu->hostcpu = curcpu;
985	else
986		vcpu->hostcpu = NOCPU;
987
988	if (newstate == VCPU_IDLE)
989		wakeup(&vcpu->state);
990
991	return (0);
992}
993
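/*
 * Illustrative sequence (an assumption drawn from how vm_run() below and
 * the vmmdev ioctl path use these helpers) showing the allowed cycle:
 *
 *	vcpu_set_state(vm, vcpuid, VCPU_FROZEN, true);	IDLE -> FROZEN
 *	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);	FROZEN -> RUNNING
 *	... VMRUN() ...
 *	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);	RUNNING -> FROZEN
 *	vcpu_set_state(vm, vcpuid, VCPU_IDLE, false);	FROZEN -> IDLE
 */
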
994static void
995vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
996{
997	int error;
998
999	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1000		panic("Error %d setting state to %d", error, newstate);
1001}
1002
1003static void
1004vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1005{
1006	int error;
1007
1008	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1009		panic("Error %d setting state to %d", error, newstate);
1010}
1011
1012static void
1013vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
1014{
1015
1016	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
1017
1018	/*
1019	 * Update 'rendezvous_func' and execute a write memory barrier to
1020	 * ensure that it is visible across all host cpus. This is not needed
1021	 * for correctness but it does ensure that all the vcpus will notice
1022	 * that the rendezvous is requested immediately.
1023	 */
1024	vm->rendezvous_func = func;
1025	wmb();
1026}
1027
1028#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
1029	do {								\
1030		if (vcpuid >= 0)					\
1031			VCPU_CTR0(vm, vcpuid, fmt);			\
1032		else							\
1033			VM_CTR0(vm, fmt);				\
1034	} while (0)
1035
1036static void
1037vm_handle_rendezvous(struct vm *vm, int vcpuid)
1038{
1039
1040	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1041	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
1042
1043	mtx_lock(&vm->rendezvous_mtx);
1044	while (vm->rendezvous_func != NULL) {
1045		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
1046		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
1047
1048		if (vcpuid != -1 &&
1049		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
1050		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
1051			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
1052			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
1053			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1054		}
1055		if (CPU_CMP(&vm->rendezvous_req_cpus,
1056		    &vm->rendezvous_done_cpus) == 0) {
1057			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
1058			vm_set_rendezvous_func(vm, NULL);
1059			wakeup(&vm->rendezvous_func);
1060			break;
1061		}
1062		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
1063		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1064		    "vmrndv", 0);
1065	}
1066	mtx_unlock(&vm->rendezvous_mtx);
1067}
1068
1069/*
1070 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1071 */
1072static int
1073vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1074{
1075	struct vcpu *vcpu;
1076	const char *wmesg;
1077	int t, vcpu_halted, vm_halted;
1078
1079	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1080
1081	vcpu = &vm->vcpu[vcpuid];
1082	vcpu_halted = 0;
1083	vm_halted = 0;
1084
1085	vcpu_lock(vcpu);
1086	while (1) {
1087		/*
1088		 * Do a final check for pending NMI or interrupts before
1089		 * really putting this thread to sleep. Also check for
1090		 * software events that would cause this vcpu to wake up.
1091		 *
1092		 * These interrupts/events could have happened after the
1093		 * vcpu returned from VMRUN() and before it acquired the
1094		 * vcpu lock above.
1095		 */
1096		if (vm->rendezvous_func != NULL || vm->suspend)
1097			break;
1098		if (vm_nmi_pending(vm, vcpuid))
1099			break;
1100		if (!intr_disabled) {
1101			if (vm_extint_pending(vm, vcpuid) ||
1102			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1103				break;
1104			}
1105		}
1106
1107		/*
1108		 * Some Linux guests implement "halt" by having all vcpus
1109		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1110		 * track of the vcpus that have entered this state. When all
1111		 * vcpus enter the halted state the virtual machine is halted.
1112		 */
1113		if (intr_disabled) {
1114			wmesg = "vmhalt";
1115			VCPU_CTR0(vm, vcpuid, "Halted");
1116			if (!vcpu_halted && halt_detection_enabled) {
1117				vcpu_halted = 1;
1118				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1119			}
1120			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1121				vm_halted = 1;
1122				break;
1123			}
1124		} else {
1125			wmesg = "vmidle";
1126		}
1127
1128		t = ticks;
1129		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1130		msleep_spin(vcpu, &vcpu->mtx, wmesg, 0);
1131		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1132		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1133	}
1134
1135	if (vcpu_halted)
1136		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1137
1138	vcpu_unlock(vcpu);
1139
1140	if (vm_halted)
1141		vm_suspend(vm, VM_SUSPEND_HALT);
1142
1143	return (0);
1144}
1145
1146static int
1147vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1148{
1149	int rv, ftype;
1150	struct vm_map *map;
1151	struct vcpu *vcpu;
1152	struct vm_exit *vme;
1153
1154	vcpu = &vm->vcpu[vcpuid];
1155	vme = &vcpu->exitinfo;
1156
1157	ftype = vme->u.paging.fault_type;
1158	KASSERT(ftype == VM_PROT_READ ||
1159	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1160	    ("vm_handle_paging: invalid fault_type %d", ftype));
1161
1162	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1163		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1164		    vme->u.paging.gpa, ftype);
1165		if (rv == 0)
1166			goto done;
1167	}
1168
1169	map = &vm->vmspace->vm_map;
1170	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1171
1172	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1173	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1174
1175	if (rv != KERN_SUCCESS)
1176		return (EFAULT);
1177done:
1178	/* restart execution at the faulting instruction */
1179	vme->inst_length = 0;
1180
1181	return (0);
1182}
1183
1184static int
1185vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1186{
1187	struct vie *vie;
1188	struct vcpu *vcpu;
1189	struct vm_exit *vme;
1190	uint64_t gla, gpa;
1191	struct vm_guest_paging *paging;
1192	mem_region_read_t mread;
1193	mem_region_write_t mwrite;
1194	int error;
1195
1196	vcpu = &vm->vcpu[vcpuid];
1197	vme = &vcpu->exitinfo;
1198
1199	gla = vme->u.inst_emul.gla;
1200	gpa = vme->u.inst_emul.gpa;
1201	vie = &vme->u.inst_emul.vie;
1202	paging = &vme->u.inst_emul.paging;
1203
1204	vie_init(vie);
1205
1206	/* Fetch, decode and emulate the faulting instruction */
1207	error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
1208	    vme->inst_length, vie);
1209	if (error == 1)
1210		return (0);		/* Resume guest to handle page fault */
1211	else if (error == -1)
1212		return (EFAULT);
1213	else if (error != 0)
1214		panic("%s: vmm_fetch_instruction error %d", __func__, error);
1215
1216	if (vmm_decode_instruction(vm, vcpuid, gla, paging->cpu_mode, vie) != 0)
1217		return (EFAULT);
1218
1219	/* return to userland unless this is an in-kernel emulated device */
1220	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1221		mread = lapic_mmio_read;
1222		mwrite = lapic_mmio_write;
1223	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1224		mread = vioapic_mmio_read;
1225		mwrite = vioapic_mmio_write;
1226	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1227		mread = vhpet_mmio_read;
1228		mwrite = vhpet_mmio_write;
1229	} else {
1230		*retu = true;
1231		return (0);
1232	}
1233
1234	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
1235	    retu);
1236
1237	return (error);
1238}
1239
1240static int
1241vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
1242{
1243	int i, done;
1244	struct vcpu *vcpu;
1245
1246	done = 0;
1247	vcpu = &vm->vcpu[vcpuid];
1248
1249	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1250
1251	/*
1252	 * Wait until all 'active_cpus' have suspended themselves.
1253	 *
1254	 * Since a VM may be suspended at any time, including when one or
1255	 * more vcpus are doing a rendezvous, we need to call the rendezvous
1256	 * handler while we are waiting, to prevent a deadlock.
1257	 */
1258	vcpu_lock(vcpu);
1259	while (1) {
1260		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1261			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1262			break;
1263		}
1264
1265		if (vm->rendezvous_func == NULL) {
1266			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1267			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1268			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1269			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1270		} else {
1271			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1272			vcpu_unlock(vcpu);
1273			vm_handle_rendezvous(vm, vcpuid);
1274			vcpu_lock(vcpu);
1275		}
1276	}
1277	vcpu_unlock(vcpu);
1278
1279	/*
1280	 * Wakeup the other sleeping vcpus and return to userspace.
1281	 */
1282	for (i = 0; i < VM_MAXCPU; i++) {
1283		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1284			vcpu_notify_event(vm, i, false);
1285		}
1286	}
1287
1288	*retu = true;
1289	return (0);
1290}
1291
1292int
1293vm_suspend(struct vm *vm, enum vm_suspend_how how)
1294{
1295	int i;
1296
1297	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1298		return (EINVAL);
1299
1300	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1301		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1302		    vm->suspend, how);
1303		return (EALREADY);
1304	}
1305
1306	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1307
1308	/*
1309	 * Notify all active vcpus that they are now suspended.
1310	 */
1311	for (i = 0; i < VM_MAXCPU; i++) {
1312		if (CPU_ISSET(i, &vm->active_cpus))
1313			vcpu_notify_event(vm, i, false);
1314	}
1315
1316	return (0);
1317}
1318
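/*
 * Illustrative usage (assumption: VM_SUSPEND_RESET is one of the other
 * vm_suspend_how values defined in vmm.h): a guest-initiated reset would
 * be funneled through
 *
 *	(void) vm_suspend(vm, VM_SUSPEND_RESET);
 *
 * after which every active vcpu eventually exits to userland with
 * VM_EXITCODE_SUSPENDED.
 */
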
1319void
1320vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1321{
1322	struct vm_exit *vmexit;
1323
1324	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1325	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1326
1327	vmexit = vm_exitinfo(vm, vcpuid);
1328	vmexit->rip = rip;
1329	vmexit->inst_length = 0;
1330	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1331	vmexit->u.suspended.how = vm->suspend;
1332}
1333
1334int
1335vm_run(struct vm *vm, struct vm_run *vmrun)
1336{
1337	int error, vcpuid;
1338	struct vcpu *vcpu;
1339	struct pcb *pcb;
1340	uint64_t tscval, rip;
1341	struct vm_exit *vme;
1342	bool retu, intr_disabled;
1343	pmap_t pmap;
1344	void *rptr, *sptr;
1345
1346	vcpuid = vmrun->cpuid;
1347
1348	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1349		return (EINVAL);
1350
1351	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1352		return (EINVAL);
1353
1354	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1355		return (EINVAL);
1356
1357	rptr = &vm->rendezvous_func;
1358	sptr = &vm->suspend;
1359	pmap = vmspace_pmap(vm->vmspace);
1360	vcpu = &vm->vcpu[vcpuid];
1361	vme = &vcpu->exitinfo;
1362	rip = vmrun->rip;
1363restart:
1364	critical_enter();
1365
1366	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1367	    ("vm_run: absurd pm_active"));
1368
1369	tscval = rdtsc();
1370
1371	pcb = PCPU_GET(curpcb);
1372	set_pcb_flags(pcb, PCB_FULL_IRET);
1373
1374	restore_guest_msrs(vm, vcpuid);
1375	restore_guest_fpustate(vcpu);
1376
1377	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1378	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
1379	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1380
1381	save_guest_fpustate(vcpu);
1382	restore_host_msrs(vm, vcpuid);
1383
1384	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1385
1386	critical_exit();
1387
1388	if (error == 0) {
1389		retu = false;
1390		switch (vme->exitcode) {
1391		case VM_EXITCODE_SUSPENDED:
1392			error = vm_handle_suspend(vm, vcpuid, &retu);
1393			break;
1394		case VM_EXITCODE_IOAPIC_EOI:
1395			vioapic_process_eoi(vm, vcpuid,
1396			    vme->u.ioapic_eoi.vector);
1397			break;
1398		case VM_EXITCODE_RENDEZVOUS:
1399			vm_handle_rendezvous(vm, vcpuid);
1400			error = 0;
1401			break;
1402		case VM_EXITCODE_HLT:
1403			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1404			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1405			break;
1406		case VM_EXITCODE_PAGING:
1407			error = vm_handle_paging(vm, vcpuid, &retu);
1408			break;
1409		case VM_EXITCODE_INST_EMUL:
1410			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1411			break;
1412		case VM_EXITCODE_INOUT:
1413		case VM_EXITCODE_INOUT_STR:
1414			error = vm_handle_inout(vm, vcpuid, vme, &retu);
1415			break;
1416		default:
1417			retu = true;	/* handled in userland */
1418			break;
1419		}
1420	}
1421
1422	if (error == 0 && retu == false) {
1423		rip = vme->rip + vme->inst_length;
1424		goto restart;
1425	}
1426
1427	/* copy the exit information */
1428	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1429	return (error);
1430}
1431
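/*
 * Illustrative sketch of the caller's side of vm_run() (an assumption of
 * how the ioctl/userland loop looks; the per-exit emulation and choice of
 * the next %rip are simplified).
 */
#if 0
static void
vm_run_loop_sketch(struct vm *vm, int vcpuid, uint64_t entry_rip)
{
	struct vm_run vmrun;

	vmrun.cpuid = vcpuid;
	vmrun.rip = entry_rip;
	while (vm_run(vm, &vmrun) == 0) {
		if (vmrun.vm_exit.exitcode == VM_EXITCODE_SUSPENDED)
			break;
		/* emulate the exit described by vmrun.vm_exit ... */
		vmrun.rip = vmrun.vm_exit.rip + vmrun.vm_exit.inst_length;
	}
}
#endif
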
1432int
1433vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
1434{
1435	struct vcpu *vcpu;
1436
1437	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1438		return (EINVAL);
1439
1440	if (exception->vector < 0 || exception->vector >= 32)
1441		return (EINVAL);
1442
1443	vcpu = &vm->vcpu[vcpuid];
1444
1445	if (vcpu->exception_pending) {
1446		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
1447		    "pending exception %d", exception->vector,
1448		    vcpu->exception.vector);
1449		return (EBUSY);
1450	}
1451
1452	vcpu->exception_pending = 1;
1453	vcpu->exception = *exception;
1454	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
1455	return (0);
1456}
1457
1458int
1459vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
1460{
1461	struct vcpu *vcpu;
1462	int pending;
1463
1464	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
1465
1466	vcpu = &vm->vcpu[vcpuid];
1467	pending = vcpu->exception_pending;
1468	if (pending) {
1469		vcpu->exception_pending = 0;
1470		*exception = vcpu->exception;
1471		VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
1472		    exception->vector);
1473	}
1474	return (pending);
1475}
1476
1477static void
1478vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
1479{
1480	struct vm_exit *vmexit;
1481	int error;
1482
1483	error = vm_inject_exception(vm, vcpuid, exception);
1484	KASSERT(error == 0, ("vm_inject_exception error %d", error));
1485
1486	/*
1487	 * A fault-like exception allows the instruction to be restarted
1488	 * after the exception handler returns.
1489	 *
1490	 * By setting the inst_length to 0 we ensure that the instruction
1491	 * pointer remains at the faulting instruction.
1492	 */
1493	vmexit = vm_exitinfo(vm, vcpuid);
1494	vmexit->inst_length = 0;
1495}
1496
1497void
1498vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2)
1499{
1500	struct vm_exception pf = {
1501		.vector = IDT_PF,
1502		.error_code_valid = 1,
1503		.error_code = error_code
1504	};
1505	int error;
1506
1507	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
1508	    error_code, cr2);
1509
1510	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
1511	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
1512
1513	vm_inject_fault(vm, vcpuid, &pf);
1514}
1515
1516void
1517vm_inject_gp(struct vm *vm, int vcpuid)
1518{
1519	struct vm_exception gpf = {
1520		.vector = IDT_GP,
1521		.error_code_valid = 1,
1522		.error_code = 0
1523	};
1524
1525	vm_inject_fault(vm, vcpuid, &gpf);
1526}
1527
1528void
1529vm_inject_ud(struct vm *vm, int vcpuid)
1530{
1531	struct vm_exception udf = {
1532		.vector = IDT_UD,
1533		.error_code_valid = 0
1534	};
1535
1536	vm_inject_fault(vm, vcpuid, &udf);
1537}
1538
1539static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1540
1541int
1542vm_inject_nmi(struct vm *vm, int vcpuid)
1543{
1544	struct vcpu *vcpu;
1545
1546	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1547		return (EINVAL);
1548
1549	vcpu = &vm->vcpu[vcpuid];
1550
1551	vcpu->nmi_pending = 1;
1552	vcpu_notify_event(vm, vcpuid, false);
1553	return (0);
1554}
1555
1556int
1557vm_nmi_pending(struct vm *vm, int vcpuid)
1558{
1559	struct vcpu *vcpu;
1560
1561	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1562		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1563
1564	vcpu = &vm->vcpu[vcpuid];
1565
1566	return (vcpu->nmi_pending);
1567}
1568
1569void
1570vm_nmi_clear(struct vm *vm, int vcpuid)
1571{
1572	struct vcpu *vcpu;
1573
1574	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1575		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
1576
1577	vcpu = &vm->vcpu[vcpuid];
1578
1579	if (vcpu->nmi_pending == 0)
1580		panic("vm_nmi_clear: inconsistent nmi_pending state");
1581
1582	vcpu->nmi_pending = 0;
1583	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1584}
1585
1586static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
1587
1588int
1589vm_inject_extint(struct vm *vm, int vcpuid)
1590{
1591	struct vcpu *vcpu;
1592
1593	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1594		return (EINVAL);
1595
1596	vcpu = &vm->vcpu[vcpuid];
1597
1598	vcpu->extint_pending = 1;
1599	vcpu_notify_event(vm, vcpuid, false);
1600	return (0);
1601}
1602
1603int
1604vm_extint_pending(struct vm *vm, int vcpuid)
1605{
1606	struct vcpu *vcpu;
1607
1608	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1609		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1610
1611	vcpu = &vm->vcpu[vcpuid];
1612
1613	return (vcpu->extint_pending);
1614}
1615
1616void
1617vm_extint_clear(struct vm *vm, int vcpuid)
1618{
1619	struct vcpu *vcpu;
1620
1621	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1622		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
1623
1624	vcpu = &vm->vcpu[vcpuid];
1625
1626	if (vcpu->extint_pending == 0)
1627		panic("vm_extint_clear: inconsistent extint_pending state");
1628
1629	vcpu->extint_pending = 0;
1630	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
1631}
1632
1633int
1634vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1635{
1636	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1637		return (EINVAL);
1638
1639	if (type < 0 || type >= VM_CAP_MAX)
1640		return (EINVAL);
1641
1642	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1643}
1644
1645int
1646vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1647{
1648	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1649		return (EINVAL);
1650
1651	if (type < 0 || type >= VM_CAP_MAX)
1652		return (EINVAL);
1653
1654	return (VMSETCAP(vm->cookie, vcpu, type, val));
1655}
1656
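/*
 * Illustrative usage (assumption: VM_CAP_HALT_EXIT is one of the
 * vm_cap_type values defined in vmm.h): make HLT in the guest trap to
 * the hypervisor for vcpu 0.
 *
 *	error = vm_set_capability(vm, 0, VM_CAP_HALT_EXIT, 1);
 */
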
1657uint64_t *
1658vm_guest_msrs(struct vm *vm, int cpu)
1659{
1660	return (vm->vcpu[cpu].guest_msrs);
1661}
1662
1663struct vlapic *
1664vm_lapic(struct vm *vm, int cpu)
1665{
1666	return (vm->vcpu[cpu].vlapic);
1667}
1668
1669struct vioapic *
1670vm_ioapic(struct vm *vm)
1671{
1672
1673	return (vm->vioapic);
1674}
1675
1676struct vhpet *
1677vm_hpet(struct vm *vm)
1678{
1679
1680	return (vm->vhpet);
1681}
1682
1683boolean_t
1684vmm_is_pptdev(int bus, int slot, int func)
1685{
1686	int found, i, n;
1687	int b, s, f;
1688	char *val, *cp, *cp2;
1689
1690	/*
1691	 * XXX
1692	 * The length of an environment variable is limited to 128 bytes, which
1693	 * puts an upper limit on the number of passthru devices that may be
1694	 * specified using a single environment variable.
1695	 *
1696	 * Work around this by scanning multiple environment variable
1697	 * names instead of a single one - yuck!
1698	 */
1699	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1700
1701	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1702	found = 0;
1703	for (i = 0; names[i] != NULL && !found; i++) {
1704		cp = val = getenv(names[i]);
1705		while (cp != NULL && *cp != '\0') {
1706			if ((cp2 = strchr(cp, ' ')) != NULL)
1707				*cp2 = '\0';
1708
1709			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1710			if (n == 3 && bus == b && slot == s && func == f) {
1711				found = 1;
1712				break;
1713			}
1714
1715			if (cp2 != NULL)
1716				*cp2++ = ' ';
1717
1718			cp = cp2;
1719		}
1720		freeenv(val);
1721	}
1722	return (found);
1723}
1724
1725void *
1726vm_iommu_domain(struct vm *vm)
1727{
1728
1729	return (vm->iommu);
1730}
1731
1732int
1733vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1734    bool from_idle)
1735{
1736	int error;
1737	struct vcpu *vcpu;
1738
1739	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1740		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
1741
1742	vcpu = &vm->vcpu[vcpuid];
1743
1744	vcpu_lock(vcpu);
1745	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1746	vcpu_unlock(vcpu);
1747
1748	return (error);
1749}
1750
1751enum vcpu_state
1752vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1753{
1754	struct vcpu *vcpu;
1755	enum vcpu_state state;
1756
1757	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1758		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
1759
1760	vcpu = &vm->vcpu[vcpuid];
1761
1762	vcpu_lock(vcpu);
1763	state = vcpu->state;
1764	if (hostcpu != NULL)
1765		*hostcpu = vcpu->hostcpu;
1766	vcpu_unlock(vcpu);
1767
1768	return (state);
1769}
1770
1771int
1772vm_activate_cpu(struct vm *vm, int vcpuid)
1773{
1774
1775	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1776		return (EINVAL);
1777
1778	if (CPU_ISSET(vcpuid, &vm->active_cpus))
1779		return (EBUSY);
1780
1781	VCPU_CTR0(vm, vcpuid, "activated");
1782	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1783	return (0);
1784}
1785
1786cpuset_t
1787vm_active_cpus(struct vm *vm)
1788{
1789
1790	return (vm->active_cpus);
1791}
1792
1793cpuset_t
1794vm_suspended_cpus(struct vm *vm)
1795{
1796
1797	return (vm->suspended_cpus);
1798}
1799
1800void *
1801vcpu_stats(struct vm *vm, int vcpuid)
1802{
1803
1804	return (vm->vcpu[vcpuid].stats);
1805}
1806
1807int
1808vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1809{
1810	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1811		return (EINVAL);
1812
1813	*state = vm->vcpu[vcpuid].x2apic_state;
1814
1815	return (0);
1816}
1817
1818int
1819vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1820{
1821	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1822		return (EINVAL);
1823
1824	if (state >= X2APIC_STATE_LAST)
1825		return (EINVAL);
1826
1827	vm->vcpu[vcpuid].x2apic_state = state;
1828
1829	vlapic_set_x2apic_state(vm, vcpuid, state);
1830
1831	return (0);
1832}
1833
1834/*
1835 * This function is called to ensure that a vcpu "sees" a pending event
1836 * as soon as possible:
1837 * - If the vcpu thread is sleeping then it is woken up.
1838 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1839 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1840 */
1841void
1842vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1843{
1844	int hostcpu;
1845	struct vcpu *vcpu;
1846
1847	vcpu = &vm->vcpu[vcpuid];
1848
1849	vcpu_lock(vcpu);
1850	hostcpu = vcpu->hostcpu;
1851	if (vcpu->state == VCPU_RUNNING) {
1852		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1853		if (hostcpu != curcpu) {
1854			if (lapic_intr) {
1855				vlapic_post_intr(vcpu->vlapic, hostcpu,
1856				    vmm_ipinum);
1857			} else {
1858				ipi_cpu(hostcpu, vmm_ipinum);
1859			}
1860		} else {
1861			/*
1862			 * If the 'vcpu' is running on 'curcpu' then it must
1863			 * be sending a notification to itself (e.g. SELF_IPI).
1864			 * The pending event will be picked up when the vcpu
1865			 * transitions back to guest context.
1866			 */
1867		}
1868	} else {
1869		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1870		    "with hostcpu %d", vcpu->state, hostcpu));
1871		if (vcpu->state == VCPU_SLEEPING)
1872			wakeup_one(vcpu);
1873	}
1874	vcpu_unlock(vcpu);
1875}
1876
1877struct vmspace *
1878vm_get_vmspace(struct vm *vm)
1879{
1880
1881	return (vm->vmspace);
1882}
1883
1884int
1885vm_apicid2vcpuid(struct vm *vm, int apicid)
1886{
1887	/*
1888	 * XXX apic id is assumed to be numerically identical to vcpu id
1889	 */
1890	return (apicid);
1891}
1892
1893void
1894vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1895    vm_rendezvous_func_t func, void *arg)
1896{
1897	int i;
1898
1899	/*
1900	 * Enforce that this function is called without any locks
1901	 */
1902	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1903	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1904	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1905
1906restart:
1907	mtx_lock(&vm->rendezvous_mtx);
1908	if (vm->rendezvous_func != NULL) {
1909		/*
1910		 * If a rendezvous is already in progress then we need to
1911		 * call the rendezvous handler in case this 'vcpuid' is one
1912		 * of the targets of the rendezvous.
1913		 */
1914		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1915		mtx_unlock(&vm->rendezvous_mtx);
1916		vm_handle_rendezvous(vm, vcpuid);
1917		goto restart;
1918	}
1919	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1920	    "rendezvous is still in progress"));
1921
1922	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1923	vm->rendezvous_req_cpus = dest;
1924	CPU_ZERO(&vm->rendezvous_done_cpus);
1925	vm->rendezvous_arg = arg;
1926	vm_set_rendezvous_func(vm, func);
1927	mtx_unlock(&vm->rendezvous_mtx);
1928
1929	/*
1930	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1931	 * vcpus so they handle the rendezvous as soon as possible.
1932	 */
1933	for (i = 0; i < VM_MAXCPU; i++) {
1934		if (CPU_ISSET(i, &dest))
1935			vcpu_notify_event(vm, i, false);
1936	}
1937
1938	vm_handle_rendezvous(vm, vcpuid);
1939}
1940
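/*
 * Illustrative sketch: initiating a rendezvous on every active vcpu with
 * a hypothetical callback.  vm_smp_rendezvous() does not return to the
 * initiator until every vcpu in 'dest' has run the callback.
 */
#if 0
static void
example_rendezvous_cb(struct vm *vm, int vcpuid, void *arg)
{
	/* per-vcpu work, e.g. invalidating per-vcpu cached state */
}

static void
example_rendezvous(struct vm *vm, int vcpuid)
{
	cpuset_t dest;

	dest = vm_active_cpus(vm);
	vm_smp_rendezvous(vm, vcpuid, dest, example_rendezvous_cb, NULL);
}
#endif
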
1941struct vatpic *
1942vm_atpic(struct vm *vm)
1943{
1944	return (vm->vatpic);
1945}
1946
1947struct vatpit *
1948vm_atpit(struct vm *vm)
1949{
1950	return (vm->vatpit);
1951}
1952
1953enum vm_reg_name
1954vm_segment_name(int seg)
1955{
1956	static enum vm_reg_name seg_names[] = {
1957		VM_REG_GUEST_ES,
1958		VM_REG_GUEST_CS,
1959		VM_REG_GUEST_SS,
1960		VM_REG_GUEST_DS,
1961		VM_REG_GUEST_FS,
1962		VM_REG_GUEST_GS
1963	};
1964
1965	KASSERT(seg >= 0 && seg < nitems(seg_names),
1966	    ("%s: invalid segment encoding %d", __func__, seg));
1967	return (seg_names[seg]);
1968}
1969