vmm.c revision 268891
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 268891 2014-07-19 22:06:46Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 268891 2014-07-19 22:06:46Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65
66#include "vmm_ktr.h"
67#include "vmm_host.h"
68#include "vmm_mem.h"
69#include "vmm_util.h"
70#include "vatpic.h"
71#include "vatpit.h"
72#include "vhpet.h"
73#include "vioapic.h"
74#include "vlapic.h"
75#include "vmm_msr.h"
76#include "vmm_ipi.h"
77#include "vmm_stat.h"
78#include "vmm_lapic.h"
79
80#include "io/ppt.h"
81#include "io/iommu.h"
82
83struct vlapic;
84
85struct vcpu {
86	int		flags;
87	enum vcpu_state	state;
88	struct mtx	mtx;
89	int		hostcpu;	/* host cpuid this vcpu last ran on */
90	uint64_t	guest_msrs[VMM_MSR_NUM];
91	struct vlapic	*vlapic;
92	int		 vcpuid;
93	struct savefpu	*guestfpu;	/* guest fpu state */
94	uint64_t	guest_xcr0;
95	void		*stats;
96	struct vm_exit	exitinfo;
97	enum x2apic_state x2apic_state;
98	int		nmi_pending;
99	int		extint_pending;
100	struct vm_exception exception;
101	int		exception_pending;
102};
103
104#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
105#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
106#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
107#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
108
109struct mem_seg {
110	vm_paddr_t	gpa;
111	size_t		len;
112	boolean_t	wired;
113	vm_object_t	object;
114};
115#define	VM_MAX_MEMORY_SEGMENTS	2
116
117struct vm {
118	void		*cookie;	/* processor-specific data */
119	void		*iommu;		/* iommu-specific data */
120	struct vhpet	*vhpet;		/* virtual HPET */
121	struct vioapic	*vioapic;	/* virtual ioapic */
122	struct vatpic	*vatpic;	/* virtual atpic */
123	struct vatpit	*vatpit;	/* virtual atpit */
124	struct vmspace	*vmspace;	/* guest's address space */
125	struct vcpu	vcpu[VM_MAXCPU];
126	int		num_mem_segs;
127	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
128	char		name[VM_MAX_NAMELEN];
129
130	/*
131	 * Set of active vcpus.
132	 * An active vcpu is one that has been started implicitly (BSP) or
133	 * explicitly (AP) by sending it a startup ipi.
134	 */
135	volatile cpuset_t active_cpus;
136
137	struct mtx	rendezvous_mtx;
138	cpuset_t	rendezvous_req_cpus;
139	cpuset_t	rendezvous_done_cpus;
140	void		*rendezvous_arg;
141	vm_rendezvous_func_t rendezvous_func;
142};
143
144static int vmm_initialized;
145
146static struct vmm_ops *ops;
147#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
148#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
149#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
150
151#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
152#define	VMRUN(vmi, vcpu, rip, pmap, rptr) \
153	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
154#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
155#define	VMSPACE_ALLOC(min, max) \
156	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
157#define	VMSPACE_FREE(vmspace) \
158	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
159#define	VMGETREG(vmi, vcpu, num, retval)		\
160	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
161#define	VMSETREG(vmi, vcpu, num, val)		\
162	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
163#define	VMGETDESC(vmi, vcpu, num, desc)		\
164	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
165#define	VMSETDESC(vmi, vcpu, num, desc)		\
166	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
167#define	VMGETCAP(vmi, vcpu, num, retval)	\
168	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
169#define	VMSETCAP(vmi, vcpu, num, val)		\
170	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
171#define	VLAPIC_INIT(vmi, vcpu)			\
172	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
173#define	VLAPIC_CLEANUP(vmi, vlapic)		\
174	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
175
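/*
 * FPU "emulation" here means setting CR0.TS so that the next FPU access by
 * the host traps with #NM, preventing the host from silently clobbering the
 * guest's FPU state while it is loaded.  fpu_stop_emulating() clears CR0.TS
 * (clts) before the guest state is saved or restored.
 */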
176#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
177#define	fpu_stop_emulating()	clts()
178
179static MALLOC_DEFINE(M_VM, "vm", "vm");
180CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
181
182/* statistics */
183static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
184
185SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
186
187static int vmm_ipinum;
188SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
189    "IPI vector used for vcpu notifications");
190
191static void vm_deactivate_cpu(struct vm *vm, int vcpuid);
192
193static void
194vcpu_cleanup(struct vm *vm, int i)
195{
196	struct vcpu *vcpu = &vm->vcpu[i];
197
198	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
199	vmm_stat_free(vcpu->stats);
200	fpu_save_area_free(vcpu->guestfpu);
201}
202
203static void
204vcpu_init(struct vm *vm, uint32_t vcpu_id)
205{
206	struct vcpu *vcpu;
207
208	vcpu = &vm->vcpu[vcpu_id];
209
210	vcpu_lock_init(vcpu);
211	vcpu->hostcpu = NOCPU;
212	vcpu->vcpuid = vcpu_id;
213	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
214	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
215	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
216	vcpu->guestfpu = fpu_save_area_alloc();
217	fpu_save_area_reset(vcpu->guestfpu);
218	vcpu->stats = vmm_stat_alloc();
219}
220
221struct vm_exit *
222vm_exitinfo(struct vm *vm, int cpuid)
223{
224	struct vcpu *vcpu;
225
226	if (cpuid < 0 || cpuid >= VM_MAXCPU)
227		panic("vm_exitinfo: invalid cpuid %d", cpuid);
228
229	vcpu = &vm->vcpu[cpuid];
230
231	return (&vcpu->exitinfo);
232}
233
234static void
235vmm_resume(void)
236{
237	VMM_RESUME();
238}
239
240static int
241vmm_init(void)
242{
243	int error;
244
245	vmm_host_state_init();
246
247	vmm_ipinum = vmm_ipi_alloc();
248	if (vmm_ipinum == 0)
249		vmm_ipinum = IPI_AST;
250
251	error = vmm_mem_init();
252	if (error)
253		return (error);
254
255	if (vmm_is_intel())
256		ops = &vmm_ops_intel;
257	else if (vmm_is_amd())
258		ops = &vmm_ops_amd;
259	else
260		return (ENXIO);
261
262	vmm_msr_init();
263	vmm_resume_p = vmm_resume;
264
265	return (VMM_INIT(vmm_ipinum));
266}
267
268static int
269vmm_handler(module_t mod, int what, void *arg)
270{
271	int error;
272
273	switch (what) {
274	case MOD_LOAD:
275		vmmdev_init();
276		if (ppt_avail_devices() > 0)
277			iommu_init();
278		error = vmm_init();
279		if (error == 0)
280			vmm_initialized = 1;
281		break;
282	case MOD_UNLOAD:
283		error = vmmdev_cleanup();
284		if (error == 0) {
285			vmm_resume_p = NULL;
286			iommu_cleanup();
287			if (vmm_ipinum != IPI_AST)
288				vmm_ipi_free(vmm_ipinum);
289			error = VMM_CLEANUP();
290			/*
291			 * Something bad happened - prevent new
292			 * VMs from being created
293			 */
294			if (error)
295				vmm_initialized = 0;
296		}
297		break;
298	default:
299		error = 0;
300		break;
301	}
302	return (error);
303}
304
305static moduledata_t vmm_kmod = {
306	"vmm",
307	vmm_handler,
308	NULL
309};
310
311/*
312 * vmm initialization has the following dependencies:
313 *
314 * - iommu initialization must happen after the pci passthru driver has had
315 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
316 *
317 * - VT-x initialization requires smp_rendezvous() and therefore must happen
318 *   after SMP is fully functional (after SI_SUB_SMP).
319 */
320DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
321MODULE_VERSION(vmm, 1);
322
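/*
 * Create a new virtual machine named 'name'.  The guest address space is
 * allocated up front but no guest memory segments exist until vm_malloc()
 * is called.  All vcpus are initialized and the BSP (vcpu 0) is marked
 * active; the APs become active only when they are sent a startup IPI.
 */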
323int
324vm_create(const char *name, struct vm **retvm)
325{
326	int i;
327	struct vm *vm;
328	struct vmspace *vmspace;
329
330	const int BSP = 0;
331
332	/*
333	 * If vmm.ko could not be successfully initialized then don't attempt
334	 * to create the virtual machine.
335	 */
336	if (!vmm_initialized)
337		return (ENXIO);
338
339	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
340		return (EINVAL);
341
342	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
343	if (vmspace == NULL)
344		return (ENOMEM);
345
346	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
347	strcpy(vm->name, name);
348	vm->vmspace = vmspace;
349	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
350	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
351	vm->vioapic = vioapic_init(vm);
352	vm->vhpet = vhpet_init(vm);
353	vm->vatpic = vatpic_init(vm);
354	vm->vatpit = vatpit_init(vm);
355
356	for (i = 0; i < VM_MAXCPU; i++) {
357		vcpu_init(vm, i);
358		guest_msrs_init(vm, i);
359	}
360
361	vm_activate_cpu(vm, BSP);
362
363	*retvm = vm;
364	return (0);
365}
366
367static void
368vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
369{
370
371	if (seg->object != NULL)
372		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
373
374	bzero(seg, sizeof(*seg));
375}
376
377void
378vm_destroy(struct vm *vm)
379{
380	int i;
381
382	ppt_unassign_all(vm);
383
384	if (vm->iommu != NULL)
385		iommu_destroy_domain(vm->iommu);
386
387	vatpit_cleanup(vm->vatpit);
388	vhpet_cleanup(vm->vhpet);
389	vatpic_cleanup(vm->vatpic);
390	vioapic_cleanup(vm->vioapic);
391
392	for (i = 0; i < vm->num_mem_segs; i++)
393		vm_free_mem_seg(vm, &vm->mem_segs[i]);
394
395	vm->num_mem_segs = 0;
396
397	for (i = 0; i < VM_MAXCPU; i++)
398		vcpu_cleanup(vm, i);
399
400	VMSPACE_FREE(vm->vmspace);
401
402	VMCLEANUP(vm->cookie);
403
404	free(vm, M_VM);
405}
406
407const char *
408vm_name(struct vm *vm)
409{
410	return (vm->name);
411}
412
413int
414vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
415{
416	vm_object_t obj;
417
418	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
419		return (ENOMEM);
420	else
421		return (0);
422}
423
424int
425vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
426{
427
428	vmm_mmio_free(vm->vmspace, gpa, len);
429	return (0);
430}
431
432boolean_t
433vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
434{
435	int i;
436	vm_paddr_t gpabase, gpalimit;
437
438	for (i = 0; i < vm->num_mem_segs; i++) {
439		gpabase = vm->mem_segs[i].gpa;
440		gpalimit = gpabase + vm->mem_segs[i].len;
441		if (gpa >= gpabase && gpa < gpalimit)
442			return (TRUE);		/* 'gpa' is regular memory */
443	}
444
445	if (ppt_is_mmio(vm, gpa))
446		return (TRUE);			/* 'gpa' is pci passthru mmio */
447
448	return (FALSE);
449}
450
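/*
 * Allocate guest memory for the range [gpa, gpa + len).  'gpa' and 'len'
 * must be page aligned and the range must not partially overlap memory that
 * is already allocated.  A request that is entirely allocated is a no-op.
 * A VM is limited to VM_MAX_MEMORY_SEGMENTS memory segments.
 */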
451int
452vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
453{
454	int available, allocated;
455	struct mem_seg *seg;
456	vm_object_t object;
457	vm_paddr_t g;
458
459	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
460		return (EINVAL);
461
462	available = allocated = 0;
463	g = gpa;
464	while (g < gpa + len) {
465		if (vm_mem_allocated(vm, g))
466			allocated++;
467		else
468			available++;
469
470		g += PAGE_SIZE;
471	}
472
473	/*
474	 * If there are some allocated and some available pages in the address
475	 * range then it is an error.
476	 */
477	if (allocated && available)
478		return (EINVAL);
479
480	/*
481	 * If the entire address range being requested has already been
482	 * allocated then there isn't anything more to do.
483	 */
484	if (allocated && available == 0)
485		return (0);
486
487	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
488		return (E2BIG);
489
490	seg = &vm->mem_segs[vm->num_mem_segs];
491
492	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
493		return (ENOMEM);
494
495	seg->gpa = gpa;
496	seg->len = len;
497	seg->object = object;
498	seg->wired = FALSE;
499
500	vm->num_mem_segs++;
501
502	return (0);
503}
504
505static void
506vm_gpa_unwire(struct vm *vm)
507{
508	int i, rv;
509	struct mem_seg *seg;
510
511	for (i = 0; i < vm->num_mem_segs; i++) {
512		seg = &vm->mem_segs[i];
513		if (!seg->wired)
514			continue;
515
516		rv = vm_map_unwire(&vm->vmspace->vm_map,
517				   seg->gpa, seg->gpa + seg->len,
518				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
519		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
520		    "%#lx/%ld could not be unwired: %d",
521		    vm_name(vm), seg->gpa, seg->len, rv));
522
523		seg->wired = FALSE;
524	}
525}
526
527static int
528vm_gpa_wire(struct vm *vm)
529{
530	int i, rv;
531	struct mem_seg *seg;
532
533	for (i = 0; i < vm->num_mem_segs; i++) {
534		seg = &vm->mem_segs[i];
535		if (seg->wired)
536			continue;
537
538		/* XXX rlimits? */
539		rv = vm_map_wire(&vm->vmspace->vm_map,
540				 seg->gpa, seg->gpa + seg->len,
541				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
542		if (rv != KERN_SUCCESS)
543			break;
544
545		seg->wired = TRUE;
546	}
547
548	if (i < vm->num_mem_segs) {
549		/*
550		 * Undo the wiring before returning an error.
551		 */
552		vm_gpa_unwire(vm);
553		return (EAGAIN);
554	}
555
556	return (0);
557}
558
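/*
 * Map or unmap all wired guest memory segments in the VM's iommu domain.
 * When mapping for passthru, each page is also removed from the host iommu
 * domain; when unmapping, the host identity mapping is restored.
 */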
559static void
560vm_iommu_modify(struct vm *vm, boolean_t map)
561{
562	int i, sz;
563	vm_paddr_t gpa, hpa;
564	struct mem_seg *seg;
565	void *vp, *cookie, *host_domain;
566
567	sz = PAGE_SIZE;
568	host_domain = iommu_host_domain();
569
570	for (i = 0; i < vm->num_mem_segs; i++) {
571		seg = &vm->mem_segs[i];
572		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
573		    vm_name(vm), seg->gpa, seg->len));
574
575		gpa = seg->gpa;
576		while (gpa < seg->gpa + seg->len) {
577			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
578					 &cookie);
579			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
580			    vm_name(vm), gpa));
581
582			vm_gpa_release(cookie);
583
584			hpa = DMAP_TO_PHYS((uintptr_t)vp);
585			if (map) {
586				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
587				iommu_remove_mapping(host_domain, hpa, sz);
588			} else {
589				iommu_remove_mapping(vm->iommu, gpa, sz);
590				iommu_create_mapping(host_domain, hpa, hpa, sz);
591			}
592
593			gpa += PAGE_SIZE;
594		}
595	}
596
597	/*
598	 * Invalidate the cached translations associated with the domain
599	 * from which pages were removed.
600	 */
601	if (map)
602		iommu_invalidate_tlb(host_domain);
603	else
604		iommu_invalidate_tlb(vm->iommu);
605}
606
607#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
608#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
609
610int
611vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
612{
613	int error;
614
615	error = ppt_unassign_device(vm, bus, slot, func);
616	if (error)
617		return (error);
618
619	if (ppt_assigned_devices(vm) == 0) {
620		vm_iommu_unmap(vm);
621		vm_gpa_unwire(vm);
622	}
623	return (0);
624}
625
626int
627vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
628{
629	int error;
630	vm_paddr_t maxaddr;
631
632	/*
633	 * Virtual machines with pci passthru devices get special treatment:
634	 * - the guest physical memory is wired
635	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
636	 *
637	 * We need to do this before the first pci passthru device is attached.
638	 */
639	if (ppt_assigned_devices(vm) == 0) {
640		KASSERT(vm->iommu == NULL,
641		    ("vm_assign_pptdev: iommu must be NULL"));
642		maxaddr = vmm_mem_maxaddr();
643		vm->iommu = iommu_create_domain(maxaddr);
644
645		error = vm_gpa_wire(vm);
646		if (error)
647			return (error);
648
649		vm_iommu_map(vm);
650	}
651
652	error = ppt_assign_device(vm, bus, slot, func);
653	return (error);
654}
655
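/*
 * Hold the page backing guest physical address 'gpa' and return a pointer
 * to it in the host's direct map.  The requested range may not cross a page
 * boundary.  The caller must pass the returned cookie to vm_gpa_release()
 * when done with the mapping.
 */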
656void *
657vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
658	    void **cookie)
659{
660	int count, pageoff;
661	vm_page_t m;
662
663	pageoff = gpa & PAGE_MASK;
664	if (len > PAGE_SIZE - pageoff)
665		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
666
667	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
668	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
669
670	if (count == 1) {
671		*cookie = m;
672		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
673	} else {
674		*cookie = NULL;
675		return (NULL);
676	}
677}
678
679void
680vm_gpa_release(void *cookie)
681{
682	vm_page_t m = cookie;
683
684	vm_page_lock(m);
685	vm_page_unhold(m);
686	vm_page_unlock(m);
687}
688
689int
690vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
691		  struct vm_memory_segment *seg)
692{
693	int i;
694
695	for (i = 0; i < vm->num_mem_segs; i++) {
696		if (gpabase == vm->mem_segs[i].gpa) {
697			seg->gpa = vm->mem_segs[i].gpa;
698			seg->len = vm->mem_segs[i].len;
699			seg->wired = vm->mem_segs[i].wired;
700			return (0);
701		}
702	}
703	return (-1);
704}
705
706int
707vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
708	      vm_offset_t *offset, struct vm_object **object)
709{
710	int i;
711	size_t seg_len;
712	vm_paddr_t seg_gpa;
713	vm_object_t seg_obj;
714
715	for (i = 0; i < vm->num_mem_segs; i++) {
716		if ((seg_obj = vm->mem_segs[i].object) == NULL)
717			continue;
718
719		seg_gpa = vm->mem_segs[i].gpa;
720		seg_len = vm->mem_segs[i].len;
721
722		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
723			*offset = gpa - seg_gpa;
724			*object = seg_obj;
725			vm_object_reference(seg_obj);
726			return (0);
727		}
728	}
729
730	return (EINVAL);
731}
732
733int
734vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
735{
736
737	if (vcpu < 0 || vcpu >= VM_MAXCPU)
738		return (EINVAL);
739
740	if (reg >= VM_REG_LAST)
741		return (EINVAL);
742
743	return (VMGETREG(vm->cookie, vcpu, reg, retval));
744}
745
746int
747vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
748{
749
750	if (vcpu < 0 || vcpu >= VM_MAXCPU)
751		return (EINVAL);
752
753	if (reg >= VM_REG_LAST)
754		return (EINVAL);
755
756	return (VMSETREG(vm->cookie, vcpu, reg, val));
757}
758
759static boolean_t
760is_descriptor_table(int reg)
761{
762
763	switch (reg) {
764	case VM_REG_GUEST_IDTR:
765	case VM_REG_GUEST_GDTR:
766		return (TRUE);
767	default:
768		return (FALSE);
769	}
770}
771
772static boolean_t
773is_segment_register(int reg)
774{
775
776	switch (reg) {
777	case VM_REG_GUEST_ES:
778	case VM_REG_GUEST_CS:
779	case VM_REG_GUEST_SS:
780	case VM_REG_GUEST_DS:
781	case VM_REG_GUEST_FS:
782	case VM_REG_GUEST_GS:
783	case VM_REG_GUEST_TR:
784	case VM_REG_GUEST_LDTR:
785		return (TRUE);
786	default:
787		return (FALSE);
788	}
789}
790
791int
792vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
793		struct seg_desc *desc)
794{
795
796	if (vcpu < 0 || vcpu >= VM_MAXCPU)
797		return (EINVAL);
798
799	if (!is_segment_register(reg) && !is_descriptor_table(reg))
800		return (EINVAL);
801
802	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
803}
804
805int
806vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
807		struct seg_desc *desc)
808{
809	if (vcpu < 0 || vcpu >= VM_MAXCPU)
810		return (EINVAL);
811
812	if (!is_segment_register(reg) && !is_descriptor_table(reg))
813		return (EINVAL);
814
815	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
816}
817
818static void
819restore_guest_fpustate(struct vcpu *vcpu)
820{
821
822	/* flush host state to the pcb */
823	fpuexit(curthread);
824
825	/* restore guest FPU state */
826	fpu_stop_emulating();
827	fpurestore(vcpu->guestfpu);
828
829	/* restore guest XCR0 if XSAVE is enabled in the host */
830	if (rcr4() & CR4_XSAVE)
831		load_xcr(0, vcpu->guest_xcr0);
832
833	/*
834	 * The FPU is now "dirty" with the guest's state so turn on emulation
835	 * to trap any access to the FPU by the host.
836	 */
837	fpu_start_emulating();
838}
839
840static void
841save_guest_fpustate(struct vcpu *vcpu)
842{
843
844	if ((rcr0() & CR0_TS) == 0)
845		panic("fpu emulation not enabled in host!");
846
847	/* save guest XCR0 and restore host XCR0 */
848	if (rcr4() & CR4_XSAVE) {
849		vcpu->guest_xcr0 = rxcr(0);
850		load_xcr(0, vmm_get_host_xcr0());
851	}
852
853	/* save guest FPU state */
854	fpu_stop_emulating();
855	fpusave(vcpu->guestfpu);
856	fpu_start_emulating();
857}
858
859static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
860
861static int
862vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
863    bool from_idle)
864{
865	int error;
866
867	vcpu_assert_locked(vcpu);
868
869	/*
870	 * State transitions from the vmmdev_ioctl() must always begin from
871	 * the VCPU_IDLE state. This guarantees that there is only a single
872	 * ioctl() operating on a vcpu at any point.
873	 */
874	if (from_idle) {
875		while (vcpu->state != VCPU_IDLE)
876			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
877	} else {
878		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
879		    "vcpu idle state"));
880	}
881
882	if (vcpu->state == VCPU_RUNNING) {
883		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
884		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
885	} else {
886		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
887		    "vcpu that is not running", vcpu->hostcpu));
888	}
889
890	/*
891	 * The following state transitions are allowed:
892	 * IDLE -> FROZEN -> IDLE
893	 * FROZEN -> RUNNING -> FROZEN
894	 * FROZEN -> SLEEPING -> FROZEN
895	 */
896	switch (vcpu->state) {
897	case VCPU_IDLE:
898	case VCPU_RUNNING:
899	case VCPU_SLEEPING:
900		error = (newstate != VCPU_FROZEN);
901		break;
902	case VCPU_FROZEN:
903		error = (newstate == VCPU_FROZEN);
904		break;
905	default:
906		error = 1;
907		break;
908	}
909
910	if (error)
911		return (EBUSY);
912
913	vcpu->state = newstate;
914	if (newstate == VCPU_RUNNING)
915		vcpu->hostcpu = curcpu;
916	else
917		vcpu->hostcpu = NOCPU;
918
919	if (newstate == VCPU_IDLE)
920		wakeup(&vcpu->state);
921
922	return (0);
923}
924
925static void
926vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
927{
928	int error;
929
930	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
931		panic("Error %d setting state to %d\n", error, newstate);
932}
933
934static void
935vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
936{
937	int error;
938
939	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
940		panic("Error %d setting state to %d", error, newstate);
941}
942
943static void
944vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
945{
946
947	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
948
949	/*
950	 * Update 'rendezvous_func' and execute a write memory barrier to
951	 * ensure that it is visible across all host cpus. This is not needed
952	 * for correctness but it does ensure that all the vcpus will notice
953	 * that the rendezvous is requested immediately.
954	 */
955	vm->rendezvous_func = func;
956	wmb();
957}
958
959#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
960	do {								\
961		if (vcpuid >= 0)					\
962			VCPU_CTR0(vm, vcpuid, fmt);			\
963		else							\
964			VM_CTR0(vm, fmt);				\
965	} while (0)
966
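/*
 * Called by a vcpu (or with vcpuid -1 from an ioctl context) to take part
 * in an active rendezvous: run the rendezvous function on behalf of
 * 'vcpuid' if it is a target that has not yet checked in, then sleep until
 * every targeted vcpu has done so and the rendezvous is torn down.
 */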
967static void
968vm_handle_rendezvous(struct vm *vm, int vcpuid)
969{
970
971	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
972	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
973
974	mtx_lock(&vm->rendezvous_mtx);
975	while (vm->rendezvous_func != NULL) {
976		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
977		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
978
979		if (vcpuid != -1 &&
980		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
981		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
982			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
983			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
984			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
985		}
986		if (CPU_CMP(&vm->rendezvous_req_cpus,
987		    &vm->rendezvous_done_cpus) == 0) {
988			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
989			vm_set_rendezvous_func(vm, NULL);
990			wakeup(&vm->rendezvous_func);
991			break;
992		}
993		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
994		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
995		    "vmrndv", 0);
996	}
997	mtx_unlock(&vm->rendezvous_mtx);
998}
999
1000/*
1001 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1002 */
1003static int
1004vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1005{
1006	struct vm_exit *vmexit;
1007	struct vcpu *vcpu;
1008	int t, timo, spindown;
1009
1010	vcpu = &vm->vcpu[vcpuid];
1011	spindown = 0;
1012
1013	vcpu_lock(vcpu);
1014
1015	/*
1016	 * Do a final check for pending NMI or interrupts before
1017	 * really putting this thread to sleep.
1018	 *
1019	 * These interrupts could have happened any time after we
1020	 * returned from VMRUN() and before we grabbed the vcpu lock.
1021	 */
1022	if (!vm_nmi_pending(vm, vcpuid) &&
1023	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
1024		t = ticks;
1025		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1026		if (vlapic_enabled(vcpu->vlapic)) {
1027			/*
1028			 * XXX msleep_spin() is not interruptible so use the
1029			 * 'timo' to put an upper bound on the sleep time.
1030			 */
1031			timo = hz;
1032			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
1033		} else {
1034			/*
1035			 * Spindown the vcpu if the apic is disabled and it
1036			 * had entered the halted state.
1037			 */
1038			spindown = 1;
1039		}
1040		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1041		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1042	}
1043	vcpu_unlock(vcpu);
1044
1045	/*
1046	 * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
1047	 * outside the confines of the vcpu spinlock.
1048	 */
1049	if (spindown) {
1050		*retu = true;
1051		vmexit = vm_exitinfo(vm, vcpuid);
1052		vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
1053		vm_deactivate_cpu(vm, vcpuid);
1054		VCPU_CTR0(vm, vcpuid, "spinning down cpu");
1055	}
1056
1057	return (0);
1058}
1059
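/*
 * Handle a nested page fault.  Accessed/dirty bit emulation is attempted
 * first; if that does not resolve the fault the page is faulted in through
 * the guest vmspace.  On success the instruction length is zeroed so that
 * execution restarts at the faulting instruction.
 */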
1060static int
1061vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1062{
1063	int rv, ftype;
1064	struct vm_map *map;
1065	struct vcpu *vcpu;
1066	struct vm_exit *vme;
1067
1068	vcpu = &vm->vcpu[vcpuid];
1069	vme = &vcpu->exitinfo;
1070
1071	ftype = vme->u.paging.fault_type;
1072	KASSERT(ftype == VM_PROT_READ ||
1073	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1074	    ("vm_handle_paging: invalid fault_type %d", ftype));
1075
1076	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1077		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1078		    vme->u.paging.gpa, ftype);
1079		if (rv == 0)
1080			goto done;
1081	}
1082
1083	map = &vm->vmspace->vm_map;
1084	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1085
1086	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1087	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1088
1089	if (rv != KERN_SUCCESS)
1090		return (EFAULT);
1091done:
1092	/* restart execution at the faulting instruction */
1093	vme->inst_length = 0;
1094
1095	return (0);
1096}
1097
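/*
 * Emulate a memory-mapped I/O instruction.  The faulting instruction is
 * fetched and decoded, and if the guest physical address falls within one
 * of the in-kernel device models (local APIC, I/O APIC, HPET) it is
 * emulated here; otherwise '*retu' is set so userland can handle it.
 */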
1098static int
1099vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1100{
1101	struct vie *vie;
1102	struct vcpu *vcpu;
1103	struct vm_exit *vme;
1104	int error, inst_length;
1105	uint64_t rip, gla, gpa, cr3;
1106	enum vie_cpu_mode cpu_mode;
1107	enum vie_paging_mode paging_mode;
1108	mem_region_read_t mread;
1109	mem_region_write_t mwrite;
1110
1111	vcpu = &vm->vcpu[vcpuid];
1112	vme = &vcpu->exitinfo;
1113
1114	rip = vme->rip;
1115	inst_length = vme->inst_length;
1116
1117	gla = vme->u.inst_emul.gla;
1118	gpa = vme->u.inst_emul.gpa;
1119	cr3 = vme->u.inst_emul.cr3;
1120	cpu_mode = vme->u.inst_emul.cpu_mode;
1121	paging_mode = vme->u.inst_emul.paging_mode;
1122	vie = &vme->u.inst_emul.vie;
1123
1124	vie_init(vie);
1125
1126	/* Fetch, decode and emulate the faulting instruction */
1127	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3,
1128	    paging_mode, vie) != 0)
1129		return (EFAULT);
1130
1131	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, vie) != 0)
1132		return (EFAULT);
1133
1134	/* return to userland unless this is an in-kernel emulated device */
1135	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1136		mread = lapic_mmio_read;
1137		mwrite = lapic_mmio_write;
1138	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1139		mread = vioapic_mmio_read;
1140		mwrite = vioapic_mmio_write;
1141	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1142		mread = vhpet_mmio_read;
1143		mwrite = vhpet_mmio_write;
1144	} else {
1145		*retu = true;
1146		return (0);
1147	}
1148
1149	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
1150	    retu);
1151
1152	return (error);
1153}
1154
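/*
 * Run a vcpu until an error occurs or it exits for a reason that must be
 * handled in userland.  Exits that can be handled in the kernel (ioapic
 * EOI, rendezvous, hlt, nested page faults, MMIO emulation) are processed
 * here and the vcpu is resumed without returning to the caller.
 */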
1155int
1156vm_run(struct vm *vm, struct vm_run *vmrun)
1157{
1158	int error, vcpuid;
1159	struct vcpu *vcpu;
1160	struct pcb *pcb;
1161	uint64_t tscval, rip;
1162	struct vm_exit *vme;
1163	bool retu, intr_disabled;
1164	pmap_t pmap;
1165
1166	vcpuid = vmrun->cpuid;
1167
1168	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1169		return (EINVAL);
1170
1171	pmap = vmspace_pmap(vm->vmspace);
1172	vcpu = &vm->vcpu[vcpuid];
1173	vme = &vcpu->exitinfo;
1174	rip = vmrun->rip;
1175restart:
1176	critical_enter();
1177
1178	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1179	    ("vm_run: absurd pm_active"));
1180
1181	tscval = rdtsc();
1182
1183	pcb = PCPU_GET(curpcb);
1184	set_pcb_flags(pcb, PCB_FULL_IRET);
1185
1186	restore_guest_msrs(vm, vcpuid);
1187	restore_guest_fpustate(vcpu);
1188
1189	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1190	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
1191	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1192
1193	save_guest_fpustate(vcpu);
1194	restore_host_msrs(vm, vcpuid);
1195
1196	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1197
1198	critical_exit();
1199
1200	if (error == 0) {
1201		retu = false;
1202		switch (vme->exitcode) {
1203		case VM_EXITCODE_IOAPIC_EOI:
1204			vioapic_process_eoi(vm, vcpuid,
1205			    vme->u.ioapic_eoi.vector);
1206			break;
1207		case VM_EXITCODE_RENDEZVOUS:
1208			vm_handle_rendezvous(vm, vcpuid);
1209			error = 0;
1210			break;
1211		case VM_EXITCODE_HLT:
1212			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1213			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1214			break;
1215		case VM_EXITCODE_PAGING:
1216			error = vm_handle_paging(vm, vcpuid, &retu);
1217			break;
1218		case VM_EXITCODE_INST_EMUL:
1219			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1220			break;
1221		default:
1222			retu = true;	/* handled in userland */
1223			break;
1224		}
1225	}
1226
1227	if (error == 0 && retu == false) {
1228		rip = vme->rip + vme->inst_length;
1229		goto restart;
1230	}
1231
1232	/* copy the exit information */
1233	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1234	return (error);
1235}
1236
1237int
1238vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
1239{
1240	struct vcpu *vcpu;
1241
1242	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1243		return (EINVAL);
1244
1245	if (exception->vector < 0 || exception->vector >= 32)
1246		return (EINVAL);
1247
1248	vcpu = &vm->vcpu[vcpuid];
1249
1250	if (vcpu->exception_pending) {
1251		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
1252		    "pending exception %d", exception->vector,
1253		    vcpu->exception.vector);
1254		return (EBUSY);
1255	}
1256
1257	vcpu->exception_pending = 1;
1258	vcpu->exception = *exception;
1259	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
1260	return (0);
1261}
1262
1263int
1264vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
1265{
1266	struct vcpu *vcpu;
1267	int pending;
1268
1269	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
1270
1271	vcpu = &vm->vcpu[vcpuid];
1272	pending = vcpu->exception_pending;
1273	if (pending) {
1274		vcpu->exception_pending = 0;
1275		*exception = vcpu->exception;
1276		VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
1277		    exception->vector);
1278	}
1279	return (pending);
1280}
1281
1282static void
1283vm_inject_fault(struct vm *vm, int vcpuid, struct vm_exception *exception)
1284{
1285	struct vm_exit *vmexit;
1286	int error;
1287
1288	error = vm_inject_exception(vm, vcpuid, exception);
1289	KASSERT(error == 0, ("vm_inject_exception error %d", error));
1290
1291	/*
1292	 * A fault-like exception allows the instruction to be restarted
1293	 * after the exception handler returns.
1294	 *
1295	 * By setting the inst_length to 0 we ensure that the instruction
1296	 * pointer remains at the faulting instruction.
1297	 */
1298	vmexit = vm_exitinfo(vm, vcpuid);
1299	vmexit->inst_length = 0;
1300}
1301
1302void
1303vm_inject_gp(struct vm *vm, int vcpuid)
1304{
1305	struct vm_exception gpf = {
1306		.vector = IDT_GP,
1307		.error_code_valid = 1,
1308		.error_code = 0
1309	};
1310
1311	vm_inject_fault(vm, vcpuid, &gpf);
1312}
1313
1314void
1315vm_inject_ud(struct vm *vm, int vcpuid)
1316{
1317	struct vm_exception udf = {
1318		.vector = IDT_UD,
1319		.error_code_valid = 0
1320	};
1321
1322	vm_inject_fault(vm, vcpuid, &udf);
1323}
1324
1325static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1326
1327int
1328vm_inject_nmi(struct vm *vm, int vcpuid)
1329{
1330	struct vcpu *vcpu;
1331
1332	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1333		return (EINVAL);
1334
1335	vcpu = &vm->vcpu[vcpuid];
1336
1337	vcpu->nmi_pending = 1;
1338	vcpu_notify_event(vm, vcpuid, false);
1339	return (0);
1340}
1341
1342int
1343vm_nmi_pending(struct vm *vm, int vcpuid)
1344{
1345	struct vcpu *vcpu;
1346
1347	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1348		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1349
1350	vcpu = &vm->vcpu[vcpuid];
1351
1352	return (vcpu->nmi_pending);
1353}
1354
1355void
1356vm_nmi_clear(struct vm *vm, int vcpuid)
1357{
1358	struct vcpu *vcpu;
1359
1360	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1361		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
1362
1363	vcpu = &vm->vcpu[vcpuid];
1364
1365	if (vcpu->nmi_pending == 0)
1366		panic("vm_nmi_clear: inconsistent nmi_pending state");
1367
1368	vcpu->nmi_pending = 0;
1369	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1370}
1371
1372static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
1373
1374int
1375vm_inject_extint(struct vm *vm, int vcpuid)
1376{
1377	struct vcpu *vcpu;
1378
1379	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1380		return (EINVAL);
1381
1382	vcpu = &vm->vcpu[vcpuid];
1383
1384	vcpu->extint_pending = 1;
1385	vcpu_notify_event(vm, vcpuid, false);
1386	return (0);
1387}
1388
1389int
1390vm_extint_pending(struct vm *vm, int vcpuid)
1391{
1392	struct vcpu *vcpu;
1393
1394	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1395		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1396
1397	vcpu = &vm->vcpu[vcpuid];
1398
1399	return (vcpu->extint_pending);
1400}
1401
1402void
1403vm_extint_clear(struct vm *vm, int vcpuid)
1404{
1405	struct vcpu *vcpu;
1406
1407	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1408		panic("vm_extint_clear: invalid vcpuid %d", vcpuid);
1409
1410	vcpu = &vm->vcpu[vcpuid];
1411
1412	if (vcpu->extint_pending == 0)
1413		panic("vm_extint_clear: inconsistent extint_pending state");
1414
1415	vcpu->extint_pending = 0;
1416	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
1417}
1418
1419int
1420vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1421{
1422	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1423		return (EINVAL);
1424
1425	if (type < 0 || type >= VM_CAP_MAX)
1426		return (EINVAL);
1427
1428	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1429}
1430
1431int
1432vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1433{
1434	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1435		return (EINVAL);
1436
1437	if (type < 0 || type >= VM_CAP_MAX)
1438		return (EINVAL);
1439
1440	return (VMSETCAP(vm->cookie, vcpu, type, val));
1441}
1442
1443uint64_t *
1444vm_guest_msrs(struct vm *vm, int cpu)
1445{
1446	return (vm->vcpu[cpu].guest_msrs);
1447}
1448
1449struct vlapic *
1450vm_lapic(struct vm *vm, int cpu)
1451{
1452	return (vm->vcpu[cpu].vlapic);
1453}
1454
1455struct vioapic *
1456vm_ioapic(struct vm *vm)
1457{
1458
1459	return (vm->vioapic);
1460}
1461
1462struct vhpet *
1463vm_hpet(struct vm *vm)
1464{
1465
1466	return (vm->vhpet);
1467}
1468
1469boolean_t
1470vmm_is_pptdev(int bus, int slot, int func)
1471{
1472	int found, i, n;
1473	int b, s, f;
1474	char *val, *cp, *cp2;
1475
1476	/*
1477	 * XXX
1478	 * The length of an environment variable is limited to 128 bytes which
1479	 * puts an upper limit on the number of passthru devices that may be
1480	 * specified using a single environment variable.
1481	 *
1482	 * Work around this by scanning multiple environment variable
1483	 * names instead of a single one - yuck!
1484	 */
1485	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1486
1487	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1488	found = 0;
1489	for (i = 0; names[i] != NULL && !found; i++) {
1490		cp = val = getenv(names[i]);
1491		while (cp != NULL && *cp != '\0') {
1492			if ((cp2 = strchr(cp, ' ')) != NULL)
1493				*cp2 = '\0';
1494
1495			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1496			if (n == 3 && bus == b && slot == s && func == f) {
1497				found = 1;
1498				break;
1499			}
1500
1501			if (cp2 != NULL)
1502				*cp2++ = ' ';
1503
1504			cp = cp2;
1505		}
1506		freeenv(val);
1507	}
1508	return (found);
1509}
1510
1511void *
1512vm_iommu_domain(struct vm *vm)
1513{
1514
1515	return (vm->iommu);
1516}
1517
1518int
1519vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1520    bool from_idle)
1521{
1522	int error;
1523	struct vcpu *vcpu;
1524
1525	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1526		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
1527
1528	vcpu = &vm->vcpu[vcpuid];
1529
1530	vcpu_lock(vcpu);
1531	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1532	vcpu_unlock(vcpu);
1533
1534	return (error);
1535}
1536
1537enum vcpu_state
1538vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1539{
1540	struct vcpu *vcpu;
1541	enum vcpu_state state;
1542
1543	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1544		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
1545
1546	vcpu = &vm->vcpu[vcpuid];
1547
1548	vcpu_lock(vcpu);
1549	state = vcpu->state;
1550	if (hostcpu != NULL)
1551		*hostcpu = vcpu->hostcpu;
1552	vcpu_unlock(vcpu);
1553
1554	return (state);
1555}
1556
1557void
1558vm_activate_cpu(struct vm *vm, int vcpuid)
1559{
1560
1561	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1562	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
1563	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
1564	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
1565
1566	VCPU_CTR0(vm, vcpuid, "activated");
1567	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1568}
1569
1570static void
1571vm_deactivate_cpu(struct vm *vm, int vcpuid)
1572{
1573
1574	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1575	    ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
1576	KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
1577	    ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));
1578
1579	VCPU_CTR0(vm, vcpuid, "deactivated");
1580	CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);
1581
1582	/*
1583	 * If a vcpu rendezvous is in progress then it could be blocked
1584	 * on 'vcpuid' - unblock it before disappearing forever.
1585	 */
1586	mtx_lock(&vm->rendezvous_mtx);
1587	if (vm->rendezvous_func != NULL) {
1588		VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
1589		wakeup(&vm->rendezvous_func);
1590	}
1591	mtx_unlock(&vm->rendezvous_mtx);
1592}
1593
1594cpuset_t
1595vm_active_cpus(struct vm *vm)
1596{
1597
1598	return (vm->active_cpus);
1599}
1600
1601void *
1602vcpu_stats(struct vm *vm, int vcpuid)
1603{
1604
1605	return (vm->vcpu[vcpuid].stats);
1606}
1607
1608int
1609vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1610{
1611	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1612		return (EINVAL);
1613
1614	*state = vm->vcpu[vcpuid].x2apic_state;
1615
1616	return (0);
1617}
1618
1619int
1620vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1621{
1622	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1623		return (EINVAL);
1624
1625	if (state >= X2APIC_STATE_LAST)
1626		return (EINVAL);
1627
1628	vm->vcpu[vcpuid].x2apic_state = state;
1629
1630	vlapic_set_x2apic_state(vm, vcpuid, state);
1631
1632	return (0);
1633}
1634
1635/*
1636 * This function is called to ensure that a vcpu "sees" a pending event
1637 * as soon as possible:
1638 * - If the vcpu thread is sleeping then it is woken up.
1639 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1640 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1641 */
1642void
1643vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1644{
1645	int hostcpu;
1646	struct vcpu *vcpu;
1647
1648	vcpu = &vm->vcpu[vcpuid];
1649
1650	vcpu_lock(vcpu);
1651	hostcpu = vcpu->hostcpu;
1652	if (vcpu->state == VCPU_RUNNING) {
1653		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1654		if (hostcpu != curcpu) {
1655			if (lapic_intr) {
1656				vlapic_post_intr(vcpu->vlapic, hostcpu,
1657				    vmm_ipinum);
1658			} else {
1659				ipi_cpu(hostcpu, vmm_ipinum);
1660			}
1661		} else {
1662			/*
1663			 * If the 'vcpu' is running on 'curcpu' then it must
1664			 * be sending a notification to itself (e.g. SELF_IPI).
1665			 * The pending event will be picked up when the vcpu
1666			 * transitions back to guest context.
1667			 */
1668		}
1669	} else {
1670		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1671		    "with hostcpu %d", vcpu->state, hostcpu));
1672		if (vcpu->state == VCPU_SLEEPING)
1673			wakeup_one(vcpu);
1674	}
1675	vcpu_unlock(vcpu);
1676}
1677
1678struct vmspace *
1679vm_get_vmspace(struct vm *vm)
1680{
1681
1682	return (vm->vmspace);
1683}
1684
1685int
1686vm_apicid2vcpuid(struct vm *vm, int apicid)
1687{
1688	/*
1689	 * XXX apic id is assumed to be numerically identical to vcpu id
1690	 */
1691	return (apicid);
1692}
1693
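/*
 * Initiate a rendezvous of all vcpus in 'dest': record the target set and
 * handler under the rendezvous lock, notify the targets so that sleeping
 * or running vcpus break out of the guest, and then participate in (and
 * wait for completion of) the rendezvous on behalf of 'vcpuid'.  If a
 * rendezvous is already in progress the caller helps complete it first.
 */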
1694void
1695vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1696    vm_rendezvous_func_t func, void *arg)
1697{
1698	int i;
1699
1700	/*
1701	 * Enforce that this function is called without any locks
1702	 */
1703	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1704	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1705	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1706
1707restart:
1708	mtx_lock(&vm->rendezvous_mtx);
1709	if (vm->rendezvous_func != NULL) {
1710		/*
1711		 * If a rendezvous is already in progress then we need to
1712		 * call the rendezvous handler in case this 'vcpuid' is one
1713		 * of the targets of the rendezvous.
1714		 */
1715		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1716		mtx_unlock(&vm->rendezvous_mtx);
1717		vm_handle_rendezvous(vm, vcpuid);
1718		goto restart;
1719	}
1720	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1721	    "rendezvous is still in progress"));
1722
1723	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1724	vm->rendezvous_req_cpus = dest;
1725	CPU_ZERO(&vm->rendezvous_done_cpus);
1726	vm->rendezvous_arg = arg;
1727	vm_set_rendezvous_func(vm, func);
1728	mtx_unlock(&vm->rendezvous_mtx);
1729
1730	/*
1731	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1732	 * vcpus so they handle the rendezvous as soon as possible.
1733	 */
1734	for (i = 0; i < VM_MAXCPU; i++) {
1735		if (CPU_ISSET(i, &dest))
1736			vcpu_notify_event(vm, i, false);
1737	}
1738
1739	vm_handle_rendezvous(vm, vcpuid);
1740}
1741
1742struct vatpic *
1743vm_atpic(struct vm *vm)
1744{
1745	return (vm->vatpic);
1746}
1747
1748struct vatpit *
1749vm_atpit(struct vm *vm)
1750{
1751	return (vm->vatpit);
1752}
1753