vmm.c revision 261088
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 261088 2014-01-23 20:21:39Z jhb $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 261088 2014-01-23 20:21:39Z jhb $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/pcpu.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>

#include <machine/vm.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <x86/apicreg.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include "vmm_ktr.h"
#include "vmm_host.h"
#include "vmm_mem.h"
#include "vmm_util.h"
#include "vhpet.h"
#include "vioapic.h"
#include "vlapic.h"
#include "vmm_msr.h"
#include "vmm_ipi.h"
#include "vmm_stat.h"
#include "vmm_lapic.h"

#include "io/ppt.h"
#include "io/iommu.h"

struct vlapic;

struct vcpu {
	int		flags;
	enum vcpu_state	state;
	struct mtx	mtx;
	int		hostcpu;	/* host cpuid this vcpu last ran on */
	uint64_t	guest_msrs[VMM_MSR_NUM];
	struct vlapic	*vlapic;
	int		 vcpuid;
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;
	struct vm_exit	exitinfo;
	enum x2apic_state x2apic_state;
	int		nmi_pending;
};

#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)

struct mem_seg {
	vm_paddr_t	gpa;
	size_t		len;
	boolean_t	wired;
	vm_object_t	object;
};
#define	VM_MAX_MEMORY_SEGMENTS	2

struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	cpuset_t	active_cpus;
};

static int vmm_initialized;

static struct vmm_ops *ops;
#define	VMM_INIT()	(ops != NULL ? (*ops->init)() : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)

#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)

#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()

static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

static void
vcpu_cleanup(struct vcpu *vcpu)
{
	vlapic_cleanup(vcpu->vlapic);
	vmm_stat_free(vcpu->stats);
	fpu_save_area_free(vcpu->guestfpu);
}

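/*
 * Initialize the software state of a virtual cpu: its spin lock, local
 * APIC, FPU save area and statistics buffer.
 */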
static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = vlapic_init(vm, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}

struct vm_exit *
vm_exitinfo(struct vm *vm, int cpuid)
{
	struct vcpu *vcpu;

	if (cpuid < 0 || cpuid >= VM_MAXCPU)
		panic("vm_exitinfo: invalid cpuid %d", cpuid);

	vcpu = &vm->vcpu[cpuid];

	return (&vcpu->exitinfo);
}

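/*
 * One-time initialization of the vmm module: set up host state and the
 * IPI vector, initialize the memory subsystem and select the hardware
 * specific operations vector (Intel VT-x or AMD SVM).
 */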
static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();
	vmm_ipi_init();

	error = vmm_mem_init();
	if (error)
		return (error);

	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();

	return (VMM_INIT());
}

static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			iommu_cleanup();
			vmm_ipi_cleanup();
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}

static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);

SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

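/*
 * Create a new virtual machine: allocate its guest address space, the
 * virtual ioapic and hpet, and initialize all vcpus.  Only the BSP is
 * marked active here; APs are activated later by a startup IPI.
 */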
int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	strcpy(vm->name, name);
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	vm_activate_cpu(vm, BSP);
	vm->vmspace = vmspace;

	*retvm = vm;
	return (0);
}

static void
vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
{

	if (seg->object != NULL)
		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);

	bzero(seg, sizeof(*seg));
}

void
vm_destroy(struct vm *vm)
{
	int i;

	ppt_unassign_all(vm);

	if (vm->iommu != NULL)
		iommu_destroy_domain(vm->iommu);

	vhpet_cleanup(vm->vhpet);
	vioapic_cleanup(vm->vioapic);

	for (i = 0; i < vm->num_mem_segs; i++)
		vm_free_mem_seg(vm, &vm->mem_segs[i]);

	vm->num_mem_segs = 0;

	for (i = 0; i < VM_MAXCPU; i++)
		vcpu_cleanup(&vm->vcpu[i]);

	VMSPACE_FREE(vm->vmspace);

	VMCLEANUP(vm->cookie);

	free(vm, M_VM);
}

const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}

int
vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
{
	vm_object_t obj;

	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
		return (ENOMEM);
	else
		return (0);
}

int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}

boolean_t
vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
{
	int i;
	vm_paddr_t gpabase, gpalimit;

	for (i = 0; i < vm->num_mem_segs; i++) {
		gpabase = vm->mem_segs[i].gpa;
		gpalimit = gpabase + vm->mem_segs[i].len;
		if (gpa >= gpabase && gpa < gpalimit)
			return (TRUE);		/* 'gpa' is regular memory */
	}

	if (ppt_is_mmio(vm, gpa))
		return (TRUE);			/* 'gpa' is pci passthru mmio */

	return (FALSE);
}

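/*
 * Allocate 'len' bytes of guest physical memory starting at 'gpa'.  The
 * range must be page-aligned and must either be entirely unallocated or
 * entirely allocated already; a partial overlap is an error.
 */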
int
vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
{
	int available, allocated;
	struct mem_seg *seg;
	vm_object_t object;
	vm_paddr_t g;

	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
		return (EINVAL);

	available = allocated = 0;
	g = gpa;
	while (g < gpa + len) {
		if (vm_mem_allocated(vm, g))
			allocated++;
		else
			available++;

		g += PAGE_SIZE;
	}

	/*
	 * If there are some allocated and some available pages in the address
	 * range then it is an error.
	 */
	if (allocated && available)
		return (EINVAL);

	/*
	 * If the entire address range being requested has already been
	 * allocated then there isn't anything more to do.
	 */
	if (allocated && available == 0)
		return (0);

	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
		return (E2BIG);

	seg = &vm->mem_segs[vm->num_mem_segs];

	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
		return (ENOMEM);

	seg->gpa = gpa;
	seg->len = len;
	seg->object = object;
	seg->wired = FALSE;

	vm->num_mem_segs++;

	return (0);
}

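/*
 * Guest memory is wired (and unwired) in its entirety when pci passthru
 * devices are assigned to (or removed from) the virtual machine.
 */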
static void
vm_gpa_unwire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (!seg->wired)
			continue;

		rv = vm_map_unwire(&vm->vmspace->vm_map,
				   seg->gpa, seg->gpa + seg->len,
				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
		    "%#lx/%ld could not be unwired: %d",
		    vm_name(vm), seg->gpa, seg->len, rv));

		seg->wired = FALSE;
	}
}

static int
vm_gpa_wire(struct vm *vm)
{
	int i, rv;
	struct mem_seg *seg;

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		if (seg->wired)
			continue;

		/* XXX rlimits? */
		rv = vm_map_wire(&vm->vmspace->vm_map,
				 seg->gpa, seg->gpa + seg->len,
				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
		if (rv != KERN_SUCCESS)
			break;

		seg->wired = TRUE;
	}

	if (i < vm->num_mem_segs) {
		/*
		 * Undo the wiring before returning an error.
		 */
		vm_gpa_unwire(vm);
		return (EAGAIN);
	}

	return (0);
}

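/*
 * Add or remove the iommu mappings for the guest's memory segments.  When
 * mapping, each guest page is moved from the host domain into the VM's
 * iommu domain; unmapping reverses this.  Stale translations are flushed
 * from the domain that lost the pages.
 */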
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
					 &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)

int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error)
		return (error);

	if (ppt_num_devices(vm) == 0) {
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}
	return (0);
}

int
vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;
	vm_paddr_t maxaddr;

	/*
	 * Virtual machines with pci passthru devices get special treatment:
	 * - the guest physical memory is wired
	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
	 *
	 * We need to do this before the first pci passthru device is attached.
	 */
	if (ppt_num_devices(vm) == 0) {
		KASSERT(vm->iommu == NULL,
		    ("vm_assign_pptdev: iommu must be NULL"));
		maxaddr = vmm_mem_maxaddr();
		vm->iommu = iommu_create_domain(maxaddr);

		error = vm_gpa_wire(vm);
		if (error)
			return (error);

		vm_iommu_map(vm);
	}

	error = ppt_assign_device(vm, bus, slot, func);
	return (error);
}

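/*
 * Hold the page backing guest physical address 'gpa' and return a host
 * kernel virtual address for it via the direct map.  The caller releases
 * the page with vm_gpa_release() using the returned cookie.
 */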
void *
vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
	    void **cookie)
{
	int count, pageoff;
	vm_page_t m;

	pageoff = gpa & PAGE_MASK;
	if (len > PAGE_SIZE - pageoff)
		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);

	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);

	if (count == 1) {
		*cookie = m;
		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
	} else {
		*cookie = NULL;
		return (NULL);
	}
}

void
vm_gpa_release(void *cookie)
{
	vm_page_t m = cookie;

	vm_page_lock(m);
	vm_page_unhold(m);
	vm_page_unlock(m);
}

int
vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
		  struct vm_memory_segment *seg)
{
	int i;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if (gpabase == vm->mem_segs[i].gpa) {
			seg->gpa = vm->mem_segs[i].gpa;
			seg->len = vm->mem_segs[i].len;
			seg->wired = vm->mem_segs[i].wired;
			return (0);
		}
	}
	return (-1);
}

int
vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
	      vm_offset_t *offset, struct vm_object **object)
{
	int i;
	size_t seg_len;
	vm_paddr_t seg_gpa;
	vm_object_t seg_obj;

	for (i = 0; i < vm->num_mem_segs; i++) {
		if ((seg_obj = vm->mem_segs[i].object) == NULL)
			continue;

		seg_gpa = vm->mem_segs[i].gpa;
		seg_len = vm->mem_segs[i].len;

		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
			*offset = gpa - seg_gpa;
			*object = seg_obj;
			vm_object_reference(seg_obj);
			return (0);
		}
	}

	return (EINVAL);
}

int
vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMGETREG(vm->cookie, vcpu, reg, retval));
}

int
vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (reg >= VM_REG_LAST)
		return (EINVAL);

	return (VMSETREG(vm->cookie, vcpu, reg, val));
}

static boolean_t
is_descriptor_table(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_IDTR:
	case VM_REG_GUEST_GDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

static boolean_t
is_segment_register(int reg)
{

	switch (reg) {
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_CS:
	case VM_REG_GUEST_SS:
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
	case VM_REG_GUEST_TR:
	case VM_REG_GUEST_LDTR:
		return (TRUE);
	default:
		return (FALSE);
	}
}

int
vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{

	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
}

int
vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
		struct seg_desc *desc)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (!is_segment_register(reg) && !is_descriptor_table(reg))
		return (EINVAL);

	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
}

static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}

static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}

static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error == 0)
		vcpu->state = newstate;
	else
		error = EBUSY;

	return (error);
}

static void
vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state(vm, vcpuid, newstate)) != 0)
		panic("Error %d setting state to %d\n", error, newstate);
}

static void
vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
{
	int error;

	if ((error = vcpu_set_state_locked(vcpu, newstate)) != 0)
		panic("Error %d setting state to %d", error, newstate);
}

/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vcpu *vcpu;
	int sleepticks, t;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);

	/*
	 * Figure out the number of host ticks until the next apic
	 * timer interrupt in the guest.
	 */
	sleepticks = lapic_timer_tick(vm, vcpuid);

	/*
	 * If the guest local apic timer is disabled then sleep for
	 * a long time but not forever.
	 */
	if (sleepticks < 0)
		sleepticks = hz;

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) && lapic_pending_intr(vm, vcpuid) < 0) {
		if (sleepticks <= 0)
			panic("invalid sleepticks %d", sleepticks);
		t = ticks;
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		msleep_spin(vcpu, &vcpu->mtx, "vmidle", sleepticks);
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	return (0);
}

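/*
 * Handle a nested page fault exit: first try to emulate accessed/dirty
 * bit updates in the nested pmap and, failing that, fault the page into
 * the guest's vmspace.
 */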
static int
vm_handle_paging(struct vm *vm, int vcpuid, boolean_t *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}

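/*
 * Handle an instruction emulation exit: fetch and decode the faulting
 * instruction and emulate it in the kernel if it accesses the local APIC,
 * IOAPIC or HPET; otherwise bounce the exit to userland.
 */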
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, boolean_t *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = TRUE;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite, 0);

	/* return to userland to spin up the AP */
	if (error == 0 && vme->exitcode == VM_EXITCODE_SPINUP_AP)
		*retu = TRUE;

	return (error);
}

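/*
 * Run a vcpu: enter the guest in a loop, handling HLT, paging and
 * instruction emulation exits in the kernel and returning to userland
 * for everything else.
 */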
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	boolean_t retu;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	vcpu->hostcpu = curcpu;
	error = VMRUN(vm->cookie, vcpuid, rip, pmap);
	vcpu->hostcpu = NOCPU;
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		retu = FALSE;
		switch (vme->exitcode) {
		case VM_EXITCODE_HLT:
			error = vm_handle_hlt(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = TRUE;	/* handled in userland */
			break;
		}
	}

	if (error == 0 && retu == FALSE) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}

int
vm_inject_event(struct vm *vm, int vcpuid, int type,
		int vector, uint32_t code, int code_valid)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
		return (EINVAL);

	if (vector < 0 || vector > 255)
		return (EINVAL);

	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
}

static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");

int
vm_inject_nmi(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	vcpu = &vm->vcpu[vcpuid];

	vcpu->nmi_pending = 1;
	vm_interrupt_hostcpu(vm, vcpuid);
	return (0);
}

int
vm_nmi_pending(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	return (vcpu->nmi_pending);
}

void
vm_nmi_clear(struct vm *vm, int vcpuid)
{
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	if (vcpu->nmi_pending == 0)
		panic("vm_nmi_clear: inconsistent nmi_pending state");

	vcpu->nmi_pending = 0;
	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
}

int
vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMGETCAP(vm->cookie, vcpu, type, retval));
}

int
vm_set_capability(struct vm *vm, int vcpu, int type, int val)
{
	if (vcpu < 0 || vcpu >= VM_MAXCPU)
		return (EINVAL);

	if (type < 0 || type >= VM_CAP_MAX)
		return (EINVAL);

	return (VMSETCAP(vm->cookie, vcpu, type, val));
}

uint64_t *
vm_guest_msrs(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].guest_msrs);
}

struct vlapic *
vm_lapic(struct vm *vm, int cpu)
{
	return (vm->vcpu[cpu].vlapic);
}

struct vioapic *
vm_ioapic(struct vm *vm)
{

	return (vm->vioapic);
}

struct vhpet *
vm_hpet(struct vm *vm)
{

	return (vm->vhpet);
}

boolean_t
vmm_is_pptdev(int bus, int slot, int func)
{
	int found, i, n;
	int b, s, f;
	char *val, *cp, *cp2;

	/*
	 * XXX
	 * The length of an environment variable is limited to 128 bytes which
	 * puts an upper limit on the number of passthru devices that may be
	 * specified using a single environment variable.
	 *
	 * Work around this by scanning multiple environment variable
	 * names instead of a single one - yuck!
	 */
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };

	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
	found = 0;
	for (i = 0; names[i] != NULL && !found; i++) {
		cp = val = getenv(names[i]);
		while (cp != NULL && *cp != '\0') {
			if ((cp2 = strchr(cp, ' ')) != NULL)
				*cp2 = '\0';

			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
			if (n == 3 && bus == b && slot == s && func == f) {
				found = 1;
				break;
			}

			if (cp2 != NULL)
				*cp2++ = ' ';

			cp = cp2;
		}
		freeenv(val);
	}
	return (found);
}

void *
vm_iommu_domain(struct vm *vm)
{

	return (vm->iommu);
}

int
vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
{
	int error;
	struct vcpu *vcpu;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	error = vcpu_set_state_locked(vcpu, newstate);
	vcpu_unlock(vcpu);

	return (error);
}

enum vcpu_state
vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
{
	struct vcpu *vcpu;
	enum vcpu_state state;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	state = vcpu->state;
	if (hostcpu != NULL)
		*hostcpu = vcpu->hostcpu;
	vcpu_unlock(vcpu);

	return (state);
}

void
vm_activate_cpu(struct vm *vm, int vcpuid)
{

	if (vcpuid >= 0 && vcpuid < VM_MAXCPU)
		CPU_SET(vcpuid, &vm->active_cpus);
}

cpuset_t
vm_active_cpus(struct vm *vm)
{

	return (vm->active_cpus);
}

void *
vcpu_stats(struct vm *vm, int vcpuid)
{

	return (vm->vcpu[vcpuid].stats);
}

int
vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	*state = vm->vcpu[vcpuid].x2apic_state;

	return (0);
}

int
vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
{
	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	if (state >= X2APIC_STATE_LAST)
		return (EINVAL);

	vm->vcpu[vcpuid].x2apic_state = state;

	vlapic_set_x2apic_state(vm, vcpuid, state);

	return (0);
}

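/*
 * Kick a vcpu: wake it up if it is sleeping in 'hlt' emulation, or send
 * an IPI to the host cpu it is currently running on.
 */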
void
vm_interrupt_hostcpu(struct vm *vm, int vcpuid)
{
	int hostcpu;
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpuid];

	vcpu_lock(vcpu);
	hostcpu = vcpu->hostcpu;
	if (hostcpu == NOCPU) {
		if (vcpu->state == VCPU_SLEEPING)
			wakeup_one(vcpu);
	} else {
		if (vcpu->state != VCPU_RUNNING)
			panic("invalid vcpu state %d", vcpu->state);
		if (hostcpu != curcpu)
			ipi_cpu(hostcpu, vmm_ipinum);
	}
	vcpu_unlock(vcpu);
}

struct vmspace *
vm_get_vmspace(struct vm *vm)
{

	return (vm->vmspace);
}

int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	/*
	 * XXX apic id is assumed to be numerically identical to vcpu id
	 */
	return (apicid);
}
1360