vmm.c revision 270159
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 270159 2014-08-19 01:20:24Z grehan $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 270159 2014-08-19 01:20:24Z grehan $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/systm.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65#include <machine/vmm_instruction_emul.h>
66
67#include "vmm_ioport.h"
68#include "vmm_ktr.h"
69#include "vmm_host.h"
70#include "vmm_mem.h"
71#include "vmm_util.h"
72#include "vatpic.h"
73#include "vatpit.h"
74#include "vhpet.h"
75#include "vioapic.h"
76#include "vlapic.h"
77#include "vmm_msr.h"
78#include "vmm_ipi.h"
79#include "vmm_stat.h"
80#include "vmm_lapic.h"
81
82#include "io/ppt.h"
83#include "io/iommu.h"
84
85struct vlapic;
86
87/*
88 * Initialization:
89 * (a) allocated when vcpu is created
90 * (i) initialized when vcpu is created and when it is reinitialized
91 * (o) initialized the first time the vcpu is created
92 * (x) initialized before use
93 */
94struct vcpu {
95	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
96	enum vcpu_state	state;		/* (o) vcpu state */
97	int		hostcpu;	/* (o) vcpu's host cpu */
98	struct vlapic	*vlapic;	/* (i) APIC device model */
99	enum x2apic_state x2apic_state;	/* (i) APIC mode */
100	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
101	int		nmi_pending;	/* (i) NMI pending */
102	int		extint_pending;	/* (i) INTR pending */
103	struct vm_exception exception;	/* (x) exception collateral */
104	int	exception_pending;	/* (i) exception pending */
105	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
106	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
107	void		*stats;		/* (a,i) statistics */
108	uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
109	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
110};
111
112#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
113#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
114#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
115#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
116#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
117
118struct mem_seg {
119	vm_paddr_t	gpa;
120	size_t		len;
121	boolean_t	wired;
122	vm_object_t	object;
123};
124#define	VM_MAX_MEMORY_SEGMENTS	2
125
126/*
127 * Initialization:
128 * (o) initialized the first time the VM is created
129 * (i) initialized when VM is created and when it is reinitialized
130 * (x) initialized before use
131 */
132struct vm {
133	void		*cookie;		/* (i) cpu-specific data */
134	void		*iommu;			/* (x) iommu-specific data */
135	struct vhpet	*vhpet;			/* (i) virtual HPET */
136	struct vioapic	*vioapic;		/* (i) virtual ioapic */
137	struct vatpic	*vatpic;		/* (i) virtual atpic */
138	struct vatpit	*vatpit;		/* (i) virtual atpit */
139	volatile cpuset_t active_cpus;		/* (i) active vcpus */
140	int		suspend;		/* (i) stop VM execution */
141	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
142	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
143	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
144	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
145	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
146	vm_rendezvous_func_t rendezvous_func;
147	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
148	int		num_mem_segs;		/* (o) guest memory segments */
149	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
150	struct vmspace	*vmspace;		/* (o) guest's address space */
151	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
152	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
153};
154
155static int vmm_initialized;
156
157static struct vmm_ops *ops;
158#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
159#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
160#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)
161
162#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
163#define	VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
164	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
165#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
166#define	VMSPACE_ALLOC(min, max) \
167	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
168#define	VMSPACE_FREE(vmspace) \
169	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
170#define	VMGETREG(vmi, vcpu, num, retval)		\
171	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
172#define	VMSETREG(vmi, vcpu, num, val)		\
173	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
174#define	VMGETDESC(vmi, vcpu, num, desc)		\
175	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
176#define	VMSETDESC(vmi, vcpu, num, desc)		\
177	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
178#define	VMGETCAP(vmi, vcpu, num, retval)	\
179	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
180#define	VMSETCAP(vmi, vcpu, num, val)		\
181	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
182#define	VLAPIC_INIT(vmi, vcpu)			\
183	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
184#define	VLAPIC_CLEANUP(vmi, vlapic)		\
185	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
186
187#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
188#define	fpu_stop_emulating()	clts()
189
190static MALLOC_DEFINE(M_VM, "vm", "vm");
191CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */
192
193/* statistics */
194static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
195
196SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
197
198/*
199 * Halt the guest if all vcpus are executing a HLT instruction with
200 * interrupts disabled.
201 */
202static int halt_detection_enabled = 1;
203TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
204SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
205    &halt_detection_enabled, 0,
206    "Halt VM if all vcpus execute HLT with interrupts disabled");
207
208static int vmm_ipinum;
209SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
210    "IPI vector used for vcpu notifications");
211
212static void
213vcpu_cleanup(struct vm *vm, int i, bool destroy)
214{
215	struct vcpu *vcpu = &vm->vcpu[i];
216
217	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
218	if (destroy) {
219		vmm_stat_free(vcpu->stats);
220		fpu_save_area_free(vcpu->guestfpu);
221	}
222}
223
224static void
225vcpu_init(struct vm *vm, int vcpu_id, bool create)
226{
227	struct vcpu *vcpu;
228
229	KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
230	    ("vcpu_init: invalid vcpu %d", vcpu_id));
231
232	vcpu = &vm->vcpu[vcpu_id];
233
234	if (create) {
235		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
236		    "initialized", vcpu_id));
237		vcpu_lock_init(vcpu);
238		vcpu->state = VCPU_IDLE;
239		vcpu->hostcpu = NOCPU;
240		vcpu->guestfpu = fpu_save_area_alloc();
241		vcpu->stats = vmm_stat_alloc();
242	}
243
244	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
245	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
246	vcpu->exitintinfo = 0;
247	vcpu->nmi_pending = 0;
248	vcpu->extint_pending = 0;
249	vcpu->exception_pending = 0;
250	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
251	fpu_save_area_reset(vcpu->guestfpu);
252	vmm_stat_init(vcpu->stats);
253	guest_msrs_init(vm, vcpu_id);
254}
255
256struct vm_exit *
257vm_exitinfo(struct vm *vm, int cpuid)
258{
259	struct vcpu *vcpu;
260
261	if (cpuid < 0 || cpuid >= VM_MAXCPU)
262		panic("vm_exitinfo: invalid cpuid %d", cpuid);
263
264	vcpu = &vm->vcpu[cpuid];
265
266	return (&vcpu->exitinfo);
267}
268
269static void
270vmm_resume(void)
271{
272	VMM_RESUME();
273}
274
275static int
276vmm_init(void)
277{
278	int error;
279
280	vmm_host_state_init();
281
282	vmm_ipinum = vmm_ipi_alloc();
283	if (vmm_ipinum == 0)
284		vmm_ipinum = IPI_AST;
285
286	error = vmm_mem_init();
287	if (error)
288		return (error);
289
290	if (vmm_is_intel())
291		ops = &vmm_ops_intel;
292	else if (vmm_is_amd())
293		ops = &vmm_ops_amd;
294	else
295		return (ENXIO);
296
297	vmm_msr_init();
298	vmm_resume_p = vmm_resume;
299
300	return (VMM_INIT(vmm_ipinum));
301}
302
303static int
304vmm_handler(module_t mod, int what, void *arg)
305{
306	int error;
307
308	switch (what) {
309	case MOD_LOAD:
310		vmmdev_init();
311		if (ppt_avail_devices() > 0)
312			iommu_init();
313		error = vmm_init();
314		if (error == 0)
315			vmm_initialized = 1;
316		break;
317	case MOD_UNLOAD:
318		error = vmmdev_cleanup();
319		if (error == 0) {
320			vmm_resume_p = NULL;
321			iommu_cleanup();
322			if (vmm_ipinum != IPI_AST)
323				vmm_ipi_free(vmm_ipinum);
324			error = VMM_CLEANUP();
325			/*
326			 * Something bad happened - prevent new
327			 * VMs from being created
328			 */
329			if (error)
330				vmm_initialized = 0;
331		}
332		break;
333	default:
334		error = 0;
335		break;
336	}
337	return (error);
338}
339
340static moduledata_t vmm_kmod = {
341	"vmm",
342	vmm_handler,
343	NULL
344};
345
346/*
347 * vmm initialization has the following dependencies:
348 *
349 * - iommu initialization must happen after the pci passthru driver has had
350 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
351 *
352 * - VT-x initialization requires smp_rendezvous() and therefore must happen
353 *   after SMP is fully functional (after SI_SUB_SMP).
354 */
355DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
356MODULE_VERSION(vmm, 1);
357
358static void
359vm_init(struct vm *vm, bool create)
360{
361	int i;
362
363	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
364	vm->iommu = NULL;
365	vm->vioapic = vioapic_init(vm);
366	vm->vhpet = vhpet_init(vm);
367	vm->vatpic = vatpic_init(vm);
368	vm->vatpit = vatpit_init(vm);
369
370	CPU_ZERO(&vm->active_cpus);
371
372	vm->suspend = 0;
373	CPU_ZERO(&vm->suspended_cpus);
374
375	for (i = 0; i < VM_MAXCPU; i++)
376		vcpu_init(vm, i, create);
377}
378
379int
380vm_create(const char *name, struct vm **retvm)
381{
382	struct vm *vm;
383	struct vmspace *vmspace;
384
385	/*
386	 * If vmm.ko could not be successfully initialized then don't attempt
387	 * to create the virtual machine.
388	 */
389	if (!vmm_initialized)
390		return (ENXIO);
391
392	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
393		return (EINVAL);
394
395	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
396	if (vmspace == NULL)
397		return (ENOMEM);
398
399	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
400	strcpy(vm->name, name);
401	vm->num_mem_segs = 0;
402	vm->vmspace = vmspace;
403	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
404
405	vm_init(vm, true);
406
407	*retvm = vm;
408	return (0);
409}
410
411static void
412vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
413{
414
415	if (seg->object != NULL)
416		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
417
418	bzero(seg, sizeof(*seg));
419}
420
421static void
422vm_cleanup(struct vm *vm, bool destroy)
423{
424	int i;
425
426	ppt_unassign_all(vm);
427
428	if (vm->iommu != NULL)
429		iommu_destroy_domain(vm->iommu);
430
431	vatpit_cleanup(vm->vatpit);
432	vhpet_cleanup(vm->vhpet);
433	vatpic_cleanup(vm->vatpic);
434	vioapic_cleanup(vm->vioapic);
435
436	for (i = 0; i < VM_MAXCPU; i++)
437		vcpu_cleanup(vm, i, destroy);
438
439	VMCLEANUP(vm->cookie);
440
441	if (destroy) {
442		for (i = 0; i < vm->num_mem_segs; i++)
443			vm_free_mem_seg(vm, &vm->mem_segs[i]);
444
445		vm->num_mem_segs = 0;
446
447		VMSPACE_FREE(vm->vmspace);
448		vm->vmspace = NULL;
449	}
450}
451
452void
453vm_destroy(struct vm *vm)
454{
455	vm_cleanup(vm, true);
456	free(vm, M_VM);
457}
458
459int
460vm_reinit(struct vm *vm)
461{
462	int error;
463
464	/*
465	 * A virtual machine can be reset only if all vcpus are suspended.
466	 */
467	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
468		vm_cleanup(vm, false);
469		vm_init(vm, false);
470		error = 0;
471	} else {
472		error = EBUSY;
473	}
474
475	return (error);
476}
477
478const char *
479vm_name(struct vm *vm)
480{
481	return (vm->name);
482}
483
484int
485vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
486{
487	vm_object_t obj;
488
489	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
490		return (ENOMEM);
491	else
492		return (0);
493}
494
495int
496vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
497{
498
499	vmm_mmio_free(vm->vmspace, gpa, len);
500	return (0);
501}
502
503boolean_t
504vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
505{
506	int i;
507	vm_paddr_t gpabase, gpalimit;
508
509	for (i = 0; i < vm->num_mem_segs; i++) {
510		gpabase = vm->mem_segs[i].gpa;
511		gpalimit = gpabase + vm->mem_segs[i].len;
512		if (gpa >= gpabase && gpa < gpalimit)
513			return (TRUE);		/* 'gpa' is regular memory */
514	}
515
516	if (ppt_is_mmio(vm, gpa))
517		return (TRUE);			/* 'gpa' is pci passthru mmio */
518
519	return (FALSE);
520}
521
522int
523vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
524{
525	int available, allocated;
526	struct mem_seg *seg;
527	vm_object_t object;
528	vm_paddr_t g;
529
530	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
531		return (EINVAL);
532
533	available = allocated = 0;
534	g = gpa;
535	while (g < gpa + len) {
536		if (vm_mem_allocated(vm, g))
537			allocated++;
538		else
539			available++;
540
541		g += PAGE_SIZE;
542	}
543
544	/*
545	 * If there are some allocated and some available pages in the address
546	 * range then it is an error.
547	 */
548	if (allocated && available)
549		return (EINVAL);
550
551	/*
552	 * If the entire address range being requested has already been
553	 * allocated then there isn't anything more to do.
554	 */
555	if (allocated && available == 0)
556		return (0);
557
558	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
559		return (E2BIG);
560
561	seg = &vm->mem_segs[vm->num_mem_segs];
562
563	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
564		return (ENOMEM);
565
566	seg->gpa = gpa;
567	seg->len = len;
568	seg->object = object;
569	seg->wired = FALSE;
570
571	vm->num_mem_segs++;
572
573	return (0);
574}
575
576static vm_paddr_t
577vm_maxmem(struct vm *vm)
578{
579	int i;
580	vm_paddr_t gpa, maxmem;
581
582	maxmem = 0;
583	for (i = 0; i < vm->num_mem_segs; i++) {
584		gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
585		if (gpa > maxmem)
586			maxmem = gpa;
587	}
588	return (maxmem);
589}
590
591static void
592vm_gpa_unwire(struct vm *vm)
593{
594	int i, rv;
595	struct mem_seg *seg;
596
597	for (i = 0; i < vm->num_mem_segs; i++) {
598		seg = &vm->mem_segs[i];
599		if (!seg->wired)
600			continue;
601
602		rv = vm_map_unwire(&vm->vmspace->vm_map,
603				   seg->gpa, seg->gpa + seg->len,
604				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
605		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
606		    "%#lx/%ld could not be unwired: %d",
607		    vm_name(vm), seg->gpa, seg->len, rv));
608
609		seg->wired = FALSE;
610	}
611}
612
613static int
614vm_gpa_wire(struct vm *vm)
615{
616	int i, rv;
617	struct mem_seg *seg;
618
619	for (i = 0; i < vm->num_mem_segs; i++) {
620		seg = &vm->mem_segs[i];
621		if (seg->wired)
622			continue;
623
624		/* XXX rlimits? */
625		rv = vm_map_wire(&vm->vmspace->vm_map,
626				 seg->gpa, seg->gpa + seg->len,
627				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
628		if (rv != KERN_SUCCESS)
629			break;
630
631		seg->wired = TRUE;
632	}
633
634	if (i < vm->num_mem_segs) {
635		/*
636		 * Undo the wiring before returning an error.
637		 */
638		vm_gpa_unwire(vm);
639		return (EAGAIN);
640	}
641
642	return (0);
643}
644
645static void
646vm_iommu_modify(struct vm *vm, boolean_t map)
647{
648	int i, sz;
649	vm_paddr_t gpa, hpa;
650	struct mem_seg *seg;
651	void *vp, *cookie, *host_domain;
652
653	sz = PAGE_SIZE;
654	host_domain = iommu_host_domain();
655
656	for (i = 0; i < vm->num_mem_segs; i++) {
657		seg = &vm->mem_segs[i];
658		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
659		    vm_name(vm), seg->gpa, seg->len));
660
661		gpa = seg->gpa;
662		while (gpa < seg->gpa + seg->len) {
663			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
664					 &cookie);
665			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
666			    vm_name(vm), gpa));
667
668			vm_gpa_release(cookie);
669
670			hpa = DMAP_TO_PHYS((uintptr_t)vp);
671			if (map) {
672				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
673				iommu_remove_mapping(host_domain, hpa, sz);
674			} else {
675				iommu_remove_mapping(vm->iommu, gpa, sz);
676				iommu_create_mapping(host_domain, hpa, hpa, sz);
677			}
678
679			gpa += PAGE_SIZE;
680		}
681	}
682
683	/*
684	 * Invalidate the cached translations associated with the domain
685	 * from which pages were removed.
686	 */
687	if (map)
688		iommu_invalidate_tlb(host_domain);
689	else
690		iommu_invalidate_tlb(vm->iommu);
691}
692
693#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
694#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
695
696int
697vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
698{
699	int error;
700
701	error = ppt_unassign_device(vm, bus, slot, func);
702	if (error)
703		return (error);
704
705	if (ppt_assigned_devices(vm) == 0) {
706		vm_iommu_unmap(vm);
707		vm_gpa_unwire(vm);
708	}
709	return (0);
710}
711
712int
713vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
714{
715	int error;
716	vm_paddr_t maxaddr;
717
718	/*
719	 * Virtual machines with pci passthru devices get special treatment:
720	 * - the guest physical memory is wired
721	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
722	 *
723	 * We need to do this before the first pci passthru device is attached.
724	 */
725	if (ppt_assigned_devices(vm) == 0) {
726		KASSERT(vm->iommu == NULL,
727		    ("vm_assign_pptdev: iommu must be NULL"));
728		maxaddr = vm_maxmem(vm);
729		vm->iommu = iommu_create_domain(maxaddr);
730
731		error = vm_gpa_wire(vm);
732		if (error)
733			return (error);
734
735		vm_iommu_map(vm);
736	}
737
738	error = ppt_assign_device(vm, bus, slot, func);
739	return (error);
740}
741
742void *
743vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
744	    void **cookie)
745{
746	int count, pageoff;
747	vm_page_t m;
748
749	pageoff = gpa & PAGE_MASK;
750	if (len > PAGE_SIZE - pageoff)
751		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
752
753	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
754	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
755
756	if (count == 1) {
757		*cookie = m;
758		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
759	} else {
760		*cookie = NULL;
761		return (NULL);
762	}
763}
764
765void
766vm_gpa_release(void *cookie)
767{
768	vm_page_t m = cookie;
769
770	vm_page_lock(m);
771	vm_page_unhold(m);
772	vm_page_unlock(m);
773}
774
775int
776vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
777		  struct vm_memory_segment *seg)
778{
779	int i;
780
781	for (i = 0; i < vm->num_mem_segs; i++) {
782		if (gpabase == vm->mem_segs[i].gpa) {
783			seg->gpa = vm->mem_segs[i].gpa;
784			seg->len = vm->mem_segs[i].len;
785			seg->wired = vm->mem_segs[i].wired;
786			return (0);
787		}
788	}
789	return (-1);
790}
791
792int
793vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
794	      vm_offset_t *offset, struct vm_object **object)
795{
796	int i;
797	size_t seg_len;
798	vm_paddr_t seg_gpa;
799	vm_object_t seg_obj;
800
801	for (i = 0; i < vm->num_mem_segs; i++) {
802		if ((seg_obj = vm->mem_segs[i].object) == NULL)
803			continue;
804
805		seg_gpa = vm->mem_segs[i].gpa;
806		seg_len = vm->mem_segs[i].len;
807
808		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
809			*offset = gpa - seg_gpa;
810			*object = seg_obj;
811			vm_object_reference(seg_obj);
812			return (0);
813		}
814	}
815
816	return (EINVAL);
817}
818
819int
820vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
821{
822
823	if (vcpu < 0 || vcpu >= VM_MAXCPU)
824		return (EINVAL);
825
826	if (reg >= VM_REG_LAST)
827		return (EINVAL);
828
829	return (VMGETREG(vm->cookie, vcpu, reg, retval));
830}
831
832int
833vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
834{
835
836	if (vcpu < 0 || vcpu >= VM_MAXCPU)
837		return (EINVAL);
838
839	if (reg >= VM_REG_LAST)
840		return (EINVAL);
841
842	return (VMSETREG(vm->cookie, vcpu, reg, val));
843}
844
845static boolean_t
846is_descriptor_table(int reg)
847{
848
849	switch (reg) {
850	case VM_REG_GUEST_IDTR:
851	case VM_REG_GUEST_GDTR:
852		return (TRUE);
853	default:
854		return (FALSE);
855	}
856}
857
858static boolean_t
859is_segment_register(int reg)
860{
861
862	switch (reg) {
863	case VM_REG_GUEST_ES:
864	case VM_REG_GUEST_CS:
865	case VM_REG_GUEST_SS:
866	case VM_REG_GUEST_DS:
867	case VM_REG_GUEST_FS:
868	case VM_REG_GUEST_GS:
869	case VM_REG_GUEST_TR:
870	case VM_REG_GUEST_LDTR:
871		return (TRUE);
872	default:
873		return (FALSE);
874	}
875}
876
877int
878vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
879		struct seg_desc *desc)
880{
881
882	if (vcpu < 0 || vcpu >= VM_MAXCPU)
883		return (EINVAL);
884
885	if (!is_segment_register(reg) && !is_descriptor_table(reg))
886		return (EINVAL);
887
888	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
889}
890
891int
892vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
893		struct seg_desc *desc)
894{
895	if (vcpu < 0 || vcpu >= VM_MAXCPU)
896		return (EINVAL);
897
898	if (!is_segment_register(reg) && !is_descriptor_table(reg))
899		return (EINVAL);
900
901	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
902}
903
904static void
905restore_guest_fpustate(struct vcpu *vcpu)
906{
907
908	/* flush host state to the pcb */
909	fpuexit(curthread);
910
911	/* restore guest FPU state */
912	fpu_stop_emulating();
913	fpurestore(vcpu->guestfpu);
914
915	/* restore guest XCR0 if XSAVE is enabled in the host */
916	if (rcr4() & CR4_XSAVE)
917		load_xcr(0, vcpu->guest_xcr0);
918
919	/*
920	 * The FPU is now "dirty" with the guest's state so turn on emulation
921	 * to trap any access to the FPU by the host.
922	 */
923	fpu_start_emulating();
924}
925
926static void
927save_guest_fpustate(struct vcpu *vcpu)
928{
929
930	if ((rcr0() & CR0_TS) == 0)
931		panic("fpu emulation not enabled in host!");
932
933	/* save guest XCR0 and restore host XCR0 */
934	if (rcr4() & CR4_XSAVE) {
935		vcpu->guest_xcr0 = rxcr(0);
936		load_xcr(0, vmm_get_host_xcr0());
937	}
938
939	/* save guest FPU state */
940	fpu_stop_emulating();
941	fpusave(vcpu->guestfpu);
942	fpu_start_emulating();
943}
944
945static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
946
947static int
948vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
949    bool from_idle)
950{
951	int error;
952
953	vcpu_assert_locked(vcpu);
954
955	/*
956	 * State transitions from the vmmdev_ioctl() must always begin from
957	 * the VCPU_IDLE state. This guarantees that there is only a single
958	 * ioctl() operating on a vcpu at any point.
959	 */
960	if (from_idle) {
961		while (vcpu->state != VCPU_IDLE)
962			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
963	} else {
964		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
965		    "vcpu idle state"));
966	}
967
968	if (vcpu->state == VCPU_RUNNING) {
969		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
970		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
971	} else {
972		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
973		    "vcpu that is not running", vcpu->hostcpu));
974	}
975
976	/*
977	 * The following state transitions are allowed:
978	 * IDLE -> FROZEN -> IDLE
979	 * FROZEN -> RUNNING -> FROZEN
980	 * FROZEN -> SLEEPING -> FROZEN
981	 */
982	switch (vcpu->state) {
983	case VCPU_IDLE:
984	case VCPU_RUNNING:
985	case VCPU_SLEEPING:
986		error = (newstate != VCPU_FROZEN);
987		break;
988	case VCPU_FROZEN:
989		error = (newstate == VCPU_FROZEN);
990		break;
991	default:
992		error = 1;
993		break;
994	}
995
996	if (error)
997		return (EBUSY);
998
999	vcpu->state = newstate;
1000	if (newstate == VCPU_RUNNING)
1001		vcpu->hostcpu = curcpu;
1002	else
1003		vcpu->hostcpu = NOCPU;
1004
1005	if (newstate == VCPU_IDLE)
1006		wakeup(&vcpu->state);
1007
1008	return (0);
1009}
1010
1011static void
1012vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1013{
1014	int error;
1015
1016	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1017		panic("Error %d setting state to %d\n", error, newstate);
1018}
1019
1020static void
1021vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1022{
1023	int error;
1024
1025	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1026		panic("Error %d setting state to %d", error, newstate);
1027}
1028
1029static void
1030vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
1031{
1032
1033	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
1034
1035	/*
1036	 * Update 'rendezvous_func' and execute a write memory barrier to
1037	 * ensure that it is visible across all host cpus. This is not needed
1038	 * for correctness but it does ensure that all the vcpus will notice
1039	 * that the rendezvous is requested immediately.
1040	 */
1041	vm->rendezvous_func = func;
1042	wmb();
1043}
1044
1045#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
1046	do {								\
1047		if (vcpuid >= 0)					\
1048			VCPU_CTR0(vm, vcpuid, fmt);			\
1049		else							\
1050			VM_CTR0(vm, fmt);				\
1051	} while (0)
1052
1053static void
1054vm_handle_rendezvous(struct vm *vm, int vcpuid)
1055{
1056
1057	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1058	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
1059
1060	mtx_lock(&vm->rendezvous_mtx);
1061	while (vm->rendezvous_func != NULL) {
1062		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
1063		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
1064
1065		if (vcpuid != -1 &&
1066		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
1067		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
1068			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
1069			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
1070			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1071		}
1072		if (CPU_CMP(&vm->rendezvous_req_cpus,
1073		    &vm->rendezvous_done_cpus) == 0) {
1074			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
1075			vm_set_rendezvous_func(vm, NULL);
1076			wakeup(&vm->rendezvous_func);
1077			break;
1078		}
1079		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
1080		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1081		    "vmrndv", 0);
1082	}
1083	mtx_unlock(&vm->rendezvous_mtx);
1084}
1085
1086/*
1087 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1088 */
1089static int
1090vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1091{
1092	struct vcpu *vcpu;
1093	const char *wmesg;
1094	int t, vcpu_halted, vm_halted;
1095
1096	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1097
1098	vcpu = &vm->vcpu[vcpuid];
1099	vcpu_halted = 0;
1100	vm_halted = 0;
1101
1102	vcpu_lock(vcpu);
1103	while (1) {
1104		/*
1105		 * Do a final check for pending NMI or interrupts before
1106		 * really putting this thread to sleep. Also check for
1107		 * software events that would cause this vcpu to wakeup.
1108		 *
1109		 * These interrupts/events could have happened after the
1110		 * vcpu returned from VMRUN() and before it acquired the
1111		 * vcpu lock above.
1112		 */
1113		if (vm->rendezvous_func != NULL || vm->suspend)
1114			break;
1115		if (vm_nmi_pending(vm, vcpuid))
1116			break;
1117		if (!intr_disabled) {
1118			if (vm_extint_pending(vm, vcpuid) ||
1119			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1120				break;
1121			}
1122		}
1123
1124		/* Don't go to sleep if the vcpu thread needs to yield */
1125		if (vcpu_should_yield(vm, vcpuid))
1126			break;
1127
1128		/*
1129		 * Some Linux guests implement "halt" by having all vcpus
1130		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1131		 * track of the vcpus that have entered this state. When all
1132		 * vcpus enter the halted state the virtual machine is halted.
1133		 */
1134		if (intr_disabled) {
1135			wmesg = "vmhalt";
1136			VCPU_CTR0(vm, vcpuid, "Halted");
1137			if (!vcpu_halted && halt_detection_enabled) {
1138				vcpu_halted = 1;
1139				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1140			}
1141			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1142				vm_halted = 1;
1143				break;
1144			}
1145		} else {
1146			wmesg = "vmidle";
1147		}
1148
1149		t = ticks;
1150		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1151		/*
1152		 * XXX msleep_spin() cannot be interrupted by signals so
1153		 * wake up periodically to check pending signals.
1154		 */
1155		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1156		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1157		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1158	}
1159
1160	if (vcpu_halted)
1161		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1162
1163	vcpu_unlock(vcpu);
1164
1165	if (vm_halted)
1166		vm_suspend(vm, VM_SUSPEND_HALT);
1167
1168	return (0);
1169}
1170
1171static int
1172vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1173{
1174	int rv, ftype;
1175	struct vm_map *map;
1176	struct vcpu *vcpu;
1177	struct vm_exit *vme;
1178
1179	vcpu = &vm->vcpu[vcpuid];
1180	vme = &vcpu->exitinfo;
1181
1182	ftype = vme->u.paging.fault_type;
1183	KASSERT(ftype == VM_PROT_READ ||
1184	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1185	    ("vm_handle_paging: invalid fault_type %d", ftype));
1186
1187	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1188		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1189		    vme->u.paging.gpa, ftype);
1190		if (rv == 0)
1191			goto done;
1192	}
1193
1194	map = &vm->vmspace->vm_map;
1195	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1196
1197	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1198	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1199
1200	if (rv != KERN_SUCCESS)
1201		return (EFAULT);
1202done:
1203	/* restart execution at the faulting instruction */
1204	vme->inst_length = 0;
1205
1206	return (0);
1207}
1208
1209static int
1210vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1211{
1212	struct vie *vie;
1213	struct vcpu *vcpu;
1214	struct vm_exit *vme;
1215	uint64_t gla, gpa;
1216	struct vm_guest_paging *paging;
1217	mem_region_read_t mread;
1218	mem_region_write_t mwrite;
1219	enum vm_cpu_mode cpu_mode;
1220	int cs_d, error;
1221
1222	vcpu = &vm->vcpu[vcpuid];
1223	vme = &vcpu->exitinfo;
1224
1225	gla = vme->u.inst_emul.gla;
1226	gpa = vme->u.inst_emul.gpa;
1227	cs_d = vme->u.inst_emul.cs_d;
1228	vie = &vme->u.inst_emul.vie;
1229	paging = &vme->u.inst_emul.paging;
1230	cpu_mode = paging->cpu_mode;
1231
1232	vie_init(vie);
1233
1234	/* Fetch, decode and emulate the faulting instruction */
1235	error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
1236	    vme->inst_length, vie);
1237	if (error == 1)
1238		return (0);		/* Resume guest to handle page fault */
1239	else if (error == -1)
1240		return (EFAULT);
1241	else if (error != 0)
1242		panic("%s: vmm_fetch_instruction error %d", __func__, error);
1243
1244	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
1245		return (EFAULT);
1246
1247	/* return to userland unless this is an in-kernel emulated device */
1248	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1249		mread = lapic_mmio_read;
1250		mwrite = lapic_mmio_write;
1251	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1252		mread = vioapic_mmio_read;
1253		mwrite = vioapic_mmio_write;
1254	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1255		mread = vhpet_mmio_read;
1256		mwrite = vhpet_mmio_write;
1257	} else {
1258		*retu = true;
1259		return (0);
1260	}
1261
1262	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
1263	    mread, mwrite, retu);
1264
1265	return (error);
1266}
1267
1268static int
1269vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
1270{
1271	int i, done;
1272	struct vcpu *vcpu;
1273
1274	done = 0;
1275	vcpu = &vm->vcpu[vcpuid];
1276
1277	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1278
1279	/*
1280	 * Wait until all 'active_cpus' have suspended themselves.
1281	 *
1282	 * Since a VM may be suspended at any time including when one or
1283	 * more vcpus are doing a rendezvous we need to call the rendezvous
1284	 * handler while we are waiting to prevent a deadlock.
1285	 */
1286	vcpu_lock(vcpu);
1287	while (1) {
1288		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1289			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1290			break;
1291		}
1292
1293		if (vm->rendezvous_func == NULL) {
1294			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1295			vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1296			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1297			vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1298		} else {
1299			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1300			vcpu_unlock(vcpu);
1301			vm_handle_rendezvous(vm, vcpuid);
1302			vcpu_lock(vcpu);
1303		}
1304	}
1305	vcpu_unlock(vcpu);
1306
1307	/*
1308	 * Wakeup the other sleeping vcpus and return to userspace.
1309	 */
1310	for (i = 0; i < VM_MAXCPU; i++) {
1311		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1312			vcpu_notify_event(vm, i, false);
1313		}
1314	}
1315
1316	*retu = true;
1317	return (0);
1318}
1319
1320int
1321vm_suspend(struct vm *vm, enum vm_suspend_how how)
1322{
1323	int i;
1324
1325	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1326		return (EINVAL);
1327
1328	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1329		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1330		    vm->suspend, how);
1331		return (EALREADY);
1332	}
1333
1334	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1335
1336	/*
1337	 * Notify all active vcpus that they are now suspended.
1338	 */
1339	for (i = 0; i < VM_MAXCPU; i++) {
1340		if (CPU_ISSET(i, &vm->active_cpus))
1341			vcpu_notify_event(vm, i, false);
1342	}
1343
1344	return (0);
1345}
1346
1347void
1348vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1349{
1350	struct vm_exit *vmexit;
1351
1352	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1353	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1354
1355	vmexit = vm_exitinfo(vm, vcpuid);
1356	vmexit->rip = rip;
1357	vmexit->inst_length = 0;
1358	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1359	vmexit->u.suspended.how = vm->suspend;
1360}
1361
1362void
1363vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
1364{
1365	struct vm_exit *vmexit;
1366
1367	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
1368
1369	vmexit = vm_exitinfo(vm, vcpuid);
1370	vmexit->rip = rip;
1371	vmexit->inst_length = 0;
1372	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1373	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
1374}
1375
1376void
1377vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1378{
1379	struct vm_exit *vmexit;
1380
1381	vmexit = vm_exitinfo(vm, vcpuid);
1382	vmexit->rip = rip;
1383	vmexit->inst_length = 0;
1384	vmexit->exitcode = VM_EXITCODE_BOGUS;
1385	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1386}
1387
1388int
1389vm_run(struct vm *vm, struct vm_run *vmrun)
1390{
1391	int error, vcpuid;
1392	struct vcpu *vcpu;
1393	struct pcb *pcb;
1394	uint64_t tscval, rip;
1395	struct vm_exit *vme;
1396	bool retu, intr_disabled;
1397	pmap_t pmap;
1398	void *rptr, *sptr;
1399
1400	vcpuid = vmrun->cpuid;
1401
1402	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1403		return (EINVAL);
1404
1405	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1406		return (EINVAL);
1407
1408	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1409		return (EINVAL);
1410
1411	rptr = &vm->rendezvous_func;
1412	sptr = &vm->suspend;
1413	pmap = vmspace_pmap(vm->vmspace);
1414	vcpu = &vm->vcpu[vcpuid];
1415	vme = &vcpu->exitinfo;
1416	rip = vmrun->rip;
1417restart:
1418	critical_enter();
1419
1420	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1421	    ("vm_run: absurd pm_active"));
1422
1423	tscval = rdtsc();
1424
1425	pcb = PCPU_GET(curpcb);
1426	set_pcb_flags(pcb, PCB_FULL_IRET);
1427
1428	restore_guest_msrs(vm, vcpuid);
1429	restore_guest_fpustate(vcpu);
1430
1431	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1432	error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
1433	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1434
1435	save_guest_fpustate(vcpu);
1436	restore_host_msrs(vm, vcpuid);
1437
1438	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1439
1440	critical_exit();
1441
1442	if (error == 0) {
1443		retu = false;
1444		switch (vme->exitcode) {
1445		case VM_EXITCODE_SUSPENDED:
1446			error = vm_handle_suspend(vm, vcpuid, &retu);
1447			break;
1448		case VM_EXITCODE_IOAPIC_EOI:
1449			vioapic_process_eoi(vm, vcpuid,
1450			    vme->u.ioapic_eoi.vector);
1451			break;
1452		case VM_EXITCODE_RENDEZVOUS:
1453			vm_handle_rendezvous(vm, vcpuid);
1454			error = 0;
1455			break;
1456		case VM_EXITCODE_HLT:
1457			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1458			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1459			break;
1460		case VM_EXITCODE_PAGING:
1461			error = vm_handle_paging(vm, vcpuid, &retu);
1462			break;
1463		case VM_EXITCODE_INST_EMUL:
1464			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1465			break;
1466		case VM_EXITCODE_INOUT:
1467		case VM_EXITCODE_INOUT_STR:
1468			error = vm_handle_inout(vm, vcpuid, vme, &retu);
1469			break;
1470		default:
1471			retu = true;	/* handled in userland */
1472			break;
1473		}
1474	}
1475
1476	if (error == 0 && retu == false) {
1477		rip = vme->rip + vme->inst_length;
1478		goto restart;
1479	}
1480
1481	/* copy the exit information */
1482	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1483	return (error);
1484}
1485
1486int
1487vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
1488{
1489	struct vcpu *vcpu;
1490	int type, vector;
1491
1492	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1493		return (EINVAL);
1494
1495	vcpu = &vm->vcpu[vcpuid];
1496
1497	if (info & VM_INTINFO_VALID) {
1498		type = info & VM_INTINFO_TYPE;
1499		vector = info & 0xff;
1500		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
1501			return (EINVAL);
1502		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
1503			return (EINVAL);
1504		if (info & VM_INTINFO_RSVD)
1505			return (EINVAL);
1506	} else {
1507		info = 0;
1508	}
1509	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
1510	vcpu->exitintinfo = info;
1511	return (0);
1512}
1513
1514enum exc_class {
1515	EXC_BENIGN,
1516	EXC_CONTRIBUTORY,
1517	EXC_PAGEFAULT
1518};
1519
1520#define	IDT_VE	20	/* Virtualization Exception (Intel specific) */
1521
1522static enum exc_class
1523exception_class(uint64_t info)
1524{
1525	int type, vector;
1526
1527	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
1528	type = info & VM_INTINFO_TYPE;
1529	vector = info & 0xff;
1530
1531	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
1532	switch (type) {
1533	case VM_INTINFO_HWINTR:
1534	case VM_INTINFO_SWINTR:
1535	case VM_INTINFO_NMI:
1536		return (EXC_BENIGN);
1537	default:
1538		/*
1539		 * Hardware exception.
1540		 *
1541		 * SVM and VT-x use identical type values to represent NMI,
1542		 * hardware interrupt and software interrupt.
1543		 *
1544		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
1545		 * for exceptions except #BP and #OF. #BP and #OF use a type
1546		 * value of '5' or '6'. Therefore we don't check for explicit
1547		 * values of 'type' to classify 'intinfo' into a hardware
1548		 * exception.
1549		 */
1550		break;
1551	}
1552
1553	switch (vector) {
1554	case IDT_PF:
1555	case IDT_VE:
1556		return (EXC_PAGEFAULT);
1557	case IDT_DE:
1558	case IDT_TS:
1559	case IDT_NP:
1560	case IDT_SS:
1561	case IDT_GP:
1562		return (EXC_CONTRIBUTORY);
1563	default:
1564		return (EXC_BENIGN);
1565	}
1566}
1567
1568static int
1569nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
1570    uint64_t *retinfo)
1571{
1572	enum exc_class exc1, exc2;
1573	int type1, vector1;
1574
1575	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
1576	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
1577
1578	/*
1579	 * If an exception occurs while attempting to call the double-fault
1580	 * handler the processor enters shutdown mode (aka triple fault).
1581	 */
1582	type1 = info1 & VM_INTINFO_TYPE;
1583	vector1 = info1 & 0xff;
1584	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
1585		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
1586		    info1, info2);
1587		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
1588		*retinfo = 0;
1589		return (0);
1590	}
1591
1592	/*
1593	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
1594	 */
1595	exc1 = exception_class(info1);
1596	exc2 = exception_class(info2);
1597	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
1598	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
1599		/* Convert nested fault into a double fault. */
1600		*retinfo = IDT_DF;
1601		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1602		*retinfo |= VM_INTINFO_DEL_ERRCODE;
1603	} else {
1604		/* Handle exceptions serially */
1605		*retinfo = info2;
1606	}
1607	return (1);
1608}
1609
1610static uint64_t
1611vcpu_exception_intinfo(struct vcpu *vcpu)
1612{
1613	uint64_t info = 0;
1614
1615	if (vcpu->exception_pending) {
1616		info = vcpu->exception.vector & 0xff;
1617		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1618		if (vcpu->exception.error_code_valid) {
1619			info |= VM_INTINFO_DEL_ERRCODE;
1620			info |= (uint64_t)vcpu->exception.error_code << 32;
1621		}
1622	}
1623	return (info);
1624}
1625
1626int
1627vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
1628{
1629	struct vcpu *vcpu;
1630	uint64_t info1, info2;
1631	int valid;
1632
1633	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
1634
1635	vcpu = &vm->vcpu[vcpuid];
1636
1637	info1 = vcpu->exitintinfo;
1638	vcpu->exitintinfo = 0;
1639
1640	info2 = 0;
1641	if (vcpu->exception_pending) {
1642		info2 = vcpu_exception_intinfo(vcpu);
1643		vcpu->exception_pending = 0;
1644		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
1645		    vcpu->exception.vector, info2);
1646	}
1647
1648	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
1649		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
1650	} else if (info1 & VM_INTINFO_VALID) {
1651		*retinfo = info1;
1652		valid = 1;
1653	} else if (info2 & VM_INTINFO_VALID) {
1654		*retinfo = info2;
1655		valid = 1;
1656	} else {
1657		valid = 0;
1658	}
1659
1660	if (valid) {
1661		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
1662		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
1663	}
1664
1665	return (valid);
1666}
1667
1668int
1669vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
1670{
1671	struct vcpu *vcpu;
1672
1673	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1674		return (EINVAL);
1675
1676	vcpu = &vm->vcpu[vcpuid];
1677	*info1 = vcpu->exitintinfo;
1678	*info2 = vcpu_exception_intinfo(vcpu);
1679	return (0);
1680}
1681
1682int
1683vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
1684{
1685	struct vcpu *vcpu;
1686
1687	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1688		return (EINVAL);
1689
1690	if (exception->vector < 0 || exception->vector >= 32)
1691		return (EINVAL);
1692
1693	/*
1694	 * A double fault exception should never be injected directly into
1695	 * the guest. It is a derived exception that results from specific
1696	 * combinations of nested faults.
1697	 */
1698	if (exception->vector == IDT_DF)
1699		return (EINVAL);
1700
1701	vcpu = &vm->vcpu[vcpuid];
1702
1703	if (vcpu->exception_pending) {
1704		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
1705		    "pending exception %d", exception->vector,
1706		    vcpu->exception.vector);
1707		return (EBUSY);
1708	}
1709
1710	vcpu->exception_pending = 1;
1711	vcpu->exception = *exception;
1712	VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
1713	return (0);
1714}
1715
1716void
1717vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
1718    int errcode)
1719{
1720	struct vm_exception exception;
1721	struct vm_exit *vmexit;
1722	struct vm *vm;
1723	int error;
1724
1725	vm = vmarg;
1726
1727	exception.vector = vector;
1728	exception.error_code = errcode;
1729	exception.error_code_valid = errcode_valid;
1730	error = vm_inject_exception(vm, vcpuid, &exception);
1731	KASSERT(error == 0, ("vm_inject_exception error %d", error));
1732
1733	/*
1734	 * A fault-like exception allows the instruction to be restarted
1735	 * after the exception handler returns.
1736	 *
1737	 * By setting the inst_length to 0 we ensure that the instruction
1738	 * pointer remains at the faulting instruction.
1739	 */
1740	vmexit = vm_exitinfo(vm, vcpuid);
1741	vmexit->inst_length = 0;
1742}
1743
1744void
1745vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
1746{
1747	struct vm *vm;
1748	int error;
1749
1750	vm = vmarg;
1751	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
1752	    error_code, cr2);
1753
1754	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
1755	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
1756
1757	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
1758}
1759
1760static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1761
1762int
1763vm_inject_nmi(struct vm *vm, int vcpuid)
1764{
1765	struct vcpu *vcpu;
1766
1767	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1768		return (EINVAL);
1769
1770	vcpu = &vm->vcpu[vcpuid];
1771
1772	vcpu->nmi_pending = 1;
1773	vcpu_notify_event(vm, vcpuid, false);
1774	return (0);
1775}
1776
1777int
1778vm_nmi_pending(struct vm *vm, int vcpuid)
1779{
1780	struct vcpu *vcpu;
1781
1782	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1783		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1784
1785	vcpu = &vm->vcpu[vcpuid];
1786
1787	return (vcpu->nmi_pending);
1788}
1789
1790void
1791vm_nmi_clear(struct vm *vm, int vcpuid)
1792{
1793	struct vcpu *vcpu;
1794
1795	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1796		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1797
1798	vcpu = &vm->vcpu[vcpuid];
1799
1800	if (vcpu->nmi_pending == 0)
1801		panic("vm_nmi_clear: inconsistent nmi_pending state");
1802
1803	vcpu->nmi_pending = 0;
1804	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1805}
1806
1807static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
1808
1809int
1810vm_inject_extint(struct vm *vm, int vcpuid)
1811{
1812	struct vcpu *vcpu;
1813
1814	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1815		return (EINVAL);
1816
1817	vcpu = &vm->vcpu[vcpuid];
1818
1819	vcpu->extint_pending = 1;
1820	vcpu_notify_event(vm, vcpuid, false);
1821	return (0);
1822}
1823
1824int
1825vm_extint_pending(struct vm *vm, int vcpuid)
1826{
1827	struct vcpu *vcpu;
1828
1829	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1830		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1831
1832	vcpu = &vm->vcpu[vcpuid];
1833
1834	return (vcpu->extint_pending);
1835}
1836
1837void
1838vm_extint_clear(struct vm *vm, int vcpuid)
1839{
1840	struct vcpu *vcpu;
1841
1842	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1843		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1844
1845	vcpu = &vm->vcpu[vcpuid];
1846
1847	if (vcpu->extint_pending == 0)
1848		panic("vm_extint_clear: inconsistent extint_pending state");
1849
1850	vcpu->extint_pending = 0;
1851	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
1852}
1853
1854int
1855vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1856{
1857	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1858		return (EINVAL);
1859
1860	if (type < 0 || type >= VM_CAP_MAX)
1861		return (EINVAL);
1862
1863	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1864}
1865
1866int
1867vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1868{
1869	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1870		return (EINVAL);
1871
1872	if (type < 0 || type >= VM_CAP_MAX)
1873		return (EINVAL);
1874
1875	return (VMSETCAP(vm->cookie, vcpu, type, val));
1876}
1877
1878uint64_t *
1879vm_guest_msrs(struct vm *vm, int cpu)
1880{
1881	return (vm->vcpu[cpu].guest_msrs);
1882}
1883
1884struct vlapic *
1885vm_lapic(struct vm *vm, int cpu)
1886{
1887	return (vm->vcpu[cpu].vlapic);
1888}
1889
1890struct vioapic *
1891vm_ioapic(struct vm *vm)
1892{
1893
1894	return (vm->vioapic);
1895}
1896
1897struct vhpet *
1898vm_hpet(struct vm *vm)
1899{
1900
1901	return (vm->vhpet);
1902}
1903
1904boolean_t
1905vmm_is_pptdev(int bus, int slot, int func)
1906{
1907	int found, i, n;
1908	int b, s, f;
1909	char *val, *cp, *cp2;
1910
1911	/*
1912	 * XXX
1913	 * The length of an environment variable is limited to 128 bytes which
1914	 * puts an upper limit on the number of passthru devices that may be
1915	 * specified using a single environment variable.
1916	 *
1917	 * Work around this by scanning multiple environment variable
1918	 * names instead of a single one - yuck!
1919	 */
1920	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1921
1922	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1923	found = 0;
1924	for (i = 0; names[i] != NULL && !found; i++) {
1925		cp = val = getenv(names[i]);
1926		while (cp != NULL && *cp != '\0') {
1927			if ((cp2 = strchr(cp, ' ')) != NULL)
1928				*cp2 = '\0';
1929
1930			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1931			if (n == 3 && bus == b && slot == s && func == f) {
1932				found = 1;
1933				break;
1934			}
1935
1936			if (cp2 != NULL)
1937				*cp2++ = ' ';
1938
1939			cp = cp2;
1940		}
1941		freeenv(val);
1942	}
1943	return (found);
1944}
1945
1946void *
1947vm_iommu_domain(struct vm *vm)
1948{
1949
1950	return (vm->iommu);
1951}
1952
1953int
1954vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1955    bool from_idle)
1956{
1957	int error;
1958	struct vcpu *vcpu;
1959
1960	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1961		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1962
1963	vcpu = &vm->vcpu[vcpuid];
1964
1965	vcpu_lock(vcpu);
1966	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1967	vcpu_unlock(vcpu);
1968
1969	return (error);
1970}
1971
1972enum vcpu_state
1973vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1974{
1975	struct vcpu *vcpu;
1976	enum vcpu_state state;
1977
1978	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1979		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1980
1981	vcpu = &vm->vcpu[vcpuid];
1982
1983	vcpu_lock(vcpu);
1984	state = vcpu->state;
1985	if (hostcpu != NULL)
1986		*hostcpu = vcpu->hostcpu;
1987	vcpu_unlock(vcpu);
1988
1989	return (state);
1990}
1991
1992int
1993vm_activate_cpu(struct vm *vm, int vcpuid)
1994{
1995
1996	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1997		return (EINVAL);
1998
1999	if (CPU_ISSET(vcpuid, &vm->active_cpus))
2000		return (EBUSY);
2001
2002	VCPU_CTR0(vm, vcpuid, "activated");
2003	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
2004	return (0);
2005}
2006
2007cpuset_t
2008vm_active_cpus(struct vm *vm)
2009{
2010
2011	return (vm->active_cpus);
2012}
2013
2014cpuset_t
2015vm_suspended_cpus(struct vm *vm)
2016{
2017
2018	return (vm->suspended_cpus);
2019}
2020
2021void *
2022vcpu_stats(struct vm *vm, int vcpuid)
2023{
2024
2025	return (vm->vcpu[vcpuid].stats);
2026}
2027
2028int
2029vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
2030{
2031	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2032		return (EINVAL);
2033
2034	*state = vm->vcpu[vcpuid].x2apic_state;
2035
2036	return (0);
2037}
2038
2039int
2040vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
2041{
2042	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2043		return (EINVAL);
2044
2045	if (state >= X2APIC_STATE_LAST)
2046		return (EINVAL);
2047
2048	vm->vcpu[vcpuid].x2apic_state = state;
2049
2050	vlapic_set_x2apic_state(vm, vcpuid, state);
2051
2052	return (0);
2053}
2054
2055/*
2056 * This function is called to ensure that a vcpu "sees" a pending event
2057 * as soon as possible:
2058 * - If the vcpu thread is sleeping then it is woken up.
2059 * - If the vcpu is running on a different host_cpu then an IPI will be directed
2060 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
2061 */
2062void
2063vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
2064{
2065	int hostcpu;
2066	struct vcpu *vcpu;
2067
2068	vcpu = &vm->vcpu[vcpuid];
2069
2070	vcpu_lock(vcpu);
2071	hostcpu = vcpu->hostcpu;
2072	if (vcpu->state == VCPU_RUNNING) {
2073		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
2074		if (hostcpu != curcpu) {
2075			if (lapic_intr) {
2076				vlapic_post_intr(vcpu->vlapic, hostcpu,
2077				    vmm_ipinum);
2078			} else {
2079				ipi_cpu(hostcpu, vmm_ipinum);
2080			}
2081		} else {
2082			/*
2083			 * If the 'vcpu' is running on 'curcpu' then it must
2084			 * be sending a notification to itself (e.g. SELF_IPI).
2085			 * The pending event will be picked up when the vcpu
2086			 * transitions back to guest context.
2087			 */
2088		}
2089	} else {
2090		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
2091		    "with hostcpu %d", vcpu->state, hostcpu));
2092		if (vcpu->state == VCPU_SLEEPING)
2093			wakeup_one(vcpu);
2094	}
2095	vcpu_unlock(vcpu);
2096}
2097
2098struct vmspace *
2099vm_get_vmspace(struct vm *vm)
2100{
2101
2102	return (vm->vmspace);
2103}
2104
2105int
2106vm_apicid2vcpuid(struct vm *vm, int apicid)
2107{
2108	/*
2109	 * XXX apic id is assumed to be numerically identical to vcpu id
2110	 */
2111	return (apicid);
2112}
2113
2114void
2115vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
2116    vm_rendezvous_func_t func, void *arg)
2117{
2118	int i;
2119
2120	/*
2121	 * Enforce that this function is called without any locks
2122	 */
2123	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
2124	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
2125	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
2126
2127restart:
2128	mtx_lock(&vm->rendezvous_mtx);
2129	if (vm->rendezvous_func != NULL) {
2130		/*
2131		 * If a rendezvous is already in progress then we need to
2132		 * call the rendezvous handler in case this 'vcpuid' is one
2133		 * of the targets of the rendezvous.
2134		 */
2135		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
2136		mtx_unlock(&vm->rendezvous_mtx);
2137		vm_handle_rendezvous(vm, vcpuid);
2138		goto restart;
2139	}
2140	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
2141	    "rendezvous is still in progress"));
2142
2143	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
2144	vm->rendezvous_req_cpus = dest;
2145	CPU_ZERO(&vm->rendezvous_done_cpus);
2146	vm->rendezvous_arg = arg;
2147	vm_set_rendezvous_func(vm, func);
2148	mtx_unlock(&vm->rendezvous_mtx);
2149
2150	/*
2151	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
2152	 * vcpus so they handle the rendezvous as soon as possible.
2153	 */
2154	for (i = 0; i < VM_MAXCPU; i++) {
2155		if (CPU_ISSET(i, &dest))
2156			vcpu_notify_event(vm, i, false);
2157	}
2158
2159	vm_handle_rendezvous(vm, vcpuid);
2160}
2161
2162struct vatpic *
2163vm_atpic(struct vm *vm)
2164{
2165	return (vm->vatpic);
2166}
2167
2168struct vatpit *
2169vm_atpit(struct vm *vm)
2170{
2171	return (vm->vatpit);
2172}
2173
2174enum vm_reg_name
2175vm_segment_name(int seg)
2176{
2177	static enum vm_reg_name seg_names[] = {
2178		VM_REG_GUEST_ES,
2179		VM_REG_GUEST_CS,
2180		VM_REG_GUEST_SS,
2181		VM_REG_GUEST_DS,
2182		VM_REG_GUEST_FS,
2183		VM_REG_GUEST_GS
2184	};
2185
2186	KASSERT(seg >= 0 && seg < nitems(seg_names),
2187	    ("%s: invalid segment encoding %d", __func__, seg));
2188	return (seg_names[seg]);
2189}
2190
2191void
2192vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
2193    int num_copyinfo)
2194{
2195	int idx;
2196
2197	for (idx = 0; idx < num_copyinfo; idx++) {
2198		if (copyinfo[idx].cookie != NULL)
2199			vm_gpa_release(copyinfo[idx].cookie);
2200	}
2201	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
2202}
2203
2204int
2205vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2206    uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
2207    int num_copyinfo)
2208{
2209	int error, idx, nused;
2210	size_t n, off, remaining;
2211	void *hva, *cookie;
2212	uint64_t gpa;
2213
2214	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
2215
2216	nused = 0;
2217	remaining = len;
2218	while (remaining > 0) {
2219		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
2220		error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
2221		if (error)
2222			return (error);
2223		off = gpa & PAGE_MASK;
2224		n = min(remaining, PAGE_SIZE - off);
2225		copyinfo[nused].gpa = gpa;
2226		copyinfo[nused].len = n;
2227		remaining -= n;
2228		gla += n;
2229		nused++;
2230	}
2231
2232	for (idx = 0; idx < nused; idx++) {
2233		hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
2234		    prot, &cookie);
2235		if (hva == NULL)
2236			break;
2237		copyinfo[idx].hva = hva;
2238		copyinfo[idx].cookie = cookie;
2239	}
2240
2241	if (idx != nused) {
2242		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
2243		return (-1);
2244	} else {
2245		return (0);
2246	}
2247}
2248
2249void
2250vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
2251    size_t len)
2252{
2253	char *dst;
2254	int idx;
2255
2256	dst = kaddr;
2257	idx = 0;
2258	while (len > 0) {
2259		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
2260		len -= copyinfo[idx].len;
2261		dst += copyinfo[idx].len;
2262		idx++;
2263	}
2264}
2265
2266void
2267vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
2268    struct vm_copyinfo *copyinfo, size_t len)
2269{
2270	const char *src;
2271	int idx;
2272
2273	src = kaddr;
2274	idx = 0;
2275	while (len > 0) {
2276		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
2277		len -= copyinfo[idx].len;
2278		src += copyinfo[idx].len;
2279		idx++;
2280	}
2281}
2282
2283/*
2284 * Return the amount of in-use and wired memory for the VM. Since
2285 * these are global stats, only return the values with for vCPU 0
2286 */
2287VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
2288VMM_STAT_DECLARE(VMM_MEM_WIRED);
2289
2290static void
2291vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2292{
2293
2294	if (vcpu == 0) {
2295		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
2296	       	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
2297	}
2298}
2299
2300static void
2301vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2302{
2303
2304	if (vcpu == 0) {
2305		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
2306	      	    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
2307	}
2308}
2309
2310VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
2311VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
2312