vmm.c revision 266393
1/*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm.c 266393 2014-05-18 04:33:24Z jhb $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm.c 266393 2014-05-18 04:33:24Z jhb $");
31
32#include <sys/param.h>
33#include <sys/systm.h>
34#include <sys/kernel.h>
35#include <sys/module.h>
36#include <sys/sysctl.h>
37#include <sys/malloc.h>
38#include <sys/pcpu.h>
39#include <sys/lock.h>
40#include <sys/mutex.h>
41#include <sys/proc.h>
42#include <sys/rwlock.h>
43#include <sys/sched.h>
44#include <sys/smp.h>
45#include <sys/systm.h>
46
47#include <vm/vm.h>
48#include <vm/vm_object.h>
49#include <vm/vm_page.h>
50#include <vm/pmap.h>
51#include <vm/vm_map.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_param.h>
54
55#include <machine/cpu.h>
56#include <machine/vm.h>
57#include <machine/pcb.h>
58#include <machine/smp.h>
59#include <x86/psl.h>
60#include <x86/apicreg.h>
61#include <machine/vmparam.h>
62
63#include <machine/vmm.h>
64#include <machine/vmm_dev.h>
65
66#include "vmm_ktr.h"
67#include "vmm_host.h"
68#include "vmm_mem.h"
69#include "vmm_util.h"
70#include "vhpet.h"
71#include "vioapic.h"
72#include "vlapic.h"
73#include "vmm_msr.h"
74#include "vmm_ipi.h"
75#include "vmm_stat.h"
76#include "vmm_lapic.h"
77
78#include "io/ppt.h"
79#include "io/iommu.h"
80
81struct vlapic;
82
/*
 * Per-vcpu state kept by the generic vmm layer. One instance per possible
 * vcpu is embedded in 'struct vm' (see the 'vcpu[VM_MAXCPU]' array).
 */
struct vcpu {
	int		flags;
	enum vcpu_state	state;		/* changed via vcpu_set_state_locked() */
	struct mtx	mtx;		/* spin mutex, see vcpu_lock_init() */
	int		hostcpu;	/* host cpuid this vcpu last ran on */
					/* (NOCPU whenever state != RUNNING) */
	uint64_t	guest_msrs[VMM_MSR_NUM];	/* see vm_guest_msrs() */
	struct vlapic	*vlapic;	/* created by VLAPIC_INIT() */
	int		 vcpuid;	/* index of this vcpu in vm->vcpu[] */
	struct savefpu	*guestfpu;	/* guest fpu state */
	void		*stats;		/* from vmm_stat_alloc() */
	struct vm_exit	exitinfo;	/* copied out to the caller by vm_run() */
	enum x2apic_state x2apic_state;
	int		nmi_pending;	/* set by vm_inject_nmi(), reset by */
					/* vm_nmi_clear() */
};
97
/*
 * Helpers for the per-vcpu spin mutex. 'v' is a 'struct vcpu *'.
 * vcpu_assert_locked() asserts that the current thread owns the mutex.
 */
#define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)		mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)	mtx_assert(&((v)->mtx), MA_OWNED)
102
/*
 * Describes one contiguous range of guest physical memory created by
 * vm_malloc().
 */
struct mem_seg {
	vm_paddr_t	gpa;		/* guest physical base address */
	size_t		len;		/* length of the segment in bytes */
	boolean_t	wired;		/* TRUE after vm_gpa_wire() succeeded */
	vm_object_t	object;		/* object returned by vmm_mem_alloc() */
};
/* Upper bound on the number of entries in 'vm->mem_segs[]'. */
#define	VM_MAX_MEMORY_SEGMENTS	2
110
/*
 * State of a single virtual machine as seen by the generic vmm layer.
 * Allocated and zeroed in vm_create(), released in vm_destroy().
 */
struct vm {
	void		*cookie;	/* processor-specific data */
	void		*iommu;		/* iommu-specific data */
	struct vhpet	*vhpet;		/* virtual HPET */
	struct vioapic	*vioapic;	/* virtual ioapic */
	struct vmspace	*vmspace;	/* guest's address space */
	struct vcpu	vcpu[VM_MAXCPU];
	int		num_mem_segs;	/* number of valid 'mem_segs' entries */
	struct mem_seg	mem_segs[VM_MAX_MEMORY_SEGMENTS];
	char		name[VM_MAX_NAMELEN];	/* NUL-terminated, see vm_name() */

	/*
	 * Set of active vcpus.
	 * An active vcpu is one that has been started implicitly (BSP) or
	 * explicitly (AP) by sending it a startup ipi.
	 */
	volatile cpuset_t active_cpus;

	/*
	 * Rendezvous bookkeeping, see vm_handle_rendezvous(). A rendezvous is
	 * in progress while 'rendezvous_func' is not NULL.
	 */
	struct mtx	rendezvous_mtx;
	cpuset_t	rendezvous_req_cpus;	/* vcpus that must run the func */
	cpuset_t	rendezvous_done_cpus;	/* vcpus that already ran it */
	void		*rendezvous_arg;	/* passed through to the func */
	vm_rendezvous_func_t rendezvous_func;
};
135
/*
 * Non-zero once vmm_init() has succeeded at module load; vm_create() returns
 * ENXIO while this is zero.
 */
static int vmm_initialized;

/*
 * Processor-specific backend selected in vmm_init(). Each wrapper macro
 * below tests 'ops' for NULL and evaluates to a fallback value (0, NULL or
 * ENXIO) when no backend has been selected.
 */
static struct vmm_ops *ops;
#define	VMM_INIT(num)	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()	(ops != NULL ? (*ops->resume)() : 0)

/* Per-VM and per-vcpu backend operations; 'vmi' is the value in vm->cookie. */
#define	VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, rptr) \
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr) : ENXIO)
#define	VMCLEANUP(vmi)	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max) \
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace) \
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)		\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)		\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMINJECT(vmi, vcpu, type, vec, ec, ecv)	\
	(ops != NULL ? (*ops->vminject)(vmi, vcpu, type, vec, ec, ecv) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)	\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)		\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu)			\
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic)		\
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)

/*
 * fpu_start_emulating() sets CR0.TS so that the host traps on FPU access;
 * fpu_stop_emulating() clears it again via clts().
 */
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()
172
/* malloc(9) type used for 'struct vm' allocations in this file. */
static MALLOC_DEFINE(M_VM, "vm", "vm");
CTASSERT(VMM_MSR_NUM <= 64);	/* msr_mask can keep track of up to 64 msrs */

/* statistics */
static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");

/* Root of the hw.vmm sysctl tree. */
SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);

/*
 * IPI vector chosen in vmm_init() and handed to the backend through
 * VMM_INIT(); exported read-only as hw.vmm.ipinum.
 */
static int vmm_ipinum;
SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
    "IPI vector used for vcpu notifications");

/* Forward declaration; used by vm_handle_hlt() before the definition. */
static void vm_deactivate_cpu(struct vm *vm, int vcpuid);
186
187static void
188vcpu_cleanup(struct vm *vm, int i)
189{
190	struct vcpu *vcpu = &vm->vcpu[i];
191
192	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
193	vmm_stat_free(vcpu->stats);
194	fpu_save_area_free(vcpu->guestfpu);
195}
196
/*
 * Initialize the generic per-vcpu state for 'vcpu_id' of 'vm': set up its
 * spin lock, mark it as not running on any host cpu, obtain a vlapic from
 * the processor backend, request the X2APIC_ENABLED state, and allocate the
 * guest FPU save area and the statistics buffer.
 *
 * Called from vm_create() after 'vm->cookie' has been set, since the cookie
 * is passed to VLAPIC_INIT().
 */
static void
vcpu_init(struct vm *vm, uint32_t vcpu_id)
{
	struct vcpu *vcpu;

	vcpu = &vm->vcpu[vcpu_id];

	vcpu_lock_init(vcpu);
	vcpu->hostcpu = NOCPU;		/* not running anywhere yet */
	vcpu->vcpuid = vcpu_id;
	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
	vm_set_x2apic_state(vm, vcpu_id, X2APIC_ENABLED);
	vcpu->guestfpu = fpu_save_area_alloc();
	fpu_save_area_reset(vcpu->guestfpu);
	vcpu->stats = vmm_stat_alloc();
}
213
214struct vm_exit *
215vm_exitinfo(struct vm *vm, int cpuid)
216{
217	struct vcpu *vcpu;
218
219	if (cpuid < 0 || cpuid >= VM_MAXCPU)
220		panic("vm_exitinfo: invalid cpuid %d", cpuid);
221
222	vcpu = &vm->vcpu[cpuid];
223
224	return (&vcpu->exitinfo);
225}
226
/*
 * Resume hook installed into 'vmm_resume_p' by vmm_init(); forwards to the
 * processor backend's resume method (a no-op when no backend is selected).
 */
static void
vmm_resume(void)
{
	VMM_RESUME();
}
232
/*
 * One-time initialization performed at module load.
 *
 * Returns the error from vmm_mem_init() if that fails, ENXIO if the host is
 * neither an Intel nor an AMD processor according to vmm_is_intel() /
 * vmm_is_amd(), and otherwise the result of the backend's init method.
 */
static int
vmm_init(void)
{
	int error;

	vmm_host_state_init();

	/* Fall back to IPI_AST when vmm_ipi_alloc() hands back 0. */
	vmm_ipinum = vmm_ipi_alloc();
	if (vmm_ipinum == 0)
		vmm_ipinum = IPI_AST;

	error = vmm_mem_init();
	if (error)
		return (error);

	/* Select the processor-specific backend. */
	if (vmm_is_intel())
		ops = &vmm_ops_intel;
	else if (vmm_is_amd())
		ops = &vmm_ops_amd;
	else
		return (ENXIO);

	vmm_msr_init();
	vmm_resume_p = vmm_resume;

	return (VMM_INIT(vmm_ipinum));
}
260
/*
 * Module event handler registered through 'vmm_kmod'.
 *
 * MOD_LOAD:   initializes the device interface, the iommu and the vmm core;
 *             'vmm_initialized' is set only if vmm_init() succeeds.
 * MOD_UNLOAD: proceeds with teardown only if vmmdev_cleanup() succeeds.
 * Any other event is accepted and ignored.
 *
 * Returns 0 on success or the error from the failing step.
 */
static int
vmm_handler(module_t mod, int what, void *arg)
{
	int error;

	switch (what) {
	case MOD_LOAD:
		vmmdev_init();
		iommu_init();
		error = vmm_init();
		if (error == 0)
			vmm_initialized = 1;
		break;
	case MOD_UNLOAD:
		error = vmmdev_cleanup();
		if (error == 0) {
			vmm_resume_p = NULL;
			iommu_cleanup();
			/* IPI_AST is the fallback and was never allocated. */
			if (vmm_ipinum != IPI_AST)
				vmm_ipi_free(vmm_ipinum);
			error = VMM_CLEANUP();
			/*
			 * Something bad happened - prevent new
			 * VMs from being created
			 */
			if (error)
				vmm_initialized = 0;
		}
		break;
	default:
		error = 0;
		break;
	}
	return (error);
}
296
/* Module description: name, event handler and (unused) handler argument. */
static moduledata_t vmm_kmod = {
	"vmm",
	vmm_handler,
	NULL
};

/*
 * vmm initialization has the following dependencies:
 *
 * - iommu initialization must happen after the pci passthru driver has had
 *   a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
 *
 * - VT-x initialization requires smp_rendezvous() and therefore must happen
 *   after SMP is fully functional (after SI_SUB_SMP).
 */
DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
MODULE_VERSION(vmm, 1);
314
/*
 * Create a new virtual machine called 'name' and return it in '*retvm'.
 *
 * Returns ENXIO if the module was not initialized successfully, EINVAL if
 * 'name' is NULL or does not fit in VM_MAX_NAMELEN (including the NUL),
 * ENOMEM if the guest address space could not be allocated, and 0 on
 * success. '*retvm' is written only on success.
 */
int
vm_create(const char *name, struct vm **retvm)
{
	int i;
	struct vm *vm;
	struct vmspace *vmspace;

	const int BSP = 0;

	/*
	 * If vmm.ko could not be successfully initialized then don't attempt
	 * to create the virtual machine.
	 */
	if (!vmm_initialized)
		return (ENXIO);

	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
		return (EINVAL);

	vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
	if (vmspace == NULL)
		return (ENOMEM);

	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
	/* The length check above guarantees that 'name' fits in vm->name. */
	strcpy(vm->name, name);
	vm->vmspace = vmspace;
	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
	/* The cookie must be set before vcpu_init(), which passes it on. */
	vm->cookie = VMINIT(vm, vmspace_pmap(vmspace));
	vm->vioapic = vioapic_init(vm);
	vm->vhpet = vhpet_init(vm);

	for (i = 0; i < VM_MAXCPU; i++) {
		vcpu_init(vm, i);
		guest_msrs_init(vm, i);
	}

	/* The BSP (vcpu 0) is activated implicitly. */
	vm_activate_cpu(vm, BSP);

	*retvm = vm;
	return (0);
}
356
357static void
358vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
359{
360
361	if (seg->object != NULL)
362		vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
363
364	bzero(seg, sizeof(*seg));
365}
366
367void
368vm_destroy(struct vm *vm)
369{
370	int i;
371
372	ppt_unassign_all(vm);
373
374	if (vm->iommu != NULL)
375		iommu_destroy_domain(vm->iommu);
376
377	vhpet_cleanup(vm->vhpet);
378	vioapic_cleanup(vm->vioapic);
379
380	for (i = 0; i < vm->num_mem_segs; i++)
381		vm_free_mem_seg(vm, &vm->mem_segs[i]);
382
383	vm->num_mem_segs = 0;
384
385	for (i = 0; i < VM_MAXCPU; i++)
386		vcpu_cleanup(vm, i);
387
388	VMSPACE_FREE(vm->vmspace);
389
390	VMCLEANUP(vm->cookie);
391
392	free(vm, M_VM);
393}
394
/*
 * Return the name given to 'vm' at creation time. The returned pointer
 * refers to storage inside 'vm' itself.
 */
const char *
vm_name(struct vm *vm)
{
	return (vm->name);
}
400
401int
402vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
403{
404	vm_object_t obj;
405
406	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
407		return (ENOMEM);
408	else
409		return (0);
410}
411
/*
 * Remove the mmio mapping of 'len' bytes at guest physical address 'gpa'.
 * Always returns 0.
 */
int
vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
{

	vmm_mmio_free(vm->vmspace, gpa, len);
	return (0);
}
419
420boolean_t
421vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
422{
423	int i;
424	vm_paddr_t gpabase, gpalimit;
425
426	for (i = 0; i < vm->num_mem_segs; i++) {
427		gpabase = vm->mem_segs[i].gpa;
428		gpalimit = gpabase + vm->mem_segs[i].len;
429		if (gpa >= gpabase && gpa < gpalimit)
430			return (TRUE);		/* 'gpa' is regular memory */
431	}
432
433	if (ppt_is_mmio(vm, gpa))
434		return (TRUE);			/* 'gpa' is pci passthru mmio */
435
436	return (FALSE);
437}
438
439int
440vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
441{
442	int available, allocated;
443	struct mem_seg *seg;
444	vm_object_t object;
445	vm_paddr_t g;
446
447	if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
448		return (EINVAL);
449
450	available = allocated = 0;
451	g = gpa;
452	while (g < gpa + len) {
453		if (vm_mem_allocated(vm, g))
454			allocated++;
455		else
456			available++;
457
458		g += PAGE_SIZE;
459	}
460
461	/*
462	 * If there are some allocated and some available pages in the address
463	 * range then it is an error.
464	 */
465	if (allocated && available)
466		return (EINVAL);
467
468	/*
469	 * If the entire address range being requested has already been
470	 * allocated then there isn't anything more to do.
471	 */
472	if (allocated && available == 0)
473		return (0);
474
475	if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
476		return (E2BIG);
477
478	seg = &vm->mem_segs[vm->num_mem_segs];
479
480	if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
481		return (ENOMEM);
482
483	seg->gpa = gpa;
484	seg->len = len;
485	seg->object = object;
486	seg->wired = FALSE;
487
488	vm->num_mem_segs++;
489
490	return (0);
491}
492
493static void
494vm_gpa_unwire(struct vm *vm)
495{
496	int i, rv;
497	struct mem_seg *seg;
498
499	for (i = 0; i < vm->num_mem_segs; i++) {
500		seg = &vm->mem_segs[i];
501		if (!seg->wired)
502			continue;
503
504		rv = vm_map_unwire(&vm->vmspace->vm_map,
505				   seg->gpa, seg->gpa + seg->len,
506				   VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
507		KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
508		    "%#lx/%ld could not be unwired: %d",
509		    vm_name(vm), seg->gpa, seg->len, rv));
510
511		seg->wired = FALSE;
512	}
513}
514
515static int
516vm_gpa_wire(struct vm *vm)
517{
518	int i, rv;
519	struct mem_seg *seg;
520
521	for (i = 0; i < vm->num_mem_segs; i++) {
522		seg = &vm->mem_segs[i];
523		if (seg->wired)
524			continue;
525
526		/* XXX rlimits? */
527		rv = vm_map_wire(&vm->vmspace->vm_map,
528				 seg->gpa, seg->gpa + seg->len,
529				 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
530		if (rv != KERN_SUCCESS)
531			break;
532
533		seg->wired = TRUE;
534	}
535
536	if (i < vm->num_mem_segs) {
537		/*
538		 * Undo the wiring before returning an error.
539		 */
540		vm_gpa_unwire(vm);
541		return (EAGAIN);
542	}
543
544	return (0);
545}
546
/*
 * Move every page of guest memory between the host iommu domain and the
 * VM's iommu domain, one PAGE_SIZE mapping at a time.
 *
 * map == TRUE:  create gpa -> hpa in 'vm->iommu' and remove hpa from the
 *               host domain.
 * map == FALSE: remove gpa from 'vm->iommu' and re-create the identity
 *               mapping hpa -> hpa in the host domain.
 *
 * All memory segments are asserted to be wired.
 */
static void
vm_iommu_modify(struct vm *vm, boolean_t map)
{
	int i, sz;
	vm_paddr_t gpa, hpa;
	struct mem_seg *seg;
	void *vp, *cookie, *host_domain;

	sz = PAGE_SIZE;
	host_domain = iommu_host_domain();

	for (i = 0; i < vm->num_mem_segs; i++) {
		seg = &vm->mem_segs[i];
		KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
		    vm_name(vm), seg->gpa, seg->len));

		gpa = seg->gpa;
		while (gpa < seg->gpa + seg->len) {
			/*
			 * The page is held only long enough to learn its
			 * direct-map address; 'vp' is never dereferenced.
			 */
			vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
					 &cookie);
			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
			    vm_name(vm), gpa));

			vm_gpa_release(cookie);

			hpa = DMAP_TO_PHYS((uintptr_t)vp);
			if (map) {
				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
				iommu_remove_mapping(host_domain, hpa, sz);
			} else {
				iommu_remove_mapping(vm->iommu, gpa, sz);
				iommu_create_mapping(host_domain, hpa, hpa, sz);
			}

			gpa += PAGE_SIZE;
		}
	}

	/*
	 * Invalidate the cached translations associated with the domain
	 * from which pages were removed.
	 */
	if (map)
		iommu_invalidate_tlb(host_domain);
	else
		iommu_invalidate_tlb(vm->iommu);
}

/* Convenience wrappers selecting the direction of vm_iommu_modify(). */
#define	vm_iommu_unmap(vm)	vm_iommu_modify((vm), FALSE)
#define	vm_iommu_map(vm)	vm_iommu_modify((vm), TRUE)
597
/*
 * Detach the pci passthru device bus/slot/func from 'vm'. Once no passthru
 * devices remain, the guest memory is removed from the VM's iommu domain
 * and unwired.
 *
 * Returns 0 on success or the error from ppt_unassign_device().
 */
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int error;

	error = ppt_unassign_device(vm, bus, slot, func);
	if (error == 0 && ppt_num_devices(vm) == 0) {
		/* That was the last device: undo the passthru setup. */
		vm_iommu_unmap(vm);
		vm_gpa_unwire(vm);
	}

	return (error);
}
613
614int
615vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
616{
617	int error;
618	vm_paddr_t maxaddr;
619
620	/*
621	 * Virtual machines with pci passthru devices get special treatment:
622	 * - the guest physical memory is wired
623	 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
624	 *
625	 * We need to do this before the first pci passthru device is attached.
626	 */
627	if (ppt_num_devices(vm) == 0) {
628		KASSERT(vm->iommu == NULL,
629		    ("vm_assign_pptdev: iommu must be NULL"));
630		maxaddr = vmm_mem_maxaddr();
631		vm->iommu = iommu_create_domain(maxaddr);
632
633		error = vm_gpa_wire(vm);
634		if (error)
635			return (error);
636
637		vm_iommu_map(vm);
638	}
639
640	error = ppt_assign_device(vm, bus, slot, func);
641	return (error);
642}
643
644void *
645vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
646	    void **cookie)
647{
648	int count, pageoff;
649	vm_page_t m;
650
651	pageoff = gpa & PAGE_MASK;
652	if (len > PAGE_SIZE - pageoff)
653		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
654
655	count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
656	    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
657
658	if (count == 1) {
659		*cookie = m;
660		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
661	} else {
662		*cookie = NULL;
663		return (NULL);
664	}
665}
666
667void
668vm_gpa_release(void *cookie)
669{
670	vm_page_t m = cookie;
671
672	vm_page_lock(m);
673	vm_page_unhold(m);
674	vm_page_unlock(m);
675}
676
677int
678vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
679		  struct vm_memory_segment *seg)
680{
681	int i;
682
683	for (i = 0; i < vm->num_mem_segs; i++) {
684		if (gpabase == vm->mem_segs[i].gpa) {
685			seg->gpa = vm->mem_segs[i].gpa;
686			seg->len = vm->mem_segs[i].len;
687			seg->wired = vm->mem_segs[i].wired;
688			return (0);
689		}
690	}
691	return (-1);
692}
693
694int
695vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
696	      vm_offset_t *offset, struct vm_object **object)
697{
698	int i;
699	size_t seg_len;
700	vm_paddr_t seg_gpa;
701	vm_object_t seg_obj;
702
703	for (i = 0; i < vm->num_mem_segs; i++) {
704		if ((seg_obj = vm->mem_segs[i].object) == NULL)
705			continue;
706
707		seg_gpa = vm->mem_segs[i].gpa;
708		seg_len = vm->mem_segs[i].len;
709
710		if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
711			*offset = gpa - seg_gpa;
712			*object = seg_obj;
713			vm_object_reference(seg_obj);
714			return (0);
715		}
716	}
717
718	return (EINVAL);
719}
720
721int
722vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
723{
724
725	if (vcpu < 0 || vcpu >= VM_MAXCPU)
726		return (EINVAL);
727
728	if (reg >= VM_REG_LAST)
729		return (EINVAL);
730
731	return (VMGETREG(vm->cookie, vcpu, reg, retval));
732}
733
734int
735vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
736{
737
738	if (vcpu < 0 || vcpu >= VM_MAXCPU)
739		return (EINVAL);
740
741	if (reg >= VM_REG_LAST)
742		return (EINVAL);
743
744	return (VMSETREG(vm->cookie, vcpu, reg, val));
745}
746
747static boolean_t
748is_descriptor_table(int reg)
749{
750
751	switch (reg) {
752	case VM_REG_GUEST_IDTR:
753	case VM_REG_GUEST_GDTR:
754		return (TRUE);
755	default:
756		return (FALSE);
757	}
758}
759
760static boolean_t
761is_segment_register(int reg)
762{
763
764	switch (reg) {
765	case VM_REG_GUEST_ES:
766	case VM_REG_GUEST_CS:
767	case VM_REG_GUEST_SS:
768	case VM_REG_GUEST_DS:
769	case VM_REG_GUEST_FS:
770	case VM_REG_GUEST_GS:
771	case VM_REG_GUEST_TR:
772	case VM_REG_GUEST_LDTR:
773		return (TRUE);
774	default:
775		return (FALSE);
776	}
777}
778
779int
780vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
781		struct seg_desc *desc)
782{
783
784	if (vcpu < 0 || vcpu >= VM_MAXCPU)
785		return (EINVAL);
786
787	if (!is_segment_register(reg) && !is_descriptor_table(reg))
788		return (EINVAL);
789
790	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
791}
792
793int
794vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
795		struct seg_desc *desc)
796{
797	if (vcpu < 0 || vcpu >= VM_MAXCPU)
798		return (EINVAL);
799
800	if (!is_segment_register(reg) && !is_descriptor_table(reg))
801		return (EINVAL);
802
803	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
804}
805
/*
 * Load the guest's FPU state from 'vcpu->guestfpu' into the FPU.
 * On return CR0.TS is set, so any host use of the FPU will trap.
 */
static void
restore_guest_fpustate(struct vcpu *vcpu)
{

	/* flush host state to the pcb */
	fpuexit(curthread);

	/* restore guest FPU state */
	fpu_stop_emulating();
	fpurestore(vcpu->guestfpu);

	/*
	 * The FPU is now "dirty" with the guest's state so turn on emulation
	 * to trap any access to the FPU by the host.
	 */
	fpu_start_emulating();
}
823
/*
 * Save the FPU contents into 'vcpu->guestfpu'.
 *
 * Expects CR0.TS to be set on entry (as left by restore_guest_fpustate())
 * and panics if it is not. CR0.TS is set again before returning.
 */
static void
save_guest_fpustate(struct vcpu *vcpu)
{

	if ((rcr0() & CR0_TS) == 0)
		panic("fpu emulation not enabled in host!");

	/* save guest FPU state */
	fpu_stop_emulating();
	fpusave(vcpu->guestfpu);
	fpu_start_emulating();
}
836
/* Accumulated by vm_handle_hlt() while a vcpu sleeps in a guest 'hlt'. */
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");

/*
 * Move 'vcpu' to 'newstate'. The vcpu spin lock must be held.
 *
 * If 'from_idle' is true the caller sleeps (dropping the lock while asleep)
 * until the vcpu is in VCPU_IDLE; otherwise the vcpu is asserted not to be
 * idle.
 *
 * Returns 0 on success and EBUSY if the requested transition is not one of
 * the allowed ones listed below. 'hostcpu' is updated to curcpu when
 * entering VCPU_RUNNING and to NOCPU for every other state, and sleepers
 * on the state are woken when the vcpu becomes idle.
 */
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
    bool from_idle)
{
	int error;

	vcpu_assert_locked(vcpu);

	/*
	 * State transitions from the vmmdev_ioctl() must always begin from
	 * the VCPU_IDLE state. This guarantees that there is only a single
	 * ioctl() operating on a vcpu at any point.
	 */
	if (from_idle) {
		while (vcpu->state != VCPU_IDLE)
			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
	} else {
		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
		    "vcpu idle state"));
	}

	/* Sanity check 'hostcpu' against the current state. */
	if (vcpu->state == VCPU_RUNNING) {
		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
	} else {
		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
		    "vcpu that is not running", vcpu->hostcpu));
	}

	/*
	 * The following state transitions are allowed:
	 * IDLE -> FROZEN -> IDLE
	 * FROZEN -> RUNNING -> FROZEN
	 * FROZEN -> SLEEPING -> FROZEN
	 */
	switch (vcpu->state) {
	case VCPU_IDLE:
	case VCPU_RUNNING:
	case VCPU_SLEEPING:
		/* These states may only move to FROZEN. */
		error = (newstate != VCPU_FROZEN);
		break;
	case VCPU_FROZEN:
		/* FROZEN may move to any other state. */
		error = (newstate == VCPU_FROZEN);
		break;
	default:
		error = 1;
		break;
	}

	if (error)
		return (EBUSY);

	vcpu->state = newstate;
	if (newstate == VCPU_RUNNING)
		vcpu->hostcpu = curcpu;
	else
		vcpu->hostcpu = NOCPU;

	/* Wake threads sleeping in the 'from_idle' loop above. */
	if (newstate == VCPU_IDLE)
		wakeup(&vcpu->state);

	return (0);
}
902
903static void
904vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
905{
906	int error;
907
908	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
909		panic("Error %d setting state to %d\n", error, newstate);
910}
911
912static void
913vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
914{
915	int error;
916
917	if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
918		panic("Error %d setting state to %d", error, newstate);
919}
920
/*
 * Install 'func' as the rendezvous function of 'vm' (NULL ends the
 * rendezvous). The caller must hold 'rendezvous_mtx'.
 */
static void
vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
{

	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));

	/*
	 * Update 'rendezvous_func' and execute a write memory barrier to
	 * ensure that it is visible across all host cpus. This is not needed
	 * for correctness but it does ensure that all the vcpus will notice
	 * that the rendezvous is requested immediately.
	 */
	vm->rendezvous_func = func;
	wmb();
}
936
937#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
938	do {								\
939		if (vcpuid >= 0)					\
940			VCPU_CTR0(vm, vcpuid, fmt);			\
941		else							\
942			VM_CTR0(vm, fmt);				\
943	} while (0)
944
945static void
946vm_handle_rendezvous(struct vm *vm, int vcpuid)
947{
948
949	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
950	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
951
952	mtx_lock(&vm->rendezvous_mtx);
953	while (vm->rendezvous_func != NULL) {
954		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
955		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
956
957		if (vcpuid != -1 &&
958		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
959		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
960			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
961			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
962			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
963		}
964		if (CPU_CMP(&vm->rendezvous_req_cpus,
965		    &vm->rendezvous_done_cpus) == 0) {
966			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
967			vm_set_rendezvous_func(vm, NULL);
968			wakeup(&vm->rendezvous_func);
969			break;
970		}
971		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
972		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
973		    "vmrndv", 0);
974	}
975	mtx_unlock(&vm->rendezvous_mtx);
976}
977
/*
 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 *
 * 'intr_disabled' tells whether the guest executed 'hlt' with interrupts
 * disabled; in that case only a pending NMI prevents the sleep.
 *
 * If the vcpu's local apic is disabled the vcpu is not put to sleep but
 * spun down: '*retu' is set to true, the exit code is changed to
 * VM_EXITCODE_SPINDOWN_CPU and the vcpu is deactivated. '*retu' is left
 * untouched otherwise. Always returns 0.
 */
static int
vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
{
	struct vm_exit *vmexit;
	struct vcpu *vcpu;
	int t, timo, spindown;

	vcpu = &vm->vcpu[vcpuid];
	spindown = 0;

	vcpu_lock(vcpu);

	/*
	 * Do a final check for pending NMI or interrupts before
	 * really putting this thread to sleep.
	 *
	 * These interrupts could have happened any time after we
	 * returned from VMRUN() and before we grabbed the vcpu lock.
	 */
	if (!vm_nmi_pending(vm, vcpuid) &&
	    (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
		t = ticks;	/* start of the idle period */
		vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
		if (vlapic_enabled(vcpu->vlapic)) {
			/*
			 * XXX msleep_spin() is not interruptible so use the
			 * 'timo' to put an upper bound on the sleep time.
			 */
			timo = hz;
			msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
		} else {
			/*
			 * Spindown the vcpu if the apic is disabled and it
			 * had entered the halted state.
			 */
			spindown = 1;
		}
		vcpu_require_state_locked(vcpu, VCPU_FROZEN);
		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
	}
	vcpu_unlock(vcpu);

	/*
	 * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
	 * outside the confines of the vcpu spinlock.
	 */
	if (spindown) {
		*retu = true;
		vmexit = vm_exitinfo(vm, vcpuid);
		vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
		vm_deactivate_cpu(vm, vcpuid);
		VCPU_CTR0(vm, vcpuid, "spinning down cpu");
	}

	return (0);
}
1037
/*
 * Handle a VM_EXITCODE_PAGING exit for vcpu 'vcpuid' using the fault type
 * and guest physical address recorded in the vcpu's exit information.
 *
 * Read and write faults are first offered to
 * pmap_emulate_accessed_dirty(); if that does not resolve the fault (or
 * for execute faults) the fault is passed to vm_fault() on the guest's
 * address space.
 *
 * Returns 0 if the fault was resolved, in which case the instruction length
 * is zeroed so that the guest re-executes the faulting instruction, and
 * EFAULT if vm_fault() fails. '*retu' is not modified.
 */
static int
vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
{
	int rv, ftype;
	struct vm_map *map;
	struct vcpu *vcpu;
	struct vm_exit *vme;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	ftype = vme->u.paging.fault_type;
	KASSERT(ftype == VM_PROT_READ ||
	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
	    ("vm_handle_paging: invalid fault_type %d", ftype));

	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
		    vme->u.paging.gpa, ftype);
		if (rv == 0)
			goto done;
	}

	map = &vm->vmspace->vm_map;
	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);

	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
	    "ftype = %d", rv, vme->u.paging.gpa, ftype);

	if (rv != KERN_SUCCESS)
		return (EFAULT);
done:
	/* restart execution at the faulting instruction */
	vme->inst_length = 0;

	return (0);
}
1075
/*
 * Handle a VM_EXITCODE_INST_EMUL exit for vcpu 'vcpuid'.
 *
 * The faulting instruction is fetched and decoded using the rip, length,
 * cr3 and guest linear address recorded in the exit information. If the
 * guest physical address falls inside the local apic page, the ioapic
 * window or the HPET window the access is emulated in the kernel; for any
 * other address '*retu' is set to true so that the exit is handled in
 * userland.
 *
 * Returns EFAULT if the instruction cannot be fetched or decoded, 0 when
 * deferring to userland, and otherwise the result of
 * vmm_emulate_instruction().
 */
static int
vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
{
	struct vie *vie;
	struct vcpu *vcpu;
	struct vm_exit *vme;
	int error, inst_length;
	uint64_t rip, gla, gpa, cr3;
	mem_region_read_t mread;
	mem_region_write_t mwrite;

	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;

	rip = vme->rip;
	inst_length = vme->inst_length;

	gla = vme->u.inst_emul.gla;
	gpa = vme->u.inst_emul.gpa;
	cr3 = vme->u.inst_emul.cr3;
	vie = &vme->u.inst_emul.vie;

	vie_init(vie);

	/* Fetch, decode and emulate the faulting instruction */
	if (vmm_fetch_instruction(vm, vcpuid, rip, inst_length, cr3, vie) != 0)
		return (EFAULT);

	if (vmm_decode_instruction(vm, vcpuid, gla, vie) != 0)
		return (EFAULT);

	/* return to userland unless this is an in-kernel emulated device */
	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
		mread = lapic_mmio_read;
		mwrite = lapic_mmio_write;
	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
		mread = vioapic_mmio_read;
		mwrite = vioapic_mmio_write;
	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
		mread = vhpet_mmio_read;
		mwrite = vhpet_mmio_write;
	} else {
		*retu = true;
		return (0);
	}

	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, mread, mwrite,
	    retu);

	return (error);
}
1127
/*
 * Run vcpu 'vmrun->cpuid' of 'vm' starting at guest rip 'vmrun->rip'.
 *
 * The vcpu is entered repeatedly. Exits that can be handled in the kernel
 * (ioapic EOI, rendezvous, hlt, paging, instruction emulation of in-kernel
 * devices) are processed here and the guest is resumed at
 * 'rip + inst_length'. The loop ends when an error occurs or when an exit
 * has to be handled in userland; the exit information is then copied into
 * 'vmrun->vm_exit'.
 *
 * Returns EINVAL for an out-of-range vcpu id, otherwise the error from the
 * backend or from the exit handler (0 for an exit destined for userland).
 */
int
vm_run(struct vm *vm, struct vm_run *vmrun)
{
	int error, vcpuid;
	struct vcpu *vcpu;
	struct pcb *pcb;
	uint64_t tscval, rip;
	struct vm_exit *vme;
	bool retu, intr_disabled;
	pmap_t pmap;

	vcpuid = vmrun->cpuid;

	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
		return (EINVAL);

	pmap = vmspace_pmap(vm->vmspace);
	vcpu = &vm->vcpu[vcpuid];
	vme = &vcpu->exitinfo;
	rip = vmrun->rip;
restart:
	/* Guest MSR/FPU state is live between here and critical_exit(). */
	critical_enter();

	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
	    ("vm_run: absurd pm_active"));

	tscval = rdtsc();	/* start of the runtime accounting interval */

	pcb = PCPU_GET(curpcb);
	set_pcb_flags(pcb, PCB_FULL_IRET);

	restore_guest_msrs(vm, vcpuid);
	restore_guest_fpustate(vcpu);

	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
	error = VMRUN(vm->cookie, vcpuid, rip, pmap, &vm->rendezvous_func);
	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);

	save_guest_fpustate(vcpu);
	restore_host_msrs(vm, vcpuid);

	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);

	critical_exit();

	if (error == 0) {
		/* 'retu' stays false if the exit is fully handled here. */
		retu = false;
		switch (vme->exitcode) {
		case VM_EXITCODE_IOAPIC_EOI:
			vioapic_process_eoi(vm, vcpuid,
			    vme->u.ioapic_eoi.vector);
			break;
		case VM_EXITCODE_RENDEZVOUS:
			vm_handle_rendezvous(vm, vcpuid);
			error = 0;
			break;
		case VM_EXITCODE_HLT:
			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
			break;
		case VM_EXITCODE_PAGING:
			error = vm_handle_paging(vm, vcpuid, &retu);
			break;
		case VM_EXITCODE_INST_EMUL:
			error = vm_handle_inst_emul(vm, vcpuid, &retu);
			break;
		default:
			retu = true;	/* handled in userland */
			break;
		}
	}

	/*
	 * 'retu' is only meaningful when 'error' is 0; the short-circuit
	 * evaluation below never reads it otherwise.
	 */
	if (error == 0 && retu == false) {
		rip = vme->rip + vme->inst_length;
		goto restart;
	}

	/* copy the exit information */
	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
	return (error);
}
1209
1210int
1211vm_inject_event(struct vm *vm, int vcpuid, int type,
1212		int vector, uint32_t code, int code_valid)
1213{
1214	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1215		return (EINVAL);
1216
1217	if ((type > VM_EVENT_NONE && type < VM_EVENT_MAX) == 0)
1218		return (EINVAL);
1219
1220	if (vector < 0 || vector > 255)
1221		return (EINVAL);
1222
1223	return (VMINJECT(vm->cookie, vcpuid, type, vector, code, code_valid));
1224}
1225
1226static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1227
1228int
1229vm_inject_nmi(struct vm *vm, int vcpuid)
1230{
1231	struct vcpu *vcpu;
1232
1233	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1234		return (EINVAL);
1235
1236	vcpu = &vm->vcpu[vcpuid];
1237
1238	vcpu->nmi_pending = 1;
1239	vcpu_notify_event(vm, vcpuid, false);
1240	return (0);
1241}
1242
1243int
1244vm_nmi_pending(struct vm *vm, int vcpuid)
1245{
1246	struct vcpu *vcpu;
1247
1248	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1249		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1250
1251	vcpu = &vm->vcpu[vcpuid];
1252
1253	return (vcpu->nmi_pending);
1254}
1255
1256void
1257vm_nmi_clear(struct vm *vm, int vcpuid)
1258{
1259	struct vcpu *vcpu;
1260
1261	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1262		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1263
1264	vcpu = &vm->vcpu[vcpuid];
1265
1266	if (vcpu->nmi_pending == 0)
1267		panic("vm_nmi_clear: inconsistent nmi_pending state");
1268
1269	vcpu->nmi_pending = 0;
1270	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1271}
1272
1273int
1274vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1275{
1276	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1277		return (EINVAL);
1278
1279	if (type < 0 || type >= VM_CAP_MAX)
1280		return (EINVAL);
1281
1282	return (VMGETCAP(vm->cookie, vcpu, type, retval));
1283}
1284
1285int
1286vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1287{
1288	if (vcpu < 0 || vcpu >= VM_MAXCPU)
1289		return (EINVAL);
1290
1291	if (type < 0 || type >= VM_CAP_MAX)
1292		return (EINVAL);
1293
1294	return (VMSETCAP(vm->cookie, vcpu, type, val));
1295}
1296
1297uint64_t *
1298vm_guest_msrs(struct vm *vm, int cpu)
1299{
1300	return (vm->vcpu[cpu].guest_msrs);
1301}
1302
1303struct vlapic *
1304vm_lapic(struct vm *vm, int cpu)
1305{
1306	return (vm->vcpu[cpu].vlapic);
1307}
1308
1309struct vioapic *
1310vm_ioapic(struct vm *vm)
1311{
1312
1313	return (vm->vioapic);
1314}
1315
1316struct vhpet *
1317vm_hpet(struct vm *vm)
1318{
1319
1320	return (vm->vhpet);
1321}
1322
1323boolean_t
1324vmm_is_pptdev(int bus, int slot, int func)
1325{
1326	int found, i, n;
1327	int b, s, f;
1328	char *val, *cp, *cp2;
1329
1330	/*
1331	 * XXX
1332	 * The length of an environment variable is limited to 128 bytes which
1333	 * puts an upper limit on the number of passthru devices that may be
1334	 * specified using a single environment variable.
1335	 *
1336	 * Work around this by scanning multiple environment variable
1337	 * names instead of a single one - yuck!
1338	 */
1339	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1340
1341	/* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1342	found = 0;
1343	for (i = 0; names[i] != NULL && !found; i++) {
1344		cp = val = getenv(names[i]);
1345		while (cp != NULL && *cp != '\0') {
1346			if ((cp2 = strchr(cp, ' ')) != NULL)
1347				*cp2 = '\0';
1348
1349			n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1350			if (n == 3 && bus == b && slot == s && func == f) {
1351				found = 1;
1352				break;
1353			}
1354
1355			if (cp2 != NULL)
1356				*cp2++ = ' ';
1357
1358			cp = cp2;
1359		}
1360		freeenv(val);
1361	}
1362	return (found);
1363}
1364
1365void *
1366vm_iommu_domain(struct vm *vm)
1367{
1368
1369	return (vm->iommu);
1370}
1371
1372int
1373vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1374    bool from_idle)
1375{
1376	int error;
1377	struct vcpu *vcpu;
1378
1379	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1380		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1381
1382	vcpu = &vm->vcpu[vcpuid];
1383
1384	vcpu_lock(vcpu);
1385	error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1386	vcpu_unlock(vcpu);
1387
1388	return (error);
1389}
1390
1391enum vcpu_state
1392vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1393{
1394	struct vcpu *vcpu;
1395	enum vcpu_state state;
1396
1397	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1398		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1399
1400	vcpu = &vm->vcpu[vcpuid];
1401
1402	vcpu_lock(vcpu);
1403	state = vcpu->state;
1404	if (hostcpu != NULL)
1405		*hostcpu = vcpu->hostcpu;
1406	vcpu_unlock(vcpu);
1407
1408	return (state);
1409}
1410
1411void
1412vm_activate_cpu(struct vm *vm, int vcpuid)
1413{
1414
1415	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1416	    ("vm_activate_cpu: invalid vcpuid %d", vcpuid));
1417	KASSERT(!CPU_ISSET(vcpuid, &vm->active_cpus),
1418	    ("vm_activate_cpu: vcpuid %d is already active", vcpuid));
1419
1420	VCPU_CTR0(vm, vcpuid, "activated");
1421	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
1422}
1423
1424static void
1425vm_deactivate_cpu(struct vm *vm, int vcpuid)
1426{
1427
1428	KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU,
1429	    ("vm_deactivate_cpu: invalid vcpuid %d", vcpuid));
1430	KASSERT(CPU_ISSET(vcpuid, &vm->active_cpus),
1431	    ("vm_deactivate_cpu: vcpuid %d is not active", vcpuid));
1432
1433	VCPU_CTR0(vm, vcpuid, "deactivated");
1434	CPU_CLR_ATOMIC(vcpuid, &vm->active_cpus);
1435
1436	/*
1437	 * If a vcpu rendezvous is in progress then it could be blocked
1438	 * on 'vcpuid' - unblock it before disappearing forever.
1439	 */
1440	mtx_lock(&vm->rendezvous_mtx);
1441	if (vm->rendezvous_func != NULL) {
1442		VCPU_CTR0(vm, vcpuid, "unblock rendezvous after deactivation");
1443		wakeup(&vm->rendezvous_func);
1444	}
1445	mtx_unlock(&vm->rendezvous_mtx);
1446}
1447
1448cpuset_t
1449vm_active_cpus(struct vm *vm)
1450{
1451
1452	return (vm->active_cpus);
1453}
1454
1455void *
1456vcpu_stats(struct vm *vm, int vcpuid)
1457{
1458
1459	return (vm->vcpu[vcpuid].stats);
1460}
1461
1462int
1463vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
1464{
1465	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1466		return (EINVAL);
1467
1468	*state = vm->vcpu[vcpuid].x2apic_state;
1469
1470	return (0);
1471}
1472
1473int
1474vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
1475{
1476	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1477		return (EINVAL);
1478
1479	if (state >= X2APIC_STATE_LAST)
1480		return (EINVAL);
1481
1482	vm->vcpu[vcpuid].x2apic_state = state;
1483
1484	vlapic_set_x2apic_state(vm, vcpuid, state);
1485
1486	return (0);
1487}
1488
1489/*
1490 * This function is called to ensure that a vcpu "sees" a pending event
1491 * as soon as possible:
1492 * - If the vcpu thread is sleeping then it is woken up.
1493 * - If the vcpu is running on a different host_cpu then an IPI will be directed
1494 *   to the host_cpu to cause the vcpu to trap into the hypervisor.
1495 */
1496void
1497vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
1498{
1499	int hostcpu;
1500	struct vcpu *vcpu;
1501
1502	vcpu = &vm->vcpu[vcpuid];
1503
1504	vcpu_lock(vcpu);
1505	hostcpu = vcpu->hostcpu;
1506	if (vcpu->state == VCPU_RUNNING) {
1507		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
1508		if (hostcpu != curcpu) {
1509			if (lapic_intr) {
1510				vlapic_post_intr(vcpu->vlapic, hostcpu,
1511				    vmm_ipinum);
1512			} else {
1513				ipi_cpu(hostcpu, vmm_ipinum);
1514			}
1515		} else {
1516			/*
1517			 * If the 'vcpu' is running on 'curcpu' then it must
1518			 * be sending a notification to itself (e.g. SELF_IPI).
1519			 * The pending event will be picked up when the vcpu
1520			 * transitions back to guest context.
1521			 */
1522		}
1523	} else {
1524		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
1525		    "with hostcpu %d", vcpu->state, hostcpu));
1526		if (vcpu->state == VCPU_SLEEPING)
1527			wakeup_one(vcpu);
1528	}
1529	vcpu_unlock(vcpu);
1530}
1531
1532struct vmspace *
1533vm_get_vmspace(struct vm *vm)
1534{
1535
1536	return (vm->vmspace);
1537}
1538
/*
 * Translate an APIC id into a vcpu id.  'vm' is not consulted.
 */
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	int vcpuid;

	/*
	 * XXX the vcpu id is assumed to be numerically identical to the
	 * apic id, so the mapping is the identity.
	 */
	vcpuid = apicid;

	return (vcpuid);
}
1547
1548void
1549vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
1550    vm_rendezvous_func_t func, void *arg)
1551{
1552	int i;
1553
1554	/*
1555	 * Enforce that this function is called without any locks
1556	 */
1557	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
1558	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1559	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
1560
1561restart:
1562	mtx_lock(&vm->rendezvous_mtx);
1563	if (vm->rendezvous_func != NULL) {
1564		/*
1565		 * If a rendezvous is already in progress then we need to
1566		 * call the rendezvous handler in case this 'vcpuid' is one
1567		 * of the targets of the rendezvous.
1568		 */
1569		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
1570		mtx_unlock(&vm->rendezvous_mtx);
1571		vm_handle_rendezvous(vm, vcpuid);
1572		goto restart;
1573	}
1574	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
1575	    "rendezvous is still in progress"));
1576
1577	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
1578	vm->rendezvous_req_cpus = dest;
1579	CPU_ZERO(&vm->rendezvous_done_cpus);
1580	vm->rendezvous_arg = arg;
1581	vm_set_rendezvous_func(vm, func);
1582	mtx_unlock(&vm->rendezvous_mtx);
1583
1584	/*
1585	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
1586	 * vcpus so they handle the rendezvous as soon as possible.
1587	 */
1588	for (i = 0; i < VM_MAXCPU; i++) {
1589		if (CPU_ISSET(i, &dest))
1590			vcpu_notify_event(vm, i, false);
1591	}
1592
1593	vm_handle_rendezvous(vm, vcpuid);
1594}
1595