vmm.h revision 270074
/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/sys/amd64/include/vmm.h 270074 2014-08-17 01:23:52Z grehan $
 */

#ifndef _VMM_H_
#define	_VMM_H_

enum vm_suspend_how {
	VM_SUSPEND_NONE,
	VM_SUSPEND_RESET,
	VM_SUSPEND_POWEROFF,
	VM_SUSPEND_HALT,
	VM_SUSPEND_LAST
};

#ifdef _KERNEL

#define	VM_MAX_NAMELEN	32

struct vm;
struct vm_exception;
struct vm_memory_segment;
struct seg_desc;
struct vm_exit;
struct vm_run;
struct vhpet;
struct vioapic;
struct vlapic;
struct vmspace;
struct vm_object;
struct pmap;

enum vm_reg_name;
enum x2apic_state;

typedef int	(*vmm_init_func_t)(int ipinum);
typedef int	(*vmm_cleanup_func_t)(void);
typedef void	(*vmm_resume_func_t)(void);
typedef void *	(*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
typedef int	(*vmi_run_func_t)(void *vmi, int vcpu, register_t rip,
				  struct pmap *pmap, void *rendezvous_cookie,
				  void *suspend_cookie);
typedef void	(*vmi_cleanup_func_t)(void *vmi);
typedef int	(*vmi_get_register_t)(void *vmi, int vcpu, int num,
				      uint64_t *retval);
typedef int	(*vmi_set_register_t)(void *vmi, int vcpu, int num,
				      uint64_t val);
typedef int	(*vmi_get_desc_t)(void *vmi, int vcpu, int num,
				  struct seg_desc *desc);
typedef int	(*vmi_set_desc_t)(void *vmi, int vcpu, int num,
				  struct seg_desc *desc);
typedef int	(*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
typedef int	(*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void	(*vmi_vmspace_free)(struct vmspace *vmspace);
typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
typedef void	(*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);

struct vmm_ops {
	vmm_init_func_t		init;		/* module wide initialization */
	vmm_cleanup_func_t	cleanup;
	vmm_resume_func_t	resume;

	vmi_init_func_t		vminit;		/* vm-specific initialization */
	vmi_run_func_t		vmrun;
	vmi_cleanup_func_t	vmcleanup;
	vmi_get_register_t	vmgetreg;
	vmi_set_register_t	vmsetreg;
	vmi_get_desc_t		vmgetdesc;
	vmi_set_desc_t		vmsetdesc;
	vmi_get_cap_t		vmgetcap;
	vmi_set_cap_t		vmsetcap;
	vmi_vmspace_alloc	vmspace_alloc;
	vmi_vmspace_free	vmspace_free;
	vmi_vlapic_init		vlapic_init;
	vmi_vlapic_cleanup	vlapic_cleanup;
};

extern struct vmm_ops vmm_ops_intel;
extern struct vmm_ops vmm_ops_amd;
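/*
 * At module load time the vmm code selects one of these ops vectors based
 * on the host CPU and drives all backend work through it. A minimal sketch
 * (the 'ops' variable and the vendor test are illustrative only, not the
 * actual selection logic in vmm.c):
 *
 *	static struct vmm_ops *ops;
 *
 *	ops = host_is_intel ? &vmm_ops_intel : &vmm_ops_amd;
 *	error = (*ops->init)(ipinum);
 */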

int vm_create(const char *name, struct vm **retvm);
void vm_destroy(struct vm *vm);
int vm_reinit(struct vm *vm);
const char *vm_name(struct vm *vm);
int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
		  void **cookie);
void vm_gpa_release(void *cookie);
int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
	      struct vm_memory_segment *seg);
int vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
		  vm_offset_t *offset, struct vm_object **object);
boolean_t vm_mem_allocated(struct vm *vm, vm_paddr_t gpa);
int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
		    struct seg_desc *ret_desc);
int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
		    struct seg_desc *desc);
int vm_run(struct vm *vm, struct vm_run *vmrun);
int vm_suspend(struct vm *vm, enum vm_suspend_how how);
int vm_inject_nmi(struct vm *vm, int vcpu);
int vm_nmi_pending(struct vm *vm, int vcpuid);
void vm_nmi_clear(struct vm *vm, int vcpuid);
int vm_inject_extint(struct vm *vm, int vcpu);
int vm_extint_pending(struct vm *vm, int vcpuid);
void vm_extint_clear(struct vm *vm, int vcpuid);
uint64_t *vm_guest_msrs(struct vm *vm, int cpu);
struct vlapic *vm_lapic(struct vm *vm, int cpu);
struct vioapic *vm_ioapic(struct vm *vm);
struct vhpet *vm_hpet(struct vm *vm);
int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
int vm_apicid2vcpuid(struct vm *vm, int apicid);
int vm_activate_cpu(struct vm *vm, int vcpu);
cpuset_t vm_active_cpus(struct vm *vm);
cpuset_t vm_suspended_cpus(struct vm *vm);
struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);

/*
 * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
 * The rendezvous 'func(arg)' is not allowed to do anything that will
 * cause the thread to be put to sleep.
 *
 * If the rendezvous is being initiated from a vcpu context then the
 * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
 *
 * The caller cannot hold any locks when initiating the rendezvous.
 *
 * The implementation of this API may cause vcpus other than those specified
 * by 'dest' to be stalled. The caller should not rely on any vcpus making
 * forward progress when the rendezvous is in progress.
 */
typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
    vm_rendezvous_func_t func, void *arg);
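
/*
 * Usage sketch (the callback and its counter argument are hypothetical,
 * not part of this API). The callback runs on every destination vcpu and,
 * per the constraints above, must not sleep:
 *
 *	static void
 *	count_cb(struct vm *vm, int vcpuid, void *arg)
 *	{
 *		atomic_add_int((u_int *)arg, 1);
 *	}
 *
 *	u_int count = 0;
 *	vm_smp_rendezvous(vm, -1, vm_active_cpus(vm), count_cb, &count);
 */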

static __inline int
vcpu_rendezvous_pending(void *rendezvous_cookie)
{

	return (*(uintptr_t *)rendezvous_cookie != 0);
}

static __inline int
vcpu_suspended(void *suspend_cookie)
{

	return (*(int *)suspend_cookie);
}
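
/*
 * A vmrun implementation is expected to poll both cookies before each
 * guest entry; a simplified sketch of that loop (not the actual VT-x or
 * SVM code):
 *
 *	for (;;) {
 *		if (vcpu_suspended(suspend_cookie)) {
 *			vm_exit_suspended(vm, vcpuid, rip);
 *			break;
 *		}
 *		if (vcpu_rendezvous_pending(rendezvous_cookie)) {
 *			vm_exit_rendezvous(vm, vcpuid, rip);
 *			break;
 *		}
 *		... enter the guest ...
 *	}
 */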

/*
 * Return 1 if the device identified by bus/slot/func is configured as a
 * PCI passthrough device, 0 otherwise.
 */
int vmm_is_pptdev(int bus, int slot, int func);
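
/*
 * Illustrative use (a sketch; the surrounding driver logic is
 * hypothetical): a host driver probe can decline devices that are
 * reserved for passthrough so that the ppt(4) driver can claim them.
 *
 *	if (vmm_is_pptdev(bus, slot, func))
 *		return (ENXIO);
 */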

void *vm_iommu_domain(struct vm *vm);

enum vcpu_state {
	VCPU_IDLE,
	VCPU_FROZEN,
	VCPU_RUNNING,
	VCPU_SLEEPING,
};

int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
    bool from_idle);
enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);

static __inline int
vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
{
	return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
}

void *vcpu_stats(struct vm *vm, int vcpu);
void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
struct vmspace *vm_get_vmspace(struct vm *vm);
int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func);
int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func);
struct vatpic *vm_atpic(struct vm *vm);
struct vatpit *vm_atpit(struct vm *vm);

/*
 * Inject exception 'vme' into the guest vcpu. This function returns 0 on
 * success and non-zero on failure.
 *
 * Wrapper functions like 'vm_inject_gp()' should be preferred to calling
 * this function directly because they enforce the trap-like or fault-like
 * behavior of an exception.
 *
 * This function should only be called in the context of the thread that is
 * executing this vcpu.
 */
int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);

/*
 * Returns 0 if there is no exception pending for this vcpu. Returns 1 if an
 * exception is pending and also updates 'vme'. The pending exception is
 * cleared when this function returns.
 *
 * This function should only be called in the context of the thread that is
 * executing this vcpu.
 */
int vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *vme);
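
/*
 * Sketch of the expected call pattern just before guest entry (the
 * hardware-specific injection step is elided):
 *
 *	struct vm_exception vme;
 *
 *	if (vm_exception_pending(vm, vcpuid, &vme)) {
 *		... program 'vme' into the hardware event
 *		    injection fields ...
 *	}
 */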

void vm_inject_gp(struct vm *vm, int vcpuid); /* general protection fault */
void vm_inject_ud(struct vm *vm, int vcpuid); /* undefined instruction fault */
void vm_inject_pf(struct vm *vm, int vcpuid, int error_code, uint64_t cr2);

enum vm_reg_name vm_segment_name(int seg_encoding);

#endif	/* _KERNEL */

#define	VM_MAXCPU	16			/* maximum virtual cpus */

/*
 * Identifiers for architecturally defined registers.
 */
enum vm_reg_name {
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15,
	VM_REG_GUEST_CR0,
	VM_REG_GUEST_CR3,
	VM_REG_GUEST_CR4,
	VM_REG_GUEST_DR7,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RIP,
	VM_REG_GUEST_RFLAGS,
	VM_REG_GUEST_ES,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_SS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	VM_REG_GUEST_LDTR,
	VM_REG_GUEST_TR,
	VM_REG_GUEST_IDTR,
	VM_REG_GUEST_GDTR,
	VM_REG_GUEST_EFER,
	VM_REG_GUEST_CR2,
	VM_REG_LAST
};

/*
 * Identifiers for optional vmm capabilities
 */
enum vm_cap_type {
	VM_CAP_HALT_EXIT,
	VM_CAP_MTRAP_EXIT,
	VM_CAP_PAUSE_EXIT,
	VM_CAP_UNRESTRICTED_GUEST,
	VM_CAP_ENABLE_INVPCID,
	VM_CAP_MAX
};

enum x2apic_state {
	X2APIC_DISABLED,
	X2APIC_ENABLED,
	X2APIC_STATE_LAST
};

enum vm_intr_trigger {
	EDGE_TRIGGER,
	LEVEL_TRIGGER
};

/*
 * The 'access' field has the format specified in Table 21-2 of the Intel
 * Architecture Manual vol 3b.
 *
 * XXX The contents of the 'access' field are architecturally defined except
 * bit 16 - Segment Unusable.
 */
struct seg_desc {
	uint64_t	base;
	uint32_t	limit;
	uint32_t	access;
};
#define	SEG_DESC_TYPE(desc)		((desc)->access & 0x001f)
#define	SEG_DESC_PRESENT(desc)		((desc)->access & 0x0080)
#define	SEG_DESC_DEF32(desc)		((desc)->access & 0x4000)
#define	SEG_DESC_GRANULARITY(desc)	((desc)->access & 0x8000)
#define	SEG_DESC_UNUSABLE(desc)		((desc)->access & 0x10000)
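
/*
 * Example (kernel context, a sketch): prefer these accessors over
 * open-coded bit tests against 'access'.
 *
 *	struct seg_desc cs;
 *	int def32;
 *
 *	if (vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &cs) == 0 &&
 *	    !SEG_DESC_UNUSABLE(&cs))
 *		def32 = SEG_DESC_DEF32(&cs) ? 1 : 0;
 */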

enum vm_cpu_mode {
	CPU_MODE_COMPATIBILITY,		/* IA-32E mode (CS.L = 0) */
	CPU_MODE_64BIT,			/* IA-32E mode (CS.L = 1) */
};

enum vm_paging_mode {
	PAGING_MODE_FLAT,
	PAGING_MODE_32,
	PAGING_MODE_PAE,
	PAGING_MODE_64,
};

struct vm_guest_paging {
	uint64_t	cr3;
	int		cpl;
	enum vm_cpu_mode cpu_mode;
	enum vm_paging_mode paging_mode;
};

/*
 * The data structures 'vie' and 'vie_op' are meant to be opaque to
 * consumers of instruction decoding. Their contents are exposed here only
 * because they are embedded in the 'vm_exit' structure.
 */
struct vie_op {
	uint8_t		op_byte;	/* actual opcode byte */
	uint8_t		op_type;	/* type of operation (e.g. MOV) */
	uint16_t	op_flags;
};

#define	VIE_INST_SIZE	15
struct vie {
	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
	uint8_t		num_valid;		/* size of the instruction */
	uint8_t		num_processed;

	uint8_t		rex_w:1,		/* REX prefix */
			rex_r:1,
			rex_x:1,
			rex_b:1,
			rex_present:1;

	uint8_t		mod:2,			/* ModRM byte */
			reg:4,
			rm:4;

	uint8_t		ss:2,			/* SIB byte */
			index:4,
			base:4;

	uint8_t		disp_bytes;
	uint8_t		imm_bytes;

	uint8_t		scale;
	int		base_register;		/* VM_REG_GUEST_xyz */
	int		index_register;		/* VM_REG_GUEST_xyz */

	int64_t		displacement;		/* optional addr displacement */
	int64_t		immediate;		/* optional immediate operand */

	uint8_t		decoded;	/* set to 1 if successfully decoded */

	struct vie_op	op;			/* opcode description */
};

enum vm_exitcode {
	VM_EXITCODE_INOUT,
	VM_EXITCODE_VMX,
	VM_EXITCODE_BOGUS,
	VM_EXITCODE_RDMSR,
	VM_EXITCODE_WRMSR,
	VM_EXITCODE_HLT,
	VM_EXITCODE_MTRAP,
	VM_EXITCODE_PAUSE,
	VM_EXITCODE_PAGING,
	VM_EXITCODE_INST_EMUL,
	VM_EXITCODE_SPINUP_AP,
	VM_EXITCODE_DEPRECATED1,	/* used to be SPINDOWN_CPU */
	VM_EXITCODE_RENDEZVOUS,
	VM_EXITCODE_IOAPIC_EOI,
	VM_EXITCODE_SUSPENDED,
	VM_EXITCODE_INOUT_STR,
	VM_EXITCODE_MAX
};

struct vm_inout {
	uint16_t	bytes:3;	/* 1 or 2 or 4 */
	uint16_t	in:1;
	uint16_t	string:1;
	uint16_t	rep:1;
	uint16_t	port;
	uint32_t	eax;		/* valid for out */
};

struct vm_inout_str {
	struct vm_inout	inout;		/* must be the first element */
	struct vm_guest_paging paging;
	uint64_t	rflags;
	uint64_t	cr0;
	uint64_t	index;
	uint64_t	count;		/* rep=1 (%rcx), rep=0 (1) */
	int		addrsize;
	enum vm_reg_name seg_name;
	struct seg_desc seg_desc;
};

struct vm_exit {
	enum vm_exitcode	exitcode;
	int			inst_length;	/* 0 means unknown */
	uint64_t		rip;
	union {
		struct vm_inout	inout;
		struct vm_inout_str inout_str;
		struct {
			uint64_t	gpa;
			int		fault_type;
		} paging;
		struct {
			uint64_t	gpa;
			uint64_t	gla;
			struct vm_guest_paging paging;
			struct vie	vie;
		} inst_emul;
		/*
		 * VMX specific payload. Used when there is no "better"
		 * exitcode to represent the VM-exit.
		 */
		struct {
			int		status;		/* vmx inst status */
			/*
			 * 'exit_reason' and 'exit_qualification' are valid
			 * only if 'status' is zero.
			 */
			uint32_t	exit_reason;
			uint64_t	exit_qualification;
			/*
			 * 'inst_error' and 'inst_type' are valid
			 * only if 'status' is non-zero.
			 */
			int		inst_type;
			int		inst_error;
		} vmx;
		struct {
			uint32_t	code;		/* ecx value */
			uint64_t	wval;
		} msr;
		struct {
			int		vcpu;
			uint64_t	rip;
		} spinup_ap;
		struct {
			uint64_t	rflags;
		} hlt;
		struct {
			int		vector;
		} ioapic_eoi;
		struct {
			enum vm_suspend_how how;
		} suspended;
	} u;
};
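
/*
 * After vm_run() returns, a userspace monitor typically dispatches on
 * 'exitcode'; a bhyve-style sketch (the handler names are hypothetical):
 *
 *	switch (vmexit->exitcode) {
 *	case VM_EXITCODE_INOUT:
 *		error = emulate_inout(ctx, vcpu, vmexit);
 *		break;
 *	case VM_EXITCODE_HLT:
 *		error = handle_hlt(ctx, vcpu, vmexit);
 *		break;
 *	default:
 *		...
 *	}
 */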

#endif	/* _VMM_H_ */
497