vmm_instruction_emul.c revision 268976
1/*-
2 * Copyright (c) 2012 Sandvine, Inc.
3 * Copyright (c) 2012 NetApp, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_instruction_emul.c 268976 2014-07-22 04:39:16Z jhb $
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_instruction_emul.c 268976 2014-07-22 04:39:16Z jhb $");
32
33#ifdef _KERNEL
34#include <sys/param.h>
35#include <sys/pcpu.h>
36#include <sys/systm.h>
37#include <sys/proc.h>
38
39#include <vm/vm.h>
40#include <vm/pmap.h>
41
42#include <machine/vmparam.h>
43#include <machine/vmm.h>
44#else	/* !_KERNEL */
45#include <sys/types.h>
46#include <sys/errno.h>
47
48#include <machine/vmm.h>
49
50#include <assert.h>
51#include <vmmapi.h>
52#define	KASSERT(exp,msg)	assert((exp))
53#endif	/* _KERNEL */
54
55#include <machine/vmm_instruction_emul.h>
56#include <x86/psl.h>
57#include <x86/specialreg.h>
58
59/* struct vie_op.op_type */
60enum {
61	VIE_OP_TYPE_NONE = 0,
62	VIE_OP_TYPE_MOV,
63	VIE_OP_TYPE_MOVSX,
64	VIE_OP_TYPE_MOVZX,
65	VIE_OP_TYPE_AND,
66	VIE_OP_TYPE_OR,
67	VIE_OP_TYPE_TWO_BYTE,
68	VIE_OP_TYPE_LAST
69};
70
71/* struct vie_op.op_flags */
72#define	VIE_OP_F_IMM		(1 << 0)	/* immediate operand present */
73#define	VIE_OP_F_IMM8		(1 << 1)	/* 8-bit immediate operand */
74
75static const struct vie_op two_byte_opcodes[256] = {
76	[0xB6] = {
77		.op_byte = 0xB6,
78		.op_type = VIE_OP_TYPE_MOVZX,
79	},
80	[0xBE] = {
81		.op_byte = 0xBE,
82		.op_type = VIE_OP_TYPE_MOVSX,
83	},
84};
85
86static const struct vie_op one_byte_opcodes[256] = {
87	[0x0F] = {
88		.op_byte = 0x0F,
89		.op_type = VIE_OP_TYPE_TWO_BYTE
90	},
91	[0x88] = {
92		.op_byte = 0x88,
93		.op_type = VIE_OP_TYPE_MOV,
94	},
95	[0x89] = {
96		.op_byte = 0x89,
97		.op_type = VIE_OP_TYPE_MOV,
98	},
99	[0x8A] = {
100		.op_byte = 0x8A,
101		.op_type = VIE_OP_TYPE_MOV,
102	},
103	[0x8B] = {
104		.op_byte = 0x8B,
105		.op_type = VIE_OP_TYPE_MOV,
106	},
107	[0xC7] = {
108		.op_byte = 0xC7,
109		.op_type = VIE_OP_TYPE_MOV,
110		.op_flags = VIE_OP_F_IMM,
111	},
112	[0x23] = {
113		.op_byte = 0x23,
114		.op_type = VIE_OP_TYPE_AND,
115	},
116	[0x81] = {
117		/* XXX Group 1 extended opcode - not just AND */
118		.op_byte = 0x81,
119		.op_type = VIE_OP_TYPE_AND,
120		.op_flags = VIE_OP_F_IMM,
121	},
122	[0x83] = {
123		/* XXX Group 1 extended opcode - not just OR */
124		.op_byte = 0x83,
125		.op_type = VIE_OP_TYPE_OR,
126		.op_flags = VIE_OP_F_IMM8,
127	},
128};
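/*
 * Opcode bytes that are not listed above decode to an all-zero entry,
 * i.e. op_type == VIE_OP_TYPE_NONE, which causes decode_opcode() to
 * reject the instruction.  For example, a two-byte 'movzx' such as
 * "0f b6 03" is dispatched through the 0x0F entry to two_byte_opcodes[].
 */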
129
130/* struct vie.mod */
131#define	VIE_MOD_INDIRECT		0
132#define	VIE_MOD_INDIRECT_DISP8		1
133#define	VIE_MOD_INDIRECT_DISP32		2
134#define	VIE_MOD_DIRECT			3
135
136/* struct vie.rm */
137#define	VIE_RM_SIB			4
138#define	VIE_RM_DISP32			5
139
140#define	GB				(1024 * 1024 * 1024)
141
142static enum vm_reg_name gpr_map[16] = {
143	VM_REG_GUEST_RAX,
144	VM_REG_GUEST_RCX,
145	VM_REG_GUEST_RDX,
146	VM_REG_GUEST_RBX,
147	VM_REG_GUEST_RSP,
148	VM_REG_GUEST_RBP,
149	VM_REG_GUEST_RSI,
150	VM_REG_GUEST_RDI,
151	VM_REG_GUEST_R8,
152	VM_REG_GUEST_R9,
153	VM_REG_GUEST_R10,
154	VM_REG_GUEST_R11,
155	VM_REG_GUEST_R12,
156	VM_REG_GUEST_R13,
157	VM_REG_GUEST_R14,
158	VM_REG_GUEST_R15
159};
160
161static uint64_t size2mask[] = {
162	[1] = 0xff,
163	[2] = 0xffff,
164	[4] = 0xffffffff,
165	[8] = 0xffffffffffffffff,
166};
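/*
 * size2mask[] maps an operand size in bytes to a mask of the low-order
 * bits that the operand occupies.  For example, a 2-byte operand uses
 * size2mask[2] == 0xffff so that only bits 15:0 of a register value or
 * immediate are significant; see vie_update_register() below.
 */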
167
168static int
169vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
170{
171	int error;
172
173	error = vm_get_register(vm, vcpuid, reg, rval);
174
175	return (error);
176}
177
178static int
179vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
180{
181	uint64_t val;
182	int error, rshift;
183	enum vm_reg_name reg;
184
185	rshift = 0;
186	reg = gpr_map[vie->reg];
187
188	/*
189	 * 64-bit mode imposes limitations on accessing legacy byte registers.
190	 *
191	 * The legacy high-byte registers cannot be addressed if the REX
192	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
193	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
194	 *
195	 * If the REX prefix is not present then the values 4, 5, 6 and 7
196	 * of the 'ModRM:reg' field address the legacy high-byte registers,
197	 * %ah, %ch, %dh and %bh respectively.
198	 */
199	if (!vie->rex_present) {
200		if (vie->reg & 0x4) {
201			/*
202			 * Obtain the value of %ah by reading %rax and shifting
203			 * right by 8 bits (same for %bh, %ch and %dh).
204			 */
205			rshift = 8;
206			reg = gpr_map[vie->reg & 0x3];
207		}
208	}
209
210	error = vm_get_register(vm, vcpuid, reg, &val);
211	*rval = val >> rshift;
212	return (error);
213}
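/*
 * Example: with no REX prefix and ModRM:reg = 5 the code above reads
 * %rcx (gpr_map[5 & 0x3]) and shifts right by 8, yielding %ch.  With a
 * REX prefix present the same encoding reads %rbp with no shift, i.e.
 * the low byte %bpl.
 */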
214
215int
216vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
217		    uint64_t val, int size)
218{
219	int error;
220	uint64_t origval;
221
222	switch (size) {
223	case 1:
224	case 2:
225		error = vie_read_register(vm, vcpuid, reg, &origval);
226		if (error)
227			return (error);
228		val &= size2mask[size];
229		val |= origval & ~size2mask[size];
230		break;
231	case 4:
232		val &= 0xffffffffUL;
233		break;
234	case 8:
235		break;
236	default:
237		return (EINVAL);
238	}
239
240	error = vm_set_register(vm, vcpuid, reg, val);
241	return (error);
242}
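/*
 * The size handling above mirrors x86 partial-register semantics: 1 and
 * 2 byte writes leave the upper bits of the destination untouched, a
 * 4-byte write zero-extends into the upper 32 bits, and an 8-byte write
 * replaces the register entirely.
 */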
243
244/*
245 * The following simplifying assumptions are made during emulation:
246 *
247 * - guest is in 64-bit mode
248 *   - default address size is 64-bits
249 *   - default operand size is 32-bits
250 *
251 * - operand size override is not supported
252 *
253 * - address size override is not supported
254 */
255static int
256emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
257	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
258{
259	int error, size;
260	enum vm_reg_name reg;
261	uint8_t byte;
262	uint64_t val;
263
264	size = 4;
265	error = EINVAL;
266
267	switch (vie->op.op_byte) {
268	case 0x88:
269		/*
270		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
271		 * 88/r:	mov r/m8, r8
272		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
273		 */
274		size = 1;
275		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
276		if (error == 0)
277			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
278		break;
279	case 0x89:
280		/*
281		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
282		 * 89/r:	mov r/m32, r32
283		 * REX.W + 89/r	mov r/m64, r64
284		 */
285		if (vie->rex_w)
286			size = 8;
287		reg = gpr_map[vie->reg];
288		error = vie_read_register(vm, vcpuid, reg, &val);
289		if (error == 0) {
290			val &= size2mask[size];
291			error = memwrite(vm, vcpuid, gpa, val, size, arg);
292		}
293		break;
294	case 0x8A:
295	case 0x8B:
296		/*
297		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
298		 * 8A/r:	mov r8, r/m8
299		 * REX + 8A/r:	mov r8, r/m8
300		 * 8B/r:	mov r32, r/m32
301		 * REX.W + 8B/r: mov r64, r/m64
302		 */
303		if (vie->op.op_byte == 0x8A)
304			size = 1;
305		else if (vie->rex_w)
306			size = 8;
307		error = memread(vm, vcpuid, gpa, &val, size, arg);
308		if (error == 0) {
309			reg = gpr_map[vie->reg];
310			error = vie_update_register(vm, vcpuid, reg, val, size);
311		}
312		break;
313	case 0xC7:
314		/*
315		 * MOV from imm32 to mem (ModRM:r/m)
316		 * C7/0		mov r/m32, imm32
317		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
318		 */
319		val = vie->immediate;		/* already sign-extended */
320
321		if (vie->rex_w)
322			size = 8;
323
324		if (size != 8)
325			val &= size2mask[size];
326
327		error = memwrite(vm, vcpuid, gpa, val, size, arg);
328		break;
329	default:
330		break;
331	}
332
333	return (error);
334}
335
336/*
337 * The following simplifying assumptions are made during emulation:
338 *
339 * - guest is in 64-bit mode
340 *   - default address size is 64-bits
341 *   - default operand size is 32-bits
342 *
343 * - operand size override is not supported
344 *
345 * - address size override is not supported
346 */
347static int
348emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
349	     mem_region_read_t memread, mem_region_write_t memwrite,
350	     void *arg)
351{
352	int error, size;
353	enum vm_reg_name reg;
354	uint64_t val;
355
356	size = 4;
357	error = EINVAL;
358
359	switch (vie->op.op_byte) {
360	case 0xB6:
361		/*
362		 * MOV and zero extend byte from mem (ModRM:r/m) to
363		 * reg (ModRM:reg).
364		 *
365		 * 0F B6/r		movzx r32, r/m8
366		 * REX.W + 0F B6/r	movzx r64, r/m8
367		 */
368
369		/* get the first operand */
370		error = memread(vm, vcpuid, gpa, &val, 1, arg);
371		if (error)
372			break;
373
374		/* get the second operand */
375		reg = gpr_map[vie->reg];
376
377		if (vie->rex_w)
378			size = 8;
379
380		/* write the result */
381		error = vie_update_register(vm, vcpuid, reg, val, size);
382		break;
383	case 0xBE:
384		/*
385		 * MOV and sign extend byte from mem (ModRM:r/m) to
386		 * reg (ModRM:reg).
387		 *
388		 * 0F BE/r		movsx r32, r/m8
389		 * REX.W + 0F BE/r	movsx r64, r/m8
390		 */
391
392		/* get the first operand */
393		error = memread(vm, vcpuid, gpa, &val, 1, arg);
394		if (error)
395			break;
396
397		/* get the second operand */
398		reg = gpr_map[vie->reg];
399
400		if (vie->rex_w)
401			size = 8;
402
403		/* sign extend byte */
404		val = (int8_t)val;
405
406		/* write the result */
407		error = vie_update_register(vm, vcpuid, reg, val, size);
408		break;
409	default:
410		break;
411	}
412	return (error);
413}
414
415static int
416emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
417	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
418{
419	int error, size;
420	enum vm_reg_name reg;
421	uint64_t val1, val2;
422
423	size = 4;
424	error = EINVAL;
425
426	switch (vie->op.op_byte) {
427	case 0x23:
428		/*
429		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
430		 * result in reg.
431		 *
432		 * 23/r		and r32, r/m32
433		 * REX.W + 23/r	and r64, r/m64
434		 */
435		if (vie->rex_w)
436			size = 8;
437
438		/* get the first operand */
439		reg = gpr_map[vie->reg];
440		error = vie_read_register(vm, vcpuid, reg, &val1);
441		if (error)
442			break;
443
444		/* get the second operand */
445		error = memread(vm, vcpuid, gpa, &val2, size, arg);
446		if (error)
447			break;
448
449		/* perform the operation and write the result */
450		val1 &= val2;
451		error = vie_update_register(vm, vcpuid, reg, val1, size);
452		break;
453	case 0x81:
454		/*
455		 * AND mem (ModRM:r/m) with immediate and store the
456		 * result in mem.
457		 *
458		 * 81 /4		and r/m32, imm32
459		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
460		 *
461		 * Currently, only the AND operation of the 0x81 opcode
462		 * is implemented (ModRM:reg = b100).
463		 */
464		if ((vie->reg & 7) != 4)
465			break;
466
467		if (vie->rex_w)
468			size = 8;
469
470		/* get the first operand */
471		error = memread(vm, vcpuid, gpa, &val1, size, arg);
472		if (error)
473			break;
474
475		/*
476		 * perform the operation with the pre-fetched immediate
477		 * operand and write the result
478		 */
479		val1 &= vie->immediate;
480		error = memwrite(vm, vcpuid, gpa, val1, size, arg);
481		break;
482	default:
483		break;
484	}
485	return (error);
486}
487
488static int
489emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
490	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
491{
492	int error, size;
493	uint64_t val1;
494
495	size = 4;
496	error = EINVAL;
497
498	switch (vie->op.op_byte) {
499	case 0x83:
500		/*
501		 * OR mem (ModRM:r/m) with immediate and store the
502		 * result in mem.
503		 *
504		 * 83 /1		OR r/m32, imm8 sign-extended to 32
505		 * REX.W + 83 /1	OR r/m64, imm8 sign-extended to 64
506		 *
507		 * Currently, only the OR operation of the 0x83 opcode
508		 * is implemented (ModRM:reg = b001).
509		 */
510		if ((vie->reg & 7) != 1)
511			break;
512
513		if (vie->rex_w)
514			size = 8;
515
516		/* get the first operand */
517		error = memread(vm, vcpuid, gpa, &val1, size, arg);
518		if (error)
519			break;
520
521		/*
522		 * perform the operation with the pre-fetched immediate
523		 * operand and write the result
524		 */
525		val1 |= vie->immediate;
526		error = memwrite(vm, vcpuid, gpa, val1, size, arg);
527		break;
528	default:
529		break;
530	}
531	return (error);
532}
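/*
 * Note that the AND and OR emulations above do not modify the guest's
 * RFLAGS, even though the corresponding hardware instructions update
 * ZF, SF, PF, CF and OF.
 */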
533
534int
535vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
536			mem_region_read_t memread, mem_region_write_t memwrite,
537			void *memarg)
538{
539	int error;
540
541	if (!vie->decoded)
542		return (EINVAL);
543
544	switch (vie->op.op_type) {
545	case VIE_OP_TYPE_MOV:
546		error = emulate_mov(vm, vcpuid, gpa, vie,
547				    memread, memwrite, memarg);
548		break;
549	case VIE_OP_TYPE_MOVSX:
550	case VIE_OP_TYPE_MOVZX:
551		error = emulate_movx(vm, vcpuid, gpa, vie,
552				     memread, memwrite, memarg);
553		break;
554	case VIE_OP_TYPE_AND:
555		error = emulate_and(vm, vcpuid, gpa, vie,
556				    memread, memwrite, memarg);
557		break;
558	case VIE_OP_TYPE_OR:
559		error = emulate_or(vm, vcpuid, gpa, vie,
560				    memread, memwrite, memarg);
561		break;
562	default:
563		error = EINVAL;
564		break;
565	}
566
567	return (error);
568}
569
570int
571vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
572{
573	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
574	    ("%s: invalid size %d", __func__, size));
575	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
576
577	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
578		return (0);
579
580	return ((gla & (size - 1)) ? 1 : 0);
581}
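/*
 * Alignment checking is only armed when CPL == 3, CR0.AM is set and
 * RFLAGS.AC is set.  For example, with those conditions met a 4-byte
 * access to gla 0x1002 returns 1 (misaligned) while an access to
 * gla 0x1004 returns 0.
 */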
582
583int
584vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
585{
586	uint64_t mask;
587
588	if (cpu_mode != CPU_MODE_64BIT)
589		return (0);
590
591	/*
592	 * For 'gla' to be canonical, the value of bit 47 must be replicated
593	 * in the most significant 16 bits (bits 63:48).
594	 */
595	mask = ~((1UL << 48) - 1);
596	if (gla & (1UL << 47))
597		return ((gla & mask) != mask);
598	else
599		return ((gla & mask) != 0);
600}
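/*
 * For example, 0xffff800000000000 and 0x00007fffffffffff are canonical
 * (bits 63:47 are all ones or all zeroes), while 0x0000800000000000 is
 * not, so the function returns non-zero for the latter.
 */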
601
602uint64_t
603vie_size2mask(int size)
604{
605	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
606	    ("vie_size2mask: invalid size %d", size));
607	return (size2mask[size]);
608}
609
610int
611vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
612    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
613    int prot, uint64_t *gla)
614{
615	uint64_t firstoff, low_limit, high_limit, segbase;
616	int glasize, type;
617
618	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
619	    ("%s: invalid segment %d", __func__, seg));
620	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
621	    ("%s: invalid operand size %d", __func__, length));
622	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
623	    ("%s: invalid prot %#x", __func__, prot));
624
625	firstoff = offset;
626	if (cpu_mode == CPU_MODE_64BIT) {
627		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
628		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
629		glasize = 8;
630	} else {
631		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
632		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
633		glasize = 4;
634		/*
635		 * If the segment selector is loaded with a NULL selector
636		 * then the descriptor is unusable and attempting to use
637		 * it results in a #GP(0).
638		 */
639		if (SEG_DESC_UNUSABLE(desc))
640			return (-1);
641
642		/*
643		 * The processor generates a #NP exception when a segment
644		 * register is loaded with a selector that points to a
645		 * descriptor that is not present. If this was the case then
646		 * it would have been checked before the VM-exit.
647		 */
648		KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x",
649		    seg, desc->access));
650
651		/*
652		 * The descriptor type must indicate a code/data segment.
653		 */
654		type = SEG_DESC_TYPE(desc);
655		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
656		    "descriptor type %#x", seg, type));
657
658		if (prot & PROT_READ) {
659			/* #GP on a read access to an execute-only code segment */
660			if ((type & 0xA) == 0x8)
661				return (-1);
662		}
663
664		if (prot & PROT_WRITE) {
665			/*
666			 * #GP on a write access to a code segment or a
667			 * read-only data segment.
668			 */
669			if (type & 0x8)			/* code segment */
670				return (-1);
671
672			if ((type & 0xA) == 0)		/* read-only data seg */
673				return (-1);
674		}
675
676		/*
677		 * 'desc->limit' is fully expanded taking granularity into
678		 * account.
679		 */
680		if ((type & 0xC) == 0x4) {
681			/* expand-down data segment */
682			low_limit = desc->limit + 1;
683			high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff;
684		} else {
685			/* code segment or expand-up data segment */
686			low_limit = 0;
687			high_limit = desc->limit;
688		}
689
690		while (length > 0) {
691			offset &= vie_size2mask(addrsize);
692			if (offset < low_limit || offset > high_limit)
693				return (-1);
694			offset++;
695			length--;
696		}
697	}
698
699	/*
700	 * In 64-bit mode all segments except %fs and %gs have a segment
701	 * base address of 0.
702	 */
703	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
704	    seg != VM_REG_GUEST_GS) {
705		segbase = 0;
706	} else {
707		segbase = desc->base;
708	}
709
710	/*
711	 * Truncate 'firstoff' to the effective address size before adding
712	 * it to the segment base.
713	 */
714	firstoff &= vie_size2mask(addrsize);
715	*gla = (segbase + firstoff) & vie_size2mask(glasize);
716	return (0);
717}
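/*
 * vie_calculate_gla() returns 0 with the linear address in '*gla' on
 * success and -1 if the access would fault (unusable segment, an access
 * not permitted by the descriptor type, or a segment limit violation).
 * The function does not inject the exception itself.
 */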
718
719#ifdef _KERNEL
720void
721vie_init(struct vie *vie)
722{
723
724	bzero(vie, sizeof(struct vie));
725
726	vie->base_register = VM_REG_LAST;
727	vie->index_register = VM_REG_LAST;
728}
729
730static int
731pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
732{
733	int error_code = 0;
734
735	if (pte & PG_V)
736		error_code |= PGEX_P;
737	if (prot & VM_PROT_WRITE)
738		error_code |= PGEX_W;
739	if (usermode)
740		error_code |= PGEX_U;
741	if (rsvd)
742		error_code |= PGEX_RSV;
743	if (prot & VM_PROT_EXECUTE)
744		error_code |= PGEX_I;
745
746	return (error_code);
747}
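/*
 * The returned value follows the x86 page-fault error code layout: for
 * example, a user-mode write denied by a present, read-only pte yields
 * PGEX_P | PGEX_W | PGEX_U.
 */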
748
749static void
750ptp_release(void **cookie)
751{
752	if (*cookie != NULL) {
753		vm_gpa_release(*cookie);
754		*cookie = NULL;
755	}
756}
757
758static void *
759ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
760{
761	void *ptr;
762
763	ptp_release(cookie);
764	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
765	return (ptr);
766}
767
768int
769vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
770    uint64_t gla, int prot, uint64_t *gpa)
771{
772	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
773	u_int retries;
774	uint64_t *ptpbase, ptpphys, pte, pgsize;
775	uint32_t *ptpbase32, pte32;
776	void *cookie;
777
778	usermode = (paging->cpl == 3 ? 1 : 0);
779	writable = prot & VM_PROT_WRITE;
780	cookie = NULL;
781	retval = 0;
782	retries = 0;
783restart:
784	ptpphys = paging->cr3;		/* root of the page tables */
785	ptp_release(&cookie);
786	if (retries++ > 0)
787		maybe_yield();
788
789	if (vie_canonical_check(paging->cpu_mode, gla)) {
790		/*
791		 * XXX assuming a non-stack reference; otherwise a stack fault
792		 * (#SS) should be generated instead.
793		 */
794		vm_inject_gp(vm, vcpuid);
795		goto fault;
796	}
797
798	if (paging->paging_mode == PAGING_MODE_FLAT) {
799		*gpa = gla;
800		goto done;
801	}
802
803	if (paging->paging_mode == PAGING_MODE_32) {
804		nlevels = 2;
805		while (--nlevels >= 0) {
806			/* Zero out the lower 12 bits. */
807			ptpphys &= ~0xfff;
808
809			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
810
811			if (ptpbase32 == NULL)
812				goto error;
813
814			ptpshift = PAGE_SHIFT + nlevels * 10;
815			ptpindex = (gla >> ptpshift) & 0x3FF;
816			pgsize = 1UL << ptpshift;
817
818			pte32 = ptpbase32[ptpindex];
819
820			if ((pte32 & PG_V) == 0 ||
821			    (usermode && (pte32 & PG_U) == 0) ||
822			    (writable && (pte32 & PG_RW) == 0)) {
823				pfcode = pf_error_code(usermode, prot, 0,
824				    pte32);
825				vm_inject_pf(vm, vcpuid, pfcode, gla);
826				goto fault;
827			}
828
829			/*
830			 * Emulate the x86 MMU's management of the accessed
831			 * and dirty flags. While the accessed flag is set
832			 * at every level of the page table, the dirty flag
833			 * is only set at the last level providing the guest
834			 * physical address.
835			 */
836			if ((pte32 & PG_A) == 0) {
837				if (atomic_cmpset_32(&ptpbase32[ptpindex],
838				    pte32, pte32 | PG_A) == 0) {
839					goto restart;
840				}
841			}
842
843			/* XXX must be ignored if CR4.PSE=0 */
844			if (nlevels > 0 && (pte32 & PG_PS) != 0)
845				break;
846
847			ptpphys = pte32;
848		}
849
850		/* Set the dirty bit in the page table entry if necessary */
851		if (writable && (pte32 & PG_M) == 0) {
852			if (atomic_cmpset_32(&ptpbase32[ptpindex],
853			    pte32, pte32 | PG_M) == 0) {
854				goto restart;
855			}
856		}
857
858		/* Zero out the lower 'ptpshift' bits */
859		pte32 >>= ptpshift; pte32 <<= ptpshift;
860		*gpa = pte32 | (gla & (pgsize - 1));
861		goto done;
862	}
863
864	if (paging->paging_mode == PAGING_MODE_PAE) {
865		/* Zero out the lower 5 bits and the upper 32 bits */
866		ptpphys &= 0xffffffe0UL;
867
868		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
869		if (ptpbase == NULL)
870			goto error;
871
872		ptpindex = (gla >> 30) & 0x3;
873
874		pte = ptpbase[ptpindex];
875
876		if ((pte & PG_V) == 0) {
877			pfcode = pf_error_code(usermode, prot, 0, pte);
878			vm_inject_pf(vm, vcpuid, pfcode, gla);
879			goto fault;
880		}
881
882		ptpphys = pte;
883
884		nlevels = 2;
885	} else
886		nlevels = 4;
887	while (--nlevels >= 0) {
888		/* Zero out the lower 12 bits and the upper 12 bits */
889		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
890
891		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
892		if (ptpbase == NULL)
893			goto error;
894
895		ptpshift = PAGE_SHIFT + nlevels * 9;
896		ptpindex = (gla >> ptpshift) & 0x1FF;
897		pgsize = 1UL << ptpshift;
898
899		pte = ptpbase[ptpindex];
900
901		if ((pte & PG_V) == 0 ||
902		    (usermode && (pte & PG_U) == 0) ||
903		    (writable && (pte & PG_RW) == 0)) {
904			pfcode = pf_error_code(usermode, prot, 0, pte);
905			vm_inject_pf(vm, vcpuid, pfcode, gla);
906			goto fault;
907		}
908
909		/* Set the accessed bit in the page table entry */
910		if ((pte & PG_A) == 0) {
911			if (atomic_cmpset_64(&ptpbase[ptpindex],
912			    pte, pte | PG_A) == 0) {
913				goto restart;
914			}
915		}
916
917		if (nlevels > 0 && (pte & PG_PS) != 0) {
918			if (pgsize > 1 * GB) {
919				pfcode = pf_error_code(usermode, prot, 1, pte);
920				vm_inject_pf(vm, vcpuid, pfcode, gla);
921				goto fault;
922			}
923			break;
924		}
925
926		ptpphys = pte;
927	}
928
929	/* Set the dirty bit in the page table entry if necessary */
930	if (writable && (pte & PG_M) == 0) {
931		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
932			goto restart;
933	}
934
935	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
936	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
937	*gpa = pte | (gla & (pgsize - 1));
938done:
939	ptp_release(&cookie);
940	return (retval);
941error:
942	retval = -1;
943	goto done;
944fault:
945	retval = 1;
946	goto done;
947}
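/*
 * vmm_gla2gpa() returns 0 when the translation succeeds, 1 when a fault
 * (#GP or #PF) was injected into the guest, and -1 when a page table
 * page could not be held.
 */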
948
949int
950vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging,
951    uint64_t rip, int inst_length, struct vie *vie)
952{
953	int n, error, prot;
954	uint64_t gpa, off;
955	void *hpa, *cookie;
956
957	/*
958	 * XXX cache previously fetched instructions using 'rip' as the tag
959	 */
960
961	prot = VM_PROT_READ | VM_PROT_EXECUTE;
962	if (inst_length > VIE_INST_SIZE)
963		panic("vmm_fetch_instruction: invalid length %d", inst_length);
964
965	/* Copy the instruction into 'vie' */
966	while (vie->num_valid < inst_length) {
967		error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa);
968		if (error)
969			return (error);
970
971		off = gpa & PAGE_MASK;
972		n = min(inst_length - vie->num_valid, PAGE_SIZE - off);
973
974		if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL)
975			break;
976
977		bcopy(hpa, &vie->inst[vie->num_valid], n);
978
979		vm_gpa_release(cookie);
980
981		rip += n;
982		vie->num_valid += n;
983	}
984
985	if (vie->num_valid == inst_length)
986		return (0);
987	else
988		return (-1);
989}
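/*
 * vmm_fetch_instruction() returns 0 once 'inst_length' bytes have been
 * copied into vie->inst[], propagates a non-zero vmm_gla2gpa() return,
 * and returns -1 if the instruction bytes could not be held and copied.
 */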
990
991static int
992vie_peek(struct vie *vie, uint8_t *x)
993{
994
995	if (vie->num_processed < vie->num_valid) {
996		*x = vie->inst[vie->num_processed];
997		return (0);
998	} else
999		return (-1);
1000}
1001
1002static void
1003vie_advance(struct vie *vie)
1004{
1005
1006	vie->num_processed++;
1007}
1008
1009static int
1010decode_rex(struct vie *vie)
1011{
1012	uint8_t x;
1013
1014	if (vie_peek(vie, &x))
1015		return (-1);
1016
1017	if (x >= 0x40 && x <= 0x4F) {
1018		vie->rex_present = 1;
1019
1020		vie->rex_w = x & 0x8 ? 1 : 0;
1021		vie->rex_r = x & 0x4 ? 1 : 0;
1022		vie->rex_x = x & 0x2 ? 1 : 0;
1023		vie->rex_b = x & 0x1 ? 1 : 0;
1024
1025		vie_advance(vie);
1026	}
1027
1028	return (0);
1029}
1030
1031static int
1032decode_two_byte_opcode(struct vie *vie)
1033{
1034	uint8_t x;
1035
1036	if (vie_peek(vie, &x))
1037		return (-1);
1038
1039	vie->op = two_byte_opcodes[x];
1040
1041	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1042		return (-1);
1043
1044	vie_advance(vie);
1045	return (0);
1046}
1047
1048static int
1049decode_opcode(struct vie *vie)
1050{
1051	uint8_t x;
1052
1053	if (vie_peek(vie, &x))
1054		return (-1);
1055
1056	vie->op = one_byte_opcodes[x];
1057
1058	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1059		return (-1);
1060
1061	vie_advance(vie);
1062
1063	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
1064		return (decode_two_byte_opcode(vie));
1065
1066	return (0);
1067}
1068
1069static int
1070decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
1071{
1072	uint8_t x;
1073
1074	if (vie_peek(vie, &x))
1075		return (-1);
1076
1077	vie->mod = (x >> 6) & 0x3;
1078	vie->rm =  (x >> 0) & 0x7;
1079	vie->reg = (x >> 3) & 0x7;
1080
1081	/*
1082	 * A direct addressing mode makes no sense in the context of an EPT
1083	 * fault. There has to be a memory access involved to cause the
1084	 * EPT fault.
1085	 */
1086	if (vie->mod == VIE_MOD_DIRECT)
1087		return (-1);
1088
1089	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
1090	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
1091		/*
1092		 * Table 2-5: Special Cases of REX Encodings
1093		 *
1094		 * mod=0, r/m=5 is used in the compatibility mode to
1095		 * indicate a disp32 without a base register.
1096		 *
1097		 * mod!=3, r/m=4 is used in the compatibility mode to
1098		 * indicate that the SIB byte is present.
1099		 *
1100		 * The 'b' bit in the REX prefix is don't care in
1101		 * this case.
1102		 */
1103	} else {
1104		vie->rm |= (vie->rex_b << 3);
1105	}
1106
1107	vie->reg |= (vie->rex_r << 3);
1108
1109	/* SIB */
1110	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
1111		goto done;
1112
1113	vie->base_register = gpr_map[vie->rm];
1114
1115	switch (vie->mod) {
1116	case VIE_MOD_INDIRECT_DISP8:
1117		vie->disp_bytes = 1;
1118		break;
1119	case VIE_MOD_INDIRECT_DISP32:
1120		vie->disp_bytes = 4;
1121		break;
1122	case VIE_MOD_INDIRECT:
1123		if (vie->rm == VIE_RM_DISP32) {
1124			vie->disp_bytes = 4;
1125			/*
1126			 * Table 2-7. RIP-Relative Addressing
1127			 *
1128			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
1129			 * whereas in compatibility mode it just implies disp32.
1130			 */
1131
1132			if (cpu_mode == CPU_MODE_64BIT)
1133				vie->base_register = VM_REG_GUEST_RIP;
1134			else
1135				vie->base_register = VM_REG_LAST;
1136		}
1137		break;
1138	}
1139
1140done:
1141	vie_advance(vie);
1142
1143	return (0);
1144}
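/*
 * Example: for "mov %eax,(%rbx)" the bytes are 0x89 0x03.  The ModRM
 * byte 0x03 decodes to mod = 0 (indirect), reg = 0 (%rax as the source)
 * and rm = 3, so base_register becomes VM_REG_GUEST_RBX with no
 * displacement and no SIB byte.
 */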
1145
1146static int
1147decode_sib(struct vie *vie)
1148{
1149	uint8_t x;
1150
1151	/* Proceed only if SIB byte is present */
1152	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
1153		return (0);
1154
1155	if (vie_peek(vie, &x))
1156		return (-1);
1157
1158	/* De-construct the SIB byte */
1159	vie->ss = (x >> 6) & 0x3;
1160	vie->index = (x >> 3) & 0x7;
1161	vie->base = (x >> 0) & 0x7;
1162
1163	/* Apply the REX prefix modifiers */
1164	vie->index |= vie->rex_x << 3;
1165	vie->base |= vie->rex_b << 3;
1166
1167	switch (vie->mod) {
1168	case VIE_MOD_INDIRECT_DISP8:
1169		vie->disp_bytes = 1;
1170		break;
1171	case VIE_MOD_INDIRECT_DISP32:
1172		vie->disp_bytes = 4;
1173		break;
1174	}
1175
1176	if (vie->mod == VIE_MOD_INDIRECT &&
1177	    (vie->base == 5 || vie->base == 13)) {
1178		/*
1179		 * Special case when base register is unused if mod = 0
1180		 * and base = %rbp or %r13.
1181		 *
1182		 * Documented in:
1183		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1184		 * Table 2-5: Special Cases of REX Encodings
1185		 */
1186		vie->disp_bytes = 4;
1187	} else {
1188		vie->base_register = gpr_map[vie->base];
1189	}
1190
1191	/*
1192	 * All encodings of 'index' are valid except for %rsp (4).
1193	 *
1194	 * Documented in:
1195	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1196	 * Table 2-5: Special Cases of REX Encodings
1197	 */
1198	if (vie->index != 4)
1199		vie->index_register = gpr_map[vie->index];
1200
1201	/* 'scale' makes sense only in the context of an index register */
1202	if (vie->index_register < VM_REG_LAST)
1203		vie->scale = 1 << vie->ss;
1204
1205	vie_advance(vie);
1206
1207	return (0);
1208}
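/*
 * Example: a SIB byte of 0x88 (with no REX prefix) decodes to ss = 2,
 * index = 1 and base = 0, i.e. a scale of 4, index register %rcx and
 * base register %rax.
 */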
1209
1210static int
1211decode_displacement(struct vie *vie)
1212{
1213	int n, i;
1214	uint8_t x;
1215
1216	union {
1217		char	buf[4];
1218		int8_t	signed8;
1219		int32_t	signed32;
1220	} u;
1221
1222	if ((n = vie->disp_bytes) == 0)
1223		return (0);
1224
1225	if (n != 1 && n != 4)
1226		panic("decode_displacement: invalid disp_bytes %d", n);
1227
1228	for (i = 0; i < n; i++) {
1229		if (vie_peek(vie, &x))
1230			return (-1);
1231
1232		u.buf[i] = x;
1233		vie_advance(vie);
1234	}
1235
1236	if (n == 1)
1237		vie->displacement = u.signed8;		/* sign-extended */
1238	else
1239		vie->displacement = u.signed32;		/* sign-extended */
1240
1241	return (0);
1242}
1243
1244static int
1245decode_immediate(struct vie *vie)
1246{
1247	int i, n;
1248	uint8_t x;
1249	union {
1250		char	buf[4];
1251		int8_t	signed8;
1252		int32_t	signed32;
1253	} u;
1254
1255	/* Figure out immediate operand size (if any) */
1256	if (vie->op.op_flags & VIE_OP_F_IMM)
1257		vie->imm_bytes = 4;
1258	else if (vie->op.op_flags & VIE_OP_F_IMM8)
1259		vie->imm_bytes = 1;
1260
1261	if ((n = vie->imm_bytes) == 0)
1262		return (0);
1263
1264	if (n != 1 && n != 4)
1265		panic("decode_immediate: invalid imm_bytes %d", n);
1266
1267	for (i = 0; i < n; i++) {
1268		if (vie_peek(vie, &x))
1269			return (-1);
1270
1271		u.buf[i] = x;
1272		vie_advance(vie);
1273	}
1274
1275	if (n == 1)
1276		vie->immediate = u.signed8;		/* sign-extended */
1277	else
1278		vie->immediate = u.signed32;		/* sign-extended */
1279
1280	return (0);
1281}
1282
1283/*
1284 * Verify that all the bytes in the instruction buffer were consumed.
1285 */
1286static int
1287verify_inst_length(struct vie *vie)
1288{
1289
1290	if (vie->num_processed == vie->num_valid)
1291		return (0);
1292	else
1293		return (-1);
1294}
1295
1296/*
1297 * Verify that the 'guest linear address' provided as collateral of the nested
1298 * page table fault matches with our instruction decoding.
1299 */
1300static int
1301verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
1302{
1303	int error;
1304	uint64_t base, idx;
1305
1306	/* Skip 'gla' verification */
1307	if (gla == VIE_INVALID_GLA)
1308		return (0);
1309
1310	base = 0;
1311	if (vie->base_register != VM_REG_LAST) {
1312		error = vm_get_register(vm, cpuid, vie->base_register, &base);
1313		if (error) {
1314			printf("verify_gla: error %d getting base reg %d\n",
1315				error, vie->base_register);
1316			return (-1);
1317		}
1318
1319		/*
1320		 * RIP-relative addressing starts from the following
1321		 * instruction
1322		 */
1323		if (vie->base_register == VM_REG_GUEST_RIP)
1324			base += vie->num_valid;
1325	}
1326
1327	idx = 0;
1328	if (vie->index_register != VM_REG_LAST) {
1329		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
1330		if (error) {
1331			printf("verify_gla: error %d getting index reg %d\n",
1332				error, vie->index_register);
1333			return (-1);
1334		}
1335	}
1336
1337	if (base + vie->scale * idx + vie->displacement != gla) {
1338		printf("verify_gla mismatch: "
1339		       "base(0x%0lx), scale(%d), index(0x%0lx), "
1340		       "disp(0x%0lx), gla(0x%0lx)\n",
1341		       base, vie->scale, idx, vie->displacement, gla);
1342		return (-1);
1343	}
1344
1345	return (0);
1346}
1347
1348int
1349vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
1350		       enum vm_cpu_mode cpu_mode, struct vie *vie)
1351{
1352
1353	if (cpu_mode == CPU_MODE_64BIT) {
1354		if (decode_rex(vie))
1355			return (-1);
1356	}
1357
1358	if (decode_opcode(vie))
1359		return (-1);
1360
1361	if (decode_modrm(vie, cpu_mode))
1362		return (-1);
1363
1364	if (decode_sib(vie))
1365		return (-1);
1366
1367	if (decode_displacement(vie))
1368		return (-1);
1369
1370	if (decode_immediate(vie))
1371		return (-1);
1372
1373	if (verify_inst_length(vie))
1374		return (-1);
1375
1376	if (verify_gla(vm, cpuid, gla, vie))
1377		return (-1);
1378
1379	vie->decoded = 1;	/* success */
1380
1381	return (0);
1382}
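/*
 * Putting it together: in 64-bit mode the instruction bytes 48 89 08
 * ("mov %rcx,(%rax)") decode as REX.W (48), opcode 89 and ModRM 08
 * (mod = 0, reg = 1 -> %rcx, rm = 0 -> %rax), leaving rex_w set so
 * emulate_mov() performs an 8-byte write to the faulting address.
 */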
1383#endif	/* _KERNEL */
1384