1/*-
2 * Copyright (c) 2012 Sandvine, Inc.
3 * Copyright (c) 2012 NetApp, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD$
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD$");
32
33#ifdef _KERNEL
34#include <sys/param.h>
35#include <sys/pcpu.h>
36#include <sys/systm.h>
37#include <sys/proc.h>
38
39#include <vm/vm.h>
40#include <vm/pmap.h>
41
42#include <machine/vmparam.h>
43#include <machine/vmm.h>
44#else	/* !_KERNEL */
45#include <sys/types.h>
46#include <sys/errno.h>
47#include <sys/_iovec.h>
48
49#include <machine/vmm.h>
50
51#include <assert.h>
52#include <vmmapi.h>
53#define	KASSERT(exp,msg)	assert((exp))
54#endif	/* _KERNEL */
55
56#include <machine/vmm_instruction_emul.h>
57#include <x86/psl.h>
58#include <x86/specialreg.h>
59
60/* struct vie_op.op_type */
61enum {
62	VIE_OP_TYPE_NONE = 0,
63	VIE_OP_TYPE_MOV,
64	VIE_OP_TYPE_MOVSX,
65	VIE_OP_TYPE_MOVZX,
66	VIE_OP_TYPE_AND,
67	VIE_OP_TYPE_OR,
68	VIE_OP_TYPE_SUB,
69	VIE_OP_TYPE_TWO_BYTE,
70	VIE_OP_TYPE_PUSH,
71	VIE_OP_TYPE_CMP,
72	VIE_OP_TYPE_LAST
73};
74
75/* struct vie_op.op_flags */
76#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
77#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
78#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
79#define	VIE_OP_F_NO_MODRM	(1 << 3)
80
81static const struct vie_op two_byte_opcodes[256] = {
82	[0xB6] = {
83		.op_byte = 0xB6,
84		.op_type = VIE_OP_TYPE_MOVZX,
85	},
86	[0xB7] = {
87		.op_byte = 0xB7,
88		.op_type = VIE_OP_TYPE_MOVZX,
89	},
90	[0xBE] = {
91		.op_byte = 0xBE,
92		.op_type = VIE_OP_TYPE_MOVSX,
93	},
94};
95
96static const struct vie_op one_byte_opcodes[256] = {
97	[0x0F] = {
98		.op_byte = 0x0F,
99		.op_type = VIE_OP_TYPE_TWO_BYTE
100	},
101	[0x2B] = {
102		.op_byte = 0x2B,
103		.op_type = VIE_OP_TYPE_SUB,
104	},
105	[0x3B] = {
106		.op_byte = 0x3B,
107		.op_type = VIE_OP_TYPE_CMP,
108	},
109	[0x88] = {
110		.op_byte = 0x88,
111		.op_type = VIE_OP_TYPE_MOV,
112	},
113	[0x89] = {
114		.op_byte = 0x89,
115		.op_type = VIE_OP_TYPE_MOV,
116	},
117	[0x8A] = {
118		.op_byte = 0x8A,
119		.op_type = VIE_OP_TYPE_MOV,
120	},
121	[0x8B] = {
122		.op_byte = 0x8B,
123		.op_type = VIE_OP_TYPE_MOV,
124	},
125	[0xA1] = {
126		.op_byte = 0xA1,
127		.op_type = VIE_OP_TYPE_MOV,
128		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
129	},
130	[0xA3] = {
131		.op_byte = 0xA3,
132		.op_type = VIE_OP_TYPE_MOV,
133		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
134	},
135	[0xC6] = {
136		/* XXX Group 11 extended opcode - not just MOV */
137		.op_byte = 0xC6,
138		.op_type = VIE_OP_TYPE_MOV,
139		.op_flags = VIE_OP_F_IMM8,
140	},
141	[0xC7] = {
142		.op_byte = 0xC7,
143		.op_type = VIE_OP_TYPE_MOV,
144		.op_flags = VIE_OP_F_IMM,
145	},
146	[0x23] = {
147		.op_byte = 0x23,
148		.op_type = VIE_OP_TYPE_AND,
149	},
150	[0x81] = {
151		/* XXX Group 1 extended opcode - not just AND */
152		.op_byte = 0x81,
153		.op_type = VIE_OP_TYPE_AND,
154		.op_flags = VIE_OP_F_IMM,
155	},
156	[0x83] = {
157		/* XXX Group 1 extended opcode - not just OR */
158		.op_byte = 0x83,
159		.op_type = VIE_OP_TYPE_OR,
160		.op_flags = VIE_OP_F_IMM8,
161	},
162	[0xFF] = {
163		/* XXX Group 5 extended opcode - not just PUSH */
164		.op_byte = 0xFF,
165		.op_type = VIE_OP_TYPE_PUSH,
166	}
167};
168
169/* struct vie.mod */
170#define	VIE_MOD_INDIRECT		0
171#define	VIE_MOD_INDIRECT_DISP8		1
172#define	VIE_MOD_INDIRECT_DISP32		2
173#define	VIE_MOD_DIRECT			3
174
175/* struct vie.rm */
176#define	VIE_RM_SIB			4
177#define	VIE_RM_DISP32			5
178
179#define	GB				(1024 * 1024 * 1024)
180
181static enum vm_reg_name gpr_map[16] = {
182	VM_REG_GUEST_RAX,
183	VM_REG_GUEST_RCX,
184	VM_REG_GUEST_RDX,
185	VM_REG_GUEST_RBX,
186	VM_REG_GUEST_RSP,
187	VM_REG_GUEST_RBP,
188	VM_REG_GUEST_RSI,
189	VM_REG_GUEST_RDI,
190	VM_REG_GUEST_R8,
191	VM_REG_GUEST_R9,
192	VM_REG_GUEST_R10,
193	VM_REG_GUEST_R11,
194	VM_REG_GUEST_R12,
195	VM_REG_GUEST_R13,
196	VM_REG_GUEST_R14,
197	VM_REG_GUEST_R15
198};
199
200static uint64_t size2mask[] = {
201	[1] = 0xff,
202	[2] = 0xffff,
203	[4] = 0xffffffff,
204	[8] = 0xffffffffffffffff,
205};
206
207static int
208vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
209{
210	int error;
211
212	error = vm_get_register(vm, vcpuid, reg, rval);
213
214	return (error);
215}
216
217static void
218vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
219{
220	*lhbr = 0;
221	*reg = gpr_map[vie->reg];
222
223	/*
224	 * 64-bit mode imposes limitations on accessing legacy high byte
225	 * registers (lhbr).
226	 *
227	 * The legacy high-byte registers cannot be addressed if the REX
228	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
229	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
230	 *
231	 * If the REX prefix is not present then the values 4, 5, 6 and 7
232	 * of the 'ModRM:reg' field address the legacy high-byte registers,
233	 * %ah, %ch, %dh and %bh respectively.
234	 */
235	if (!vie->rex_present) {
236		if (vie->reg & 0x4) {
237			*lhbr = 1;
238			*reg = gpr_map[vie->reg & 0x3];
239		}
240	}
241}
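
/*
 * Worked example (illustrative only): "mov %ah, (%rbx)" is encoded without
 * a REX prefix and with ModRM:reg = 4, so vie_calc_bytereg() reports
 * lhbr = 1 and reg = VM_REG_GUEST_RAX (gpr_map[4 & 0x3]); the byte value is
 * then bits 15:8 of %rax. Had a REX prefix been present, the same reg value
 * would have selected %spl instead.
 */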
242
243static int
244vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
245{
246	uint64_t val;
247	int error, lhbr;
248	enum vm_reg_name reg;
249
250	vie_calc_bytereg(vie, &reg, &lhbr);
251	error = vm_get_register(vm, vcpuid, reg, &val);
252
253	/*
254	 * To obtain the value of a legacy high byte register shift the
255	 * base register right by 8 bits (%ah = %rax >> 8).
256	 */
257	if (lhbr)
258		*rval = val >> 8;
259	else
260		*rval = val;
261	return (error);
262}
263
264static int
265vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
266{
267	uint64_t origval, val, mask;
268	int error, lhbr;
269	enum vm_reg_name reg;
270
271	vie_calc_bytereg(vie, &reg, &lhbr);
272	error = vm_get_register(vm, vcpuid, reg, &origval);
273	if (error == 0) {
274		val = byte;
275		mask = 0xff;
276		if (lhbr) {
277			/*
278			 * Shift left by 8 to store 'byte' in a legacy high
279			 * byte register.
280			 */
281			val <<= 8;
282			mask <<= 8;
283		}
284		val |= origval & ~mask;
285		error = vm_set_register(vm, vcpuid, reg, val);
286	}
287	return (error);
288}
289
290int
291vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
292		    uint64_t val, int size)
293{
294	int error;
295	uint64_t origval;
296
297	switch (size) {
298	case 1:
299	case 2:
300		error = vie_read_register(vm, vcpuid, reg, &origval);
301		if (error)
302			return (error);
303		val &= size2mask[size];
304		val |= origval & ~size2mask[size];
305		break;
306	case 4:
307		val &= 0xffffffffUL;
308		break;
309	case 8:
310		break;
311	default:
312		return (EINVAL);
313	}
314
315	error = vm_set_register(vm, vcpuid, reg, val);
316	return (error);
317}
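
/*
 * Illustrative sketch, guarded out of the build: the read-modify-write
 * masking above applied to a 2-byte destination. A 4-byte destination, by
 * contrast, is truncated and written whole, which zero-extends it into the
 * upper half of the register. The helper name and example values are
 * hypothetical.
 */
#if 0
static uint64_t
example_merge_ax(uint64_t origval, uint16_t newval)
{
	/* e.g. origval 0x1122334455667788, newval 0xBEEF -> 0x112233445566BEEF */
	return ((newval & size2mask[2]) | (origval & ~size2mask[2]));
}
#endif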
318
319/*
320 * Return the status flags that would result from doing (x - y).
321 */
322static u_long
323getcc16(uint16_t x, uint16_t y)
324{
325	u_long rflags;
326
327	__asm __volatile("sub %1,%2; pushfq; popq %0" :
328	    "=r" (rflags) : "m" (y), "r" (x));
329	return (rflags);
330}
331
332static u_long
333getcc32(uint32_t x, uint32_t y)
334{
335	u_long rflags;
336
337	__asm __volatile("sub %1,%2; pushfq; popq %0" :
338	    "=r" (rflags) : "m" (y), "r" (x));
339	return (rflags);
340}
341
342static u_long
343getcc64(uint64_t x, uint64_t y)
344{
345	u_long rflags;
346
347	__asm __volatile("sub %1,%2; pushfq; popq %0" :
348	    "=r" (rflags) : "m" (y), "r" (x));
349	return (rflags);
350}
351
352static u_long
353getcc(int opsize, uint64_t x, uint64_t y)
354{
355	KASSERT(opsize == 2 || opsize == 4 || opsize == 8,
356	    ("getcc: invalid operand size %d", opsize));
357
358	if (opsize == 2)
359		return (getcc16(x, y));
360	else if (opsize == 4)
361		return (getcc32(x, y));
362	else
363		return (getcc64(x, y));
364}
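
/*
 * Portable sketch of what getcc() computes, guarded out of the build and
 * covering only the carry, zero and sign flags; the inline assembly above
 * is used instead so that all six status flags (including PF, AF and OF)
 * come out exactly as they would for the guest's own SUB instruction. The
 * helper name is hypothetical.
 */
#if 0
static u_long
getcc_sketch(uint64_t x, uint64_t y)
{
	uint64_t diff = x - y;
	u_long rflags = 0;

	if (y > x)
		rflags |= PSL_C;	/* unsigned borrow */
	if (diff == 0)
		rflags |= PSL_Z;
	if (diff & (1UL << 63))
		rflags |= PSL_N;	/* sign flag */
	return (rflags);
}
#endif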
365
366static int
367emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
368	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
369{
370	int error, size;
371	enum vm_reg_name reg;
372	uint8_t byte;
373	uint64_t val;
374
375	size = vie->opsize;
376	error = EINVAL;
377
378	switch (vie->op.op_byte) {
379	case 0x88:
380		/*
381		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
382		 * 88/r:	mov r/m8, r8
383		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
384		 */
385		size = 1;	/* override for byte operation */
386		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
387		if (error == 0)
388			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
389		break;
390	case 0x89:
391		/*
392		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
393		 * 89/r:	mov r/m16, r16
394		 * 89/r:	mov r/m32, r32
395		 * REX.W + 89/r	mov r/m64, r64
396		 */
397		reg = gpr_map[vie->reg];
398		error = vie_read_register(vm, vcpuid, reg, &val);
399		if (error == 0) {
400			val &= size2mask[size];
401			error = memwrite(vm, vcpuid, gpa, val, size, arg);
402		}
403		break;
404	case 0x8A:
405		/*
406		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
407		 * 8A/r:	mov r8, r/m8
408		 * REX + 8A/r:	mov r8, r/m8
409		 */
410		size = 1;	/* override for byte operation */
411		error = memread(vm, vcpuid, gpa, &val, size, arg);
412		if (error == 0)
413			error = vie_write_bytereg(vm, vcpuid, vie, val);
414		break;
415	case 0x8B:
416		/*
417		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
418		 * 8B/r:	mov r16, r/m16
419		 * 8B/r:	mov r32, r/m32
420		 * REX.W 8B/r:	mov r64, r/m64
421		 */
422		error = memread(vm, vcpuid, gpa, &val, size, arg);
423		if (error == 0) {
424			reg = gpr_map[vie->reg];
425			error = vie_update_register(vm, vcpuid, reg, val, size);
426		}
427		break;
428	case 0xA1:
429		/*
430		 * MOV from seg:moffset to AX/EAX/RAX
431		 * A1:		mov AX, moffs16
432		 * A1:		mov EAX, moffs32
433		 * REX.W + A1:	mov RAX, moffs64
434		 */
435		error = memread(vm, vcpuid, gpa, &val, size, arg);
436		if (error == 0) {
437			reg = VM_REG_GUEST_RAX;
438			error = vie_update_register(vm, vcpuid, reg, val, size);
439		}
440		break;
441	case 0xA3:
442		/*
443		 * MOV from AX/EAX/RAX to seg:moffset
444		 * A3:		mov moffs16, AX
445		 * A3:		mov moffs32, EAX
446		 * REX.W + A3:	mov moffs64, RAX
447		 */
448		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
449		if (error == 0) {
450			val &= size2mask[size];
451			error = memwrite(vm, vcpuid, gpa, val, size, arg);
452		}
453		break;
454	case 0xC6:
455		/*
456		 * MOV from imm8 to mem (ModRM:r/m)
457		 * C6/0		mov r/m8, imm8
458		 * REX + C6/0	mov r/m8, imm8
459		 */
460		size = 1;	/* override for byte operation */
461		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
462		break;
463	case 0xC7:
464		/*
465		 * MOV from imm16/imm32 to mem (ModRM:r/m)
466		 * C7/0		mov r/m16, imm16
467		 * C7/0		mov r/m32, imm32
468		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
469		 */
470		val = vie->immediate & size2mask[size];
471		error = memwrite(vm, vcpuid, gpa, val, size, arg);
472		break;
473	default:
474		break;
475	}
476
477	return (error);
478}
479
480static int
481emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
482	     mem_region_read_t memread, mem_region_write_t memwrite,
483	     void *arg)
484{
485	int error, size;
486	enum vm_reg_name reg;
487	uint64_t val;
488
489	size = vie->opsize;
490	error = EINVAL;
491
492	switch (vie->op.op_byte) {
493	case 0xB6:
494		/*
495		 * MOV and zero extend byte from mem (ModRM:r/m) to
496		 * reg (ModRM:reg).
497		 *
498		 * 0F B6/r		movzx r16, r/m8
499		 * 0F B6/r		movzx r32, r/m8
500		 * REX.W + 0F B6/r	movzx r64, r/m8
501		 */
502
503		/* get the first operand */
504		error = memread(vm, vcpuid, gpa, &val, 1, arg);
505		if (error)
506			break;
507
508		/* get the second operand */
509		reg = gpr_map[vie->reg];
510
511		/* zero-extend byte */
512		val = (uint8_t)val;
513
514		/* write the result */
515		error = vie_update_register(vm, vcpuid, reg, val, size);
516		break;
517	case 0xB7:
518		/*
519		 * MOV and zero extend word from mem (ModRM:r/m) to
520		 * reg (ModRM:reg).
521		 *
522		 * 0F B7/r		movzx r32, r/m16
523		 * REX.W + 0F B7/r	movzx r64, r/m16
524		 */
525		error = memread(vm, vcpuid, gpa, &val, 2, arg);
526		if (error)
527			return (error);
528
529		reg = gpr_map[vie->reg];
530
531		/* zero-extend word */
532		val = (uint16_t)val;
533
534		error = vie_update_register(vm, vcpuid, reg, val, size);
535		break;
536	case 0xBE:
537		/*
538		 * MOV and sign extend byte from mem (ModRM:r/m) to
539		 * reg (ModRM:reg).
540		 *
541		 * 0F BE/r		movsx r16, r/m8
542		 * 0F BE/r		movsx r32, r/m8
543		 * REX.W + 0F BE/r	movsx r64, r/m8
544		 */
545
546		/* get the first operand */
547		error = memread(vm, vcpuid, gpa, &val, 1, arg);
548		if (error)
549			break;
550
551		/* get the second operand */
552		reg = gpr_map[vie->reg];
553
554		/* sign extend byte */
555		val = (int8_t)val;
556
557		/* write the result */
558		error = vie_update_register(vm, vcpuid, reg, val, size);
559		break;
560	default:
561		break;
562	}
563	return (error);
564}
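
/*
 * Illustrative sketch, guarded out of the build: the casts used above.
 * Reading the byte 0xF0 from memory, the movzx cases leave the upper bits
 * clear while the movsx case propagates the sign bit.
 */
#if 0
	uint64_t zx = (uint8_t)0xF0;	/* movzx: 0x00000000000000F0 */
	uint64_t sx = (int8_t)0xF0;	/* movsx: 0xFFFFFFFFFFFFFFF0 */
#endif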
565
566static int
567emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
568	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
569{
570	int error, size;
571	enum vm_reg_name reg;
572	uint64_t val1, val2;
573
574	size = vie->opsize;
575	error = EINVAL;
576
577	switch (vie->op.op_byte) {
578	case 0x23:
579		/*
580		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
581		 * result in reg.
582		 *
583		 * 23/r		and r16, r/m16
584		 * 23/r		and r32, r/m32
585		 * REX.W + 23/r	and r64, r/m64
586		 */
587
588		/* get the first operand */
589		reg = gpr_map[vie->reg];
590		error = vie_read_register(vm, vcpuid, reg, &val1);
591		if (error)
592			break;
593
594		/* get the second operand */
595		error = memread(vm, vcpuid, gpa, &val2, size, arg);
596		if (error)
597			break;
598
599		/* perform the operation and write the result */
600		val1 &= val2;
601		error = vie_update_register(vm, vcpuid, reg, val1, size);
602		break;
603	case 0x81:
604		/*
605		 * AND/OR mem (ModRM:r/m) with immediate and store the
606		 * result in mem.
607		 *
608		 * AND: i = 4
609		 * OR:  i = 1
610		 * 81 /i		op r/m16, imm16
611		 * 81 /i		op r/m32, imm32
612		 * REX.W + 81 /i	op r/m64, imm32 sign-extended to 64
613		 *
614		 */
615
616		/* get the first operand */
617		error = memread(vm, vcpuid, gpa, &val1, size, arg);
618		if (error)
619			break;
620
621		/*
622		 * perform the operation with the pre-fetched immediate
623		 * operand and write the result
624		 */
625		switch (vie->reg & 7) {
626		case 0x4:
627			/* modrm:reg == b100, AND */
628			val1 &= vie->immediate;
629			break;
630		case 0x1:
631			/* modrm:reg == b001, OR */
632			val1 |= vie->immediate;
633			break;
634		default:
635			error = EINVAL;
636			break;
637		}
638		if (error)
639			break;
640
641		error = memwrite(vm, vcpuid, gpa, val1, size, arg);
642		break;
643	default:
644		break;
645	}
646	return (error);
647}
648
649static int
650emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
651	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
652{
653	int error, size;
654	uint64_t val1;
655
656	size = vie->opsize;
657	error = EINVAL;
658
659	switch (vie->op.op_byte) {
660	case 0x83:
661		/*
662		 * OR mem (ModRM:r/m) with immediate and store the
663		 * result in mem.
664		 *
665		 * 83 /1		OR r/m16, imm8 sign-extended to 16
666		 * 83 /1		OR r/m32, imm8 sign-extended to 32
667		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
668		 *
669		 * Currently, only the OR operation of the 0x83 opcode
670		 * is implemented (ModRM:reg = b001).
671		 */
672		if ((vie->reg & 7) != 1)
673			break;
674
675		/* get the first operand */
676		error = memread(vm, vcpuid, gpa, &val1, size, arg);
677		if (error)
678			break;
679
680		/*
681		 * perform the operation with the pre-fetched immediate
682		 * operand and write the result
683		 */
684		val1 |= vie->immediate;
685		error = memwrite(vm, vcpuid, gpa, val1, size, arg);
686		break;
687	default:
688		break;
689	}
690	return (error);
691}
692
693#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
694
695static int
696emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
697	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
698{
699	int error, size;
700	uint64_t op1, op2, rflags, rflags2;
701	enum vm_reg_name reg;
702
703	size = vie->opsize;
704	switch (vie->op.op_byte) {
705	case 0x3B:
706		/*
707		 * 3B/r		CMP r16, r/m16
708		 * 3B/r		CMP r32, r/m32
709		 * REX.W + 3B/r	CMP r64, r/m64
710		 *
711		 * Compare first operand (reg) with second operand (r/m) and
712		 * set status flags in EFLAGS register. The comparison is
713		 * performed by subtracting the second operand from the first
714		 * operand and then setting the status flags.
715		 */
716
717		/* Get the first operand */
718		reg = gpr_map[vie->reg];
719		error = vie_read_register(vm, vcpuid, reg, &op1);
720		if (error)
721			return (error);
722
723		/* Get the second operand */
724		error = memread(vm, vcpuid, gpa, &op2, size, arg);
725		if (error)
726			return (error);
727
728		break;
729	default:
730		return (EINVAL);
731	}
732	rflags2 = getcc(size, op1, op2);
733	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
734	if (error)
735		return (error);
736	rflags &= ~RFLAGS_STATUS_BITS;
737	rflags |= rflags2 & RFLAGS_STATUS_BITS;
738
739	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
740	return (error);
741}
742
743static int
744emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
745	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
746{
747	int error, size;
748	uint64_t nval, rflags, rflags2, val1, val2;
749	enum vm_reg_name reg;
750
751	size = vie->opsize;
752	error = EINVAL;
753
754	switch (vie->op.op_byte) {
755	case 0x2B:
756		/*
757		 * SUB r/m from r and store the result in r
758		 *
759		 * 2B/r            SUB r16, r/m16
760		 * 2B/r            SUB r32, r/m32
761		 * REX.W + 2B/r    SUB r64, r/m64
762		 */
763
764		/* get the first operand */
765		reg = gpr_map[vie->reg];
766		error = vie_read_register(vm, vcpuid, reg, &val1);
767		if (error)
768			break;
769
770		/* get the second operand */
771		error = memread(vm, vcpuid, gpa, &val2, size, arg);
772		if (error)
773			break;
774
775		/* perform the operation and write the result */
776		nval = val1 - val2;
777		error = vie_update_register(vm, vcpuid, reg, nval, size);
778		break;
779	default:
780		break;
781	}
782
783	if (!error) {
784		rflags2 = getcc(size, val1, val2);
785		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
786		    &rflags);
787		if (error)
788			return (error);
789
790		rflags &= ~RFLAGS_STATUS_BITS;
791		rflags |= rflags2 & RFLAGS_STATUS_BITS;
792		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
793		    rflags, 8);
794	}
795
796	return (error);
797}
798
799static int
800emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
801    struct vm_guest_paging *paging, mem_region_read_t memread,
802    mem_region_write_t memwrite, void *arg)
803{
804#ifdef _KERNEL
805	struct vm_copyinfo copyinfo[2];
806#else
807	struct iovec copyinfo[2];
808#endif
809	struct seg_desc ss_desc;
810	uint64_t cr0, rflags, rsp, stack_gla, val;
811	int error, size, stackaddrsize;
812
813	/*
814	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
815	 *
816	 * PUSH is part of the group 5 extended opcodes and is identified
817	 * by ModRM:reg = b110.
818	 */
819	if ((vie->reg & 7) != 6)
820		return (EINVAL);
821
822	size = vie->opsize;
823	/*
824	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
825	 */
826	if (paging->cpu_mode == CPU_MODE_REAL) {
827		stackaddrsize = 2;
828	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
829		/*
830		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
831		 * - Stack pointer size is always 64-bits.
832		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
833		 * - 16-bit PUSH/POP is supported by using the operand size
834		 *   override prefix (66H).
835		 */
836		stackaddrsize = 8;
837		size = vie->opsize_override ? 2 : 8;
838	} else {
839		/*
840		 * In protected or compatibility mode the 'B' flag in the
841		 * stack-segment descriptor determines the size of the
842		 * stack pointer.
843		 */
844		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
845		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
846		    __func__, error));
847		if (SEG_DESC_DEF32(ss_desc.access))
848			stackaddrsize = 4;
849		else
850			stackaddrsize = 2;
851	}
852
853	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
854	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
855
856	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
857	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
858
859	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
860	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
861
862	rsp -= size;
863	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
864	    rsp, size, stackaddrsize, PROT_WRITE, &stack_gla)) {
865		vm_inject_ss(vm, vcpuid, 0);
866		return (0);
867	}
868
869	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
870		vm_inject_ss(vm, vcpuid, 0);
871		return (0);
872	}
873
874	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
875		vm_inject_ac(vm, vcpuid, 0);
876		return (0);
877	}
878
879	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, PROT_WRITE,
880	    copyinfo, nitems(copyinfo));
881	if (error == -1) {
882		/*
883		 * XXX cannot return a negative error value here because it
884		 * ends up being the return value of the VM_RUN() ioctl and
885		 * is interpreted as a pseudo-error (e.g. ERESTART).
886		 */
887		return (EFAULT);
888	} else if (error == 1) {
889		/* Resume guest execution to handle page fault */
890		return (0);
891	}
892
893	error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
894	if (error == 0) {
895		vm_copyout(vm, vcpuid, &val, copyinfo, size);
896		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
897		    stackaddrsize);
898		KASSERT(error == 0, ("error %d updating rsp", error));
899	}
900#ifdef _KERNEL
901	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
902#endif
903	return (error);
904}
905
906int
907vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
908    struct vm_guest_paging *paging, mem_region_read_t memread,
909    mem_region_write_t memwrite, void *memarg)
910{
911	int error;
912
913	if (!vie->decoded)
914		return (EINVAL);
915
916	switch (vie->op.op_type) {
917	case VIE_OP_TYPE_PUSH:
918		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
919		    memwrite, memarg);
920		break;
921	case VIE_OP_TYPE_CMP:
922		error = emulate_cmp(vm, vcpuid, gpa, vie,
923				    memread, memwrite, memarg);
924		break;
925	case VIE_OP_TYPE_MOV:
926		error = emulate_mov(vm, vcpuid, gpa, vie,
927				    memread, memwrite, memarg);
928		break;
929	case VIE_OP_TYPE_MOVSX:
930	case VIE_OP_TYPE_MOVZX:
931		error = emulate_movx(vm, vcpuid, gpa, vie,
932				     memread, memwrite, memarg);
933		break;
934	case VIE_OP_TYPE_AND:
935		error = emulate_and(vm, vcpuid, gpa, vie,
936				    memread, memwrite, memarg);
937		break;
938	case VIE_OP_TYPE_OR:
939		error = emulate_or(vm, vcpuid, gpa, vie,
940				    memread, memwrite, memarg);
941		break;
942	case VIE_OP_TYPE_SUB:
943		error = emulate_sub(vm, vcpuid, gpa, vie,
944				    memread, memwrite, memarg);
945		break;
946	default:
947		error = EINVAL;
948		break;
949	}
950
951	return (error);
952}
953
954int
955vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
956{
957	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
958	    ("%s: invalid size %d", __func__, size));
959	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
960
961	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
962		return (0);
963
964	return ((gla & (size - 1)) ? 1 : 0);
965}
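
/*
 * Illustrative sketch, guarded out of the build: with CR0.AM and RFLAGS.AC
 * both set, a 4-byte access at a non-4-byte-aligned address triggers an
 * alignment check only when the guest is running at CPL 3.
 */
#if 0
	KASSERT(vie_alignment_check(3, 4, CR0_AM, PSL_AC, 0x1002) == 1,
	    ("expected #AC for a misaligned user-mode access"));
	KASSERT(vie_alignment_check(0, 4, CR0_AM, PSL_AC, 0x1002) == 0,
	    ("alignment checking applies only at CPL 3"));
#endif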
966
967int
968vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
969{
970	uint64_t mask;
971
972	if (cpu_mode != CPU_MODE_64BIT)
973		return (0);
974
975	/*
976	 * The value of bit 47 in 'gla' must be replicated in the
977	 * most significant 16 bits.
978	 */
979	mask = ~((1UL << 48) - 1);
980	if (gla & (1UL << 47))
981		return ((gla & mask) != mask);
982	else
983		return ((gla & mask) != 0);
984}
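
/*
 * Worked example (illustrative only), guarded out of the build:
 * 0x00007fffffffffff and 0xffff800000000000 are canonical because bits
 * 63:48 replicate bit 47, whereas 0x0000800000000000 is not; callers such
 * as emulate_push() turn a non-zero return into an injected #SS or #GP.
 */
#if 0
	KASSERT(vie_canonical_check(CPU_MODE_64BIT, 0x00007fffffffffffUL) == 0,
	    ("highest canonical user address"));
	KASSERT(vie_canonical_check(CPU_MODE_64BIT, 0xffff800000000000UL) == 0,
	    ("lowest canonical kernel address"));
	KASSERT(vie_canonical_check(CPU_MODE_64BIT, 0x0000800000000000UL) != 0,
	    ("non-canonical address"));
#endif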
985
986uint64_t
987vie_size2mask(int size)
988{
989	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
990	    ("vie_size2mask: invalid size %d", size));
991	return (size2mask[size]);
992}
993
994int
995vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
996    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
997    int prot, uint64_t *gla)
998{
999	uint64_t firstoff, low_limit, high_limit, segbase;
1000	int glasize, type;
1001
1002	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1003	    ("%s: invalid segment %d", __func__, seg));
1004	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1005	    ("%s: invalid operand size %d", __func__, length));
1006	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1007	    ("%s: invalid prot %#x", __func__, prot));
1008
1009	firstoff = offset;
1010	if (cpu_mode == CPU_MODE_64BIT) {
1011		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1012		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1013		glasize = 8;
1014	} else {
1015		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1016		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1017		glasize = 4;
1018		/*
1019		 * If the segment selector is loaded with a NULL selector
1020		 * then the descriptor is unusable and attempting to use
1021		 * it results in a #GP(0).
1022		 */
1023		if (SEG_DESC_UNUSABLE(desc->access))
1024			return (-1);
1025
1026		/*
1027		 * The processor generates a #NP exception when a segment
1028		 * register is loaded with a selector that points to a
1029		 * descriptor that is not present. If this was the case then
1030		 * it would have been checked before the VM-exit.
1031		 */
1032		KASSERT(SEG_DESC_PRESENT(desc->access),
1033		    ("segment %d not present: %#x", seg, desc->access));
1034
1035		/*
1036		 * The descriptor type must indicate a code/data segment.
1037		 */
1038		type = SEG_DESC_TYPE(desc->access);
1039		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1040		    "descriptor type %#x", seg, type));
1041
1042		if (prot & PROT_READ) {
1043			/* #GP on a read access to an execute-only code segment */
1044			if ((type & 0xA) == 0x8)
1045				return (-1);
1046		}
1047
1048		if (prot & PROT_WRITE) {
1049			/*
1050			 * #GP on a write access to a code segment or a
1051			 * read-only data segment.
1052			 */
1053			if (type & 0x8)			/* code segment */
1054				return (-1);
1055
1056			if ((type & 0xA) == 0)		/* read-only data seg */
1057				return (-1);
1058		}
1059
1060		/*
1061		 * 'desc->limit' is fully expanded taking granularity into
1062		 * account.
1063		 */
1064		if ((type & 0xC) == 0x4) {
1065			/* expand-down data segment */
1066			low_limit = desc->limit + 1;
1067			high_limit = SEG_DESC_DEF32(desc->access) ?
1068			    0xffffffff : 0xffff;
1069		} else {
1070			/* code segment or expand-up data segment */
1071			low_limit = 0;
1072			high_limit = desc->limit;
1073		}
1074
1075		while (length > 0) {
1076			offset &= vie_size2mask(addrsize);
1077			if (offset < low_limit || offset > high_limit)
1078				return (-1);
1079			offset++;
1080			length--;
1081		}
1082	}
1083
1084	/*
1085	 * In 64-bit mode all segments except %fs and %gs have a segment
1086	 * base address of 0.
1087	 */
1088	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1089	    seg != VM_REG_GUEST_GS) {
1090		segbase = 0;
1091	} else {
1092		segbase = desc->base;
1093	}
1094
1095	/*
1096	 * Truncate 'firstoff' to the effective address size before adding
1097	 * it to the segment base.
1098	 */
1099	firstoff &= vie_size2mask(addrsize);
1100	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1101	return (0);
1102}
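
/*
 * Worked example (illustrative only): in protected mode, an expand-down
 * data segment ((type & 0xC) == 0x4) with a fully expanded limit of 0x0fff
 * and the big/default bit set accepts offsets 0x1000 through 0xffffffff.
 * A 4-byte access at offset 0x0ffe therefore fails the limit check above
 * and returns -1, while the same access at offset 0x2000 passes.
 */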
1103
1104#ifdef _KERNEL
1105void
1106vie_init(struct vie *vie)
1107{
1108
1109	bzero(vie, sizeof(struct vie));
1110
1111	vie->base_register = VM_REG_LAST;
1112	vie->index_register = VM_REG_LAST;
1113}
1114
1115static int
1116pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1117{
1118	int error_code = 0;
1119
1120	if (pte & PG_V)
1121		error_code |= PGEX_P;
1122	if (prot & VM_PROT_WRITE)
1123		error_code |= PGEX_W;
1124	if (usermode)
1125		error_code |= PGEX_U;
1126	if (rsvd)
1127		error_code |= PGEX_RSV;
1128	if (prot & VM_PROT_EXECUTE)
1129		error_code |= PGEX_I;
1130
1131	return (error_code);
1132}
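
/*
 * Illustrative sketch, guarded out of the build: a user-mode write that
 * faults on a page-table entry with PG_V clear produces PGEX_W | PGEX_U;
 * PGEX_P stays clear because the translation was not valid.
 */
#if 0
	KASSERT(pf_error_code(1, VM_PROT_WRITE, 0, 0) == (PGEX_W | PGEX_U),
	    ("user-mode write to a non-present page"));
#endif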
1133
1134static void
1135ptp_release(void **cookie)
1136{
1137	if (*cookie != NULL) {
1138		vm_gpa_release(*cookie);
1139		*cookie = NULL;
1140	}
1141}
1142
1143static void *
1144ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
1145{
1146	void *ptr;
1147
1148	ptp_release(cookie);
1149	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
1150	return (ptr);
1151}
1152
1153int
1154vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1155    uint64_t gla, int prot, uint64_t *gpa)
1156{
1157	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
1158	u_int retries;
1159	uint64_t *ptpbase, ptpphys, pte, pgsize;
1160	uint32_t *ptpbase32, pte32;
1161	void *cookie;
1162
1163	usermode = (paging->cpl == 3 ? 1 : 0);
1164	writable = prot & VM_PROT_WRITE;
1165	cookie = NULL;
1166	retval = 0;
1167	retries = 0;
1168restart:
1169	ptpphys = paging->cr3;		/* root of the page tables */
1170	ptp_release(&cookie);
1171	if (retries++ > 0)
1172		maybe_yield();
1173
1174	if (vie_canonical_check(paging->cpu_mode, gla)) {
1175		/*
1176		 * XXX assuming a non-stack reference; otherwise a stack fault
1177		 * should be generated.
1178		 */
1179		vm_inject_gp(vm, vcpuid);
1180		goto fault;
1181	}
1182
1183	if (paging->paging_mode == PAGING_MODE_FLAT) {
1184		*gpa = gla;
1185		goto done;
1186	}
1187
1188	if (paging->paging_mode == PAGING_MODE_32) {
1189		nlevels = 2;
1190		while (--nlevels >= 0) {
1191			/* Zero out the lower 12 bits. */
1192			ptpphys &= ~0xfff;
1193
1194			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1195
1196			if (ptpbase32 == NULL)
1197				goto error;
1198
1199			ptpshift = PAGE_SHIFT + nlevels * 10;
1200			ptpindex = (gla >> ptpshift) & 0x3FF;
1201			pgsize = 1UL << ptpshift;
1202
1203			pte32 = ptpbase32[ptpindex];
1204
1205			if ((pte32 & PG_V) == 0 ||
1206			    (usermode && (pte32 & PG_U) == 0) ||
1207			    (writable && (pte32 & PG_RW) == 0)) {
1208				pfcode = pf_error_code(usermode, prot, 0,
1209				    pte32);
1210				vm_inject_pf(vm, vcpuid, pfcode, gla);
1211				goto fault;
1212			}
1213
1214			/*
1215			 * Emulate the x86 MMU's management of the accessed
1216			 * and dirty flags. While the accessed flag is set
1217			 * at every level of the page table, the dirty flag
1218			 * is only set at the last level providing the guest
1219			 * physical address.
1220			 */
1221			if ((pte32 & PG_A) == 0) {
1222				if (atomic_cmpset_32(&ptpbase32[ptpindex],
1223				    pte32, pte32 | PG_A) == 0) {
1224					goto restart;
1225				}
1226			}
1227
1228			/* XXX must be ignored if CR4.PSE=0 */
1229			if (nlevels > 0 && (pte32 & PG_PS) != 0)
1230				break;
1231
1232			ptpphys = pte32;
1233		}
1234
1235		/* Set the dirty bit in the page table entry if necessary */
1236		if (writable && (pte32 & PG_M) == 0) {
1237			if (atomic_cmpset_32(&ptpbase32[ptpindex],
1238			    pte32, pte32 | PG_M) == 0) {
1239				goto restart;
1240			}
1241		}
1242
1243		/* Zero out the lower 'ptpshift' bits */
1244		pte32 >>= ptpshift; pte32 <<= ptpshift;
1245		*gpa = pte32 | (gla & (pgsize - 1));
1246		goto done;
1247	}
1248
1249	if (paging->paging_mode == PAGING_MODE_PAE) {
1250		/* Zero out the lower 5 bits and the upper 32 bits */
1251		ptpphys &= 0xffffffe0UL;
1252
1253		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
1254		if (ptpbase == NULL)
1255			goto error;
1256
1257		ptpindex = (gla >> 30) & 0x3;
1258
1259		pte = ptpbase[ptpindex];
1260
1261		if ((pte & PG_V) == 0) {
1262			pfcode = pf_error_code(usermode, prot, 0, pte);
1263			vm_inject_pf(vm, vcpuid, pfcode, gla);
1264			goto fault;
1265		}
1266
1267		ptpphys = pte;
1268
1269		nlevels = 2;
1270	} else
1271		nlevels = 4;
1272	while (--nlevels >= 0) {
1273		/* Zero out the lower 12 bits and the upper 12 bits */
1274		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
1275
1276		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1277		if (ptpbase == NULL)
1278			goto error;
1279
1280		ptpshift = PAGE_SHIFT + nlevels * 9;
1281		ptpindex = (gla >> ptpshift) & 0x1FF;
1282		pgsize = 1UL << ptpshift;
1283
1284		pte = ptpbase[ptpindex];
1285
1286		if ((pte & PG_V) == 0 ||
1287		    (usermode && (pte & PG_U) == 0) ||
1288		    (writable && (pte & PG_RW) == 0)) {
1289			pfcode = pf_error_code(usermode, prot, 0, pte);
1290			vm_inject_pf(vm, vcpuid, pfcode, gla);
1291			goto fault;
1292		}
1293
1294		/* Set the accessed bit in the page table entry */
1295		if ((pte & PG_A) == 0) {
1296			if (atomic_cmpset_64(&ptpbase[ptpindex],
1297			    pte, pte | PG_A) == 0) {
1298				goto restart;
1299			}
1300		}
1301
1302		if (nlevels > 0 && (pte & PG_PS) != 0) {
1303			if (pgsize > 1 * GB) {
1304				pfcode = pf_error_code(usermode, prot, 1, pte);
1305				vm_inject_pf(vm, vcpuid, pfcode, gla);
1306				goto fault;
1307			}
1308			break;
1309		}
1310
1311		ptpphys = pte;
1312	}
1313
1314	/* Set the dirty bit in the page table entry if necessary */
1315	if (writable && (pte & PG_M) == 0) {
1316		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
1317			goto restart;
1318	}
1319
1320	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
1321	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
1322	*gpa = pte | (gla & (pgsize - 1));
1323done:
1324	ptp_release(&cookie);
1325	return (retval);
1326error:
1327	retval = -1;
1328	goto done;
1329fault:
1330	retval = 1;
1331	goto done;
1332}
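
/*
 * Worked example (illustrative only), guarded out of the build: in 4-level
 * long-mode paging the loop above uses ptpshift = 12 + 9 * nlevels, so the
 * page-table indices for a guest linear address are simply 9-bit fields of
 * that address. A 2MB PG_PS mapping found at the PD level contributes the
 * low 21 bits of the address as the page offset.
 */
#if 0
	uint64_t gla = 0x00007f0012345678UL;
	int pml4_idx = (gla >> 39) & 0x1ff;	/* 254 */
	int pdpt_idx = (gla >> 30) & 0x1ff;	/* 0 */
	int pd_idx   = (gla >> 21) & 0x1ff;	/* 145 */
	int pt_idx   = (gla >> 12) & 0x1ff;	/* 325 */
#endif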
1333
1334int
1335vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1336    uint64_t rip, int inst_length, struct vie *vie)
1337{
1338	struct vm_copyinfo copyinfo[2];
1339	int error, prot;
1340
1341	if (inst_length > VIE_INST_SIZE)
1342		panic("vmm_fetch_instruction: invalid length %d", inst_length);
1343
1344	prot = PROT_READ | PROT_EXEC;
1345	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
1346	    copyinfo, nitems(copyinfo));
1347	if (error == 0) {
1348		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
1349		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1350		vie->num_valid = inst_length;
1351	}
1352	return (error);
1353}
1354
1355static int
1356vie_peek(struct vie *vie, uint8_t *x)
1357{
1358
1359	if (vie->num_processed < vie->num_valid) {
1360		*x = vie->inst[vie->num_processed];
1361		return (0);
1362	} else
1363		return (-1);
1364}
1365
1366static void
1367vie_advance(struct vie *vie)
1368{
1369
1370	vie->num_processed++;
1371}
1372
1373static int
1374decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
1375{
1376	uint8_t x;
1377
1378	while (1) {
1379		if (vie_peek(vie, &x))
1380			return (-1);
1381
1382		if (x == 0x66)
1383			vie->opsize_override = 1;
1384		else if (x == 0x67)
1385			vie->addrsize_override = 1;
1386		else
1387			break;
1388
1389		vie_advance(vie);
1390	}
1391
1392	/*
1393	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
1394	 * - Only one REX prefix is allowed per instruction.
1395	 * - The REX prefix must immediately precede the opcode byte or the
1396	 *   escape opcode byte.
1397	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
1398	 *   the mandatory prefix must come before the REX prefix.
1399	 */
1400	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
1401		vie->rex_present = 1;
1402		vie->rex_w = x & 0x8 ? 1 : 0;
1403		vie->rex_r = x & 0x4 ? 1 : 0;
1404		vie->rex_x = x & 0x2 ? 1 : 0;
1405		vie->rex_b = x & 0x1 ? 1 : 0;
1406		vie_advance(vie);
1407	}
1408
1409	/*
1410	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
1411	 */
1412	if (cpu_mode == CPU_MODE_64BIT) {
1413		/*
1414		 * Default address size is 64-bits and default operand size
1415		 * is 32-bits.
1416		 */
1417		vie->addrsize = vie->addrsize_override ? 4 : 8;
1418		if (vie->rex_w)
1419			vie->opsize = 8;
1420		else if (vie->opsize_override)
1421			vie->opsize = 2;
1422		else
1423			vie->opsize = 4;
1424	} else if (cs_d) {
1425		/* Default address and operand sizes are 32-bits */
1426		vie->addrsize = vie->addrsize_override ? 2 : 4;
1427		vie->opsize = vie->opsize_override ? 2 : 4;
1428	} else {
1429		/* Default address and operand sizes are 16-bits */
1430		vie->addrsize = vie->addrsize_override ? 4 : 2;
1431		vie->opsize = vie->opsize_override ? 4 : 2;
1432	}
1433	return (0);
1434}
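
/*
 * Worked examples (illustrative only) of the size selection above in
 * 64-bit mode: a bare opcode yields addrsize 8 and opsize 4; a REX.W
 * prefix (e.g. 0x48) forces opsize 8; a 0x66 prefix without REX.W selects
 * opsize 2; and a 0x67 prefix drops addrsize to 4. REX.W takes precedence
 * over 0x66 when both are present.
 */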
1435
1436static int
1437decode_two_byte_opcode(struct vie *vie)
1438{
1439	uint8_t x;
1440
1441	if (vie_peek(vie, &x))
1442		return (-1);
1443
1444	vie->op = two_byte_opcodes[x];
1445
1446	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1447		return (-1);
1448
1449	vie_advance(vie);
1450	return (0);
1451}
1452
1453static int
1454decode_opcode(struct vie *vie)
1455{
1456	uint8_t x;
1457
1458	if (vie_peek(vie, &x))
1459		return (-1);
1460
1461	vie->op = one_byte_opcodes[x];
1462
1463	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1464		return (-1);
1465
1466	vie_advance(vie);
1467
1468	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
1469		return (decode_two_byte_opcode(vie));
1470
1471	return (0);
1472}
1473
1474static int
1475decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
1476{
1477	uint8_t x;
1478
1479	if (cpu_mode == CPU_MODE_REAL)
1480		return (-1);
1481
1482	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
1483		return (0);
1484
1485	if (vie_peek(vie, &x))
1486		return (-1);
1487
1488	vie->mod = (x >> 6) & 0x3;
1489	vie->rm =  (x >> 0) & 0x7;
1490	vie->reg = (x >> 3) & 0x7;
1491
1492	/*
1493	 * A direct addressing mode makes no sense in the context of an EPT
1494	 * fault. There has to be a memory access involved to cause the
1495	 * EPT fault.
1496	 */
1497	if (vie->mod == VIE_MOD_DIRECT)
1498		return (-1);
1499
1500	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
1501	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
1502		/*
1503		 * Table 2-5: Special Cases of REX Encodings
1504		 *
1505		 * mod=0, r/m=5 is used in the compatibility mode to
1506		 * indicate a disp32 without a base register.
1507		 *
1508		 * mod!=3, r/m=4 is used in the compatibility mode to
1509		 * indicate that the SIB byte is present.
1510		 *
1511		 * The 'b' bit in the REX prefix is don't care in
1512		 * this case.
1513		 */
1514	} else {
1515		vie->rm |= (vie->rex_b << 3);
1516	}
1517
1518	vie->reg |= (vie->rex_r << 3);
1519
1520	/* SIB */
1521	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
1522		goto done;
1523
1524	vie->base_register = gpr_map[vie->rm];
1525
1526	switch (vie->mod) {
1527	case VIE_MOD_INDIRECT_DISP8:
1528		vie->disp_bytes = 1;
1529		break;
1530	case VIE_MOD_INDIRECT_DISP32:
1531		vie->disp_bytes = 4;
1532		break;
1533	case VIE_MOD_INDIRECT:
1534		if (vie->rm == VIE_RM_DISP32) {
1535			vie->disp_bytes = 4;
1536			/*
1537			 * Table 2-7. RIP-Relative Addressing
1538			 *
1539			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
1540			 * whereas in compatibility mode it just implies disp32.
1541			 */
1542
1543			if (cpu_mode == CPU_MODE_64BIT)
1544				vie->base_register = VM_REG_GUEST_RIP;
1545			else
1546				vie->base_register = VM_REG_LAST;
1547		}
1548		break;
1549	}
1550
1551done:
1552	vie_advance(vie);
1553
1554	return (0);
1555}
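
/*
 * Illustrative sketch, guarded out of the build: the field extraction
 * above applied to the ModRM byte 0x91, which encodes a disp32(%rcx)
 * memory operand with %rdx as the register operand. With REX.R set the
 * register operand would instead be %r10 (reg |= 8).
 */
#if 0
	uint8_t modrm = 0x91;
	int mod = (modrm >> 6) & 0x3;	/* 2: VIE_MOD_INDIRECT_DISP32 */
	int reg = (modrm >> 3) & 0x7;	/* 2: %rdx */
	int rm  = (modrm >> 0) & 0x7;	/* 1: base register %rcx */
#endif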
1556
1557static int
1558decode_sib(struct vie *vie)
1559{
1560	uint8_t x;
1561
1562	/* Proceed only if SIB byte is present */
1563	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
1564		return (0);
1565
1566	if (vie_peek(vie, &x))
1567		return (-1);
1568
1569	/* De-construct the SIB byte */
1570	vie->ss = (x >> 6) & 0x3;
1571	vie->index = (x >> 3) & 0x7;
1572	vie->base = (x >> 0) & 0x7;
1573
1574	/* Apply the REX prefix modifiers */
1575	vie->index |= vie->rex_x << 3;
1576	vie->base |= vie->rex_b << 3;
1577
1578	switch (vie->mod) {
1579	case VIE_MOD_INDIRECT_DISP8:
1580		vie->disp_bytes = 1;
1581		break;
1582	case VIE_MOD_INDIRECT_DISP32:
1583		vie->disp_bytes = 4;
1584		break;
1585	}
1586
1587	if (vie->mod == VIE_MOD_INDIRECT &&
1588	    (vie->base == 5 || vie->base == 13)) {
1589		/*
1590		 * Special case: the base register is unused if mod = 0
1591		 * and base = %rbp or %r13.
1592		 *
1593		 * Documented in:
1594		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1595		 * Table 2-5: Special Cases of REX Encodings
1596		 */
1597		vie->disp_bytes = 4;
1598	} else {
1599		vie->base_register = gpr_map[vie->base];
1600	}
1601
1602	/*
1603	 * All encodings of 'index' are valid except for %rsp (4).
1604	 *
1605	 * Documented in:
1606	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1607	 * Table 2-5: Special Cases of REX Encodings
1608	 */
1609	if (vie->index != 4)
1610		vie->index_register = gpr_map[vie->index];
1611
1612	/* 'scale' makes sense only in the context of an index register */
1613	if (vie->index_register < VM_REG_LAST)
1614		vie->scale = 1 << vie->ss;
1615
1616	vie_advance(vie);
1617
1618	return (0);
1619}
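
/*
 * Illustrative sketch, guarded out of the build: the field extraction
 * above applied to the SIB byte 0x8B, which encodes a base of %rbx, an
 * index of %rcx and a scale of 4. An index field of 4 with REX.X clear
 * leaves index_register at VM_REG_LAST, so no scaled index is applied.
 */
#if 0
	uint8_t sib = 0x8B;
	int ss    = (sib >> 6) & 0x3;	/* 2: scale = 1 << 2 = 4 */
	int index = (sib >> 3) & 0x7;	/* 1: %rcx */
	int base  = (sib >> 0) & 0x7;	/* 3: %rbx */
#endif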
1620
1621static int
1622decode_displacement(struct vie *vie)
1623{
1624	int n, i;
1625	uint8_t x;
1626
1627	union {
1628		char	buf[4];
1629		int8_t	signed8;
1630		int32_t	signed32;
1631	} u;
1632
1633	if ((n = vie->disp_bytes) == 0)
1634		return (0);
1635
1636	if (n != 1 && n != 4)
1637		panic("decode_displacement: invalid disp_bytes %d", n);
1638
1639	for (i = 0; i < n; i++) {
1640		if (vie_peek(vie, &x))
1641			return (-1);
1642
1643		u.buf[i] = x;
1644		vie_advance(vie);
1645	}
1646
1647	if (n == 1)
1648		vie->displacement = u.signed8;		/* sign-extended */
1649	else
1650		vie->displacement = u.signed32;		/* sign-extended */
1651
1652	return (0);
1653}
1654
1655static int
1656decode_immediate(struct vie *vie)
1657{
1658	int i, n;
1659	uint8_t x;
1660	union {
1661		char	buf[4];
1662		int8_t	signed8;
1663		int16_t	signed16;
1664		int32_t	signed32;
1665	} u;
1666
1667	/* Figure out immediate operand size (if any) */
1668	if (vie->op.op_flags & VIE_OP_F_IMM) {
1669		/*
1670		 * Section 2.2.1.5 "Immediates", Intel SDM:
1671		 * In 64-bit mode the typical size of immediate operands
1672		 * remains 32-bits. When the operand size is 64-bits, the
1673		 * processor sign-extends all immediates to 64-bits prior
1674		 * to their use.
1675		 */
1676		if (vie->opsize == 4 || vie->opsize == 8)
1677			vie->imm_bytes = 4;
1678		else
1679			vie->imm_bytes = 2;
1680	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
1681		vie->imm_bytes = 1;
1682	}
1683
1684	if ((n = vie->imm_bytes) == 0)
1685		return (0);
1686
1687	KASSERT(n == 1 || n == 2 || n == 4,
1688	    ("%s: invalid number of immediate bytes: %d", __func__, n));
1689
1690	for (i = 0; i < n; i++) {
1691		if (vie_peek(vie, &x))
1692			return (-1);
1693
1694		u.buf[i] = x;
1695		vie_advance(vie);
1696	}
1697
1698	/* sign-extend the immediate value before use */
1699	if (n == 1)
1700		vie->immediate = u.signed8;
1701	else if (n == 2)
1702		vie->immediate = u.signed16;
1703	else
1704		vie->immediate = u.signed32;
1705
1706	return (0);
1707}
1708
1709static int
1710decode_moffset(struct vie *vie)
1711{
1712	int i, n;
1713	uint8_t x;
1714	union {
1715		char	buf[8];
1716		uint64_t u64;
1717	} u;
1718
1719	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
1720		return (0);
1721
1722	/*
1723	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
1724	 * The memory offset size follows the address-size of the instruction.
1725	 */
1726	n = vie->addrsize;
1727	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
1728
1729	u.u64 = 0;
1730	for (i = 0; i < n; i++) {
1731		if (vie_peek(vie, &x))
1732			return (-1);
1733
1734		u.buf[i] = x;
1735		vie_advance(vie);
1736	}
1737	vie->displacement = u.u64;
1738	return (0);
1739}
1740
1741/*
1742 * Verify that all the bytes in the instruction buffer were consumed.
1743 */
1744static int
1745verify_inst_length(struct vie *vie)
1746{
1747
1748	if (vie->num_processed == vie->num_valid)
1749		return (0);
1750	else
1751		return (-1);
1752}
1753
1754/*
1755 * Verify that the 'guest linear address' provided as collateral of the nested
1756 * page table fault matches our instruction decoding.
1757 */
1758static int
1759verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
1760{
1761	int error;
1762	uint64_t base, idx, gla2;
1763
1764	/* Skip 'gla' verification */
1765	if (gla == VIE_INVALID_GLA)
1766		return (0);
1767
1768	base = 0;
1769	if (vie->base_register != VM_REG_LAST) {
1770		error = vm_get_register(vm, cpuid, vie->base_register, &base);
1771		if (error) {
1772			printf("verify_gla: error %d getting base reg %d\n",
1773				error, vie->base_register);
1774			return (-1);
1775		}
1776
1777		/*
1778		 * RIP-relative addressing starts from the following
1779		 * instruction
1780		 */
1781		if (vie->base_register == VM_REG_GUEST_RIP)
1782			base += vie->num_valid;
1783	}
1784
1785	idx = 0;
1786	if (vie->index_register != VM_REG_LAST) {
1787		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
1788		if (error) {
1789			printf("verify_gla: error %d getting index reg %d\n",
1790				error, vie->index_register);
1791			return (-1);
1792		}
1793	}
1794
1795	/* XXX assuming that the base address of the segment is 0 */
1796	gla2 = base + vie->scale * idx + vie->displacement;
1797	gla2 &= size2mask[vie->addrsize];
1798	if (gla != gla2) {
1799		printf("verify_gla mismatch: "
1800		       "base(0x%0lx), scale(%d), index(0x%0lx), "
1801		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
1802		       base, vie->scale, idx, vie->displacement, gla, gla2);
1803		return (-1);
1804	}
1805
1806	return (0);
1807}
1808
1809int
1810vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
1811		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
1812{
1813
1814	if (decode_prefixes(vie, cpu_mode, cs_d))
1815		return (-1);
1816
1817	if (decode_opcode(vie))
1818		return (-1);
1819
1820	if (decode_modrm(vie, cpu_mode))
1821		return (-1);
1822
1823	if (decode_sib(vie))
1824		return (-1);
1825
1826	if (decode_displacement(vie))
1827		return (-1);
1828
1829	if (decode_immediate(vie))
1830		return (-1);
1831
1832	if (decode_moffset(vie))
1833		return (-1);
1834
1835	if (verify_inst_length(vie))
1836		return (-1);
1837
1838	if (verify_gla(vm, cpuid, gla, vie))
1839		return (-1);
1840
1841	vie->decoded = 1;	/* success */
1842
1843	return (0);
1844}
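
/*
 * Worked end-to-end example (illustrative only): the byte sequence
 * 48 89 08 fetched in 64-bit mode decodes as REX.W (0x48), opcode 0x89
 * (VIE_OP_TYPE_MOV) and ModRM 0x08 (mod = 0, reg = 1, rm = 0), i.e.
 * "mov %rcx,(%rax)" with opsize 8; vmm_emulate_instruction() then handles
 * it in emulate_mov() by reading %rcx and passing the value to memwrite().
 * The sketch below, guarded out of the build, shows the assumed call
 * order; the surrounding variables are hypothetical and would come from
 * the VM-exit handler.
 */
#if 0
	struct vie vie;

	vie_init(&vie);
	/* vmm_fetch_instruction() fills vie.inst and vie.num_valid */
	if (vmm_decode_instruction(vm, vcpuid, gla, CPU_MODE_64BIT, 0,
	    &vie) == 0)
		error = vmm_emulate_instruction(vm, vcpuid, gpa, &vie,
		    &paging, memread, memwrite, memarg);
#endif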
1845#endif	/* _KERNEL */
1846