vmm_instruction_emul.c revision 284900
1/*-
2 * Copyright (c) 2012 Sandvine, Inc.
3 * Copyright (c) 2012 NetApp, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_instruction_emul.c 284900 2015-06-28 03:22:26Z neel $
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_instruction_emul.c 284900 2015-06-28 03:22:26Z neel $");
32
33#ifdef _KERNEL
34#include <sys/param.h>
35#include <sys/pcpu.h>
36#include <sys/systm.h>
37#include <sys/proc.h>
38
39#include <vm/vm.h>
40#include <vm/pmap.h>
41
42#include <machine/vmparam.h>
43#include <machine/vmm.h>
44#else	/* !_KERNEL */
45#include <sys/types.h>
46#include <sys/errno.h>
47#include <sys/_iovec.h>
48
49#include <machine/vmm.h>
50
51#include <assert.h>
52#include <vmmapi.h>
53#define	KASSERT(exp,msg)	assert((exp))
54#endif	/* _KERNEL */
55
56#include <machine/vmm_instruction_emul.h>
57#include <x86/psl.h>
58#include <x86/specialreg.h>
59
60/* struct vie_op.op_type */
61enum {
62	VIE_OP_TYPE_NONE = 0,
63	VIE_OP_TYPE_MOV,
64	VIE_OP_TYPE_MOVSX,
65	VIE_OP_TYPE_MOVZX,
66	VIE_OP_TYPE_AND,
67	VIE_OP_TYPE_OR,
68	VIE_OP_TYPE_SUB,
69	VIE_OP_TYPE_TWO_BYTE,
70	VIE_OP_TYPE_PUSH,
71	VIE_OP_TYPE_CMP,
72	VIE_OP_TYPE_POP,
73	VIE_OP_TYPE_MOVS,
74	VIE_OP_TYPE_GROUP1,
75	VIE_OP_TYPE_STOS,
76	VIE_OP_TYPE_BITTEST,
77	VIE_OP_TYPE_LAST
78};
79
80/* struct vie_op.op_flags */
81#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
82#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
83#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
84#define	VIE_OP_F_NO_MODRM	(1 << 3)
85#define	VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
86
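/*
 * The opcode tables below are indexed by the opcode byte itself:
 * one_byte_opcodes[] by the primary opcode and two_byte_opcodes[] by the
 * byte that follows the 0x0F escape byte (see decode_two_byte_opcode()).
 * Entries left zeroed decode as VIE_OP_TYPE_NONE and are rejected.
 */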
87static const struct vie_op two_byte_opcodes[256] = {
88	[0xB6] = {
89		.op_byte = 0xB6,
90		.op_type = VIE_OP_TYPE_MOVZX,
91	},
92	[0xB7] = {
93		.op_byte = 0xB7,
94		.op_type = VIE_OP_TYPE_MOVZX,
95	},
96	[0xBA] = {
97		.op_byte = 0xBA,
98		.op_type = VIE_OP_TYPE_BITTEST,
99		.op_flags = VIE_OP_F_IMM8,
100	},
101	[0xBE] = {
102		.op_byte = 0xBE,
103		.op_type = VIE_OP_TYPE_MOVSX,
104	},
105};
106
107static const struct vie_op one_byte_opcodes[256] = {
108	[0x0F] = {
109		.op_byte = 0x0F,
110		.op_type = VIE_OP_TYPE_TWO_BYTE
111	},
112	[0x2B] = {
113		.op_byte = 0x2B,
114		.op_type = VIE_OP_TYPE_SUB,
115	},
116	[0x39] = {
117		.op_byte = 0x39,
118		.op_type = VIE_OP_TYPE_CMP,
119	},
120	[0x3B] = {
121		.op_byte = 0x3B,
122		.op_type = VIE_OP_TYPE_CMP,
123	},
124	[0x88] = {
125		.op_byte = 0x88,
126		.op_type = VIE_OP_TYPE_MOV,
127	},
128	[0x89] = {
129		.op_byte = 0x89,
130		.op_type = VIE_OP_TYPE_MOV,
131	},
132	[0x8A] = {
133		.op_byte = 0x8A,
134		.op_type = VIE_OP_TYPE_MOV,
135	},
136	[0x8B] = {
137		.op_byte = 0x8B,
138		.op_type = VIE_OP_TYPE_MOV,
139	},
140	[0xA1] = {
141		.op_byte = 0xA1,
142		.op_type = VIE_OP_TYPE_MOV,
143		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
144	},
145	[0xA3] = {
146		.op_byte = 0xA3,
147		.op_type = VIE_OP_TYPE_MOV,
148		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
149	},
150	[0xA4] = {
151		.op_byte = 0xA4,
152		.op_type = VIE_OP_TYPE_MOVS,
153		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
154	},
155	[0xA5] = {
156		.op_byte = 0xA5,
157		.op_type = VIE_OP_TYPE_MOVS,
158		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
159	},
160	[0xAA] = {
161		.op_byte = 0xAA,
162		.op_type = VIE_OP_TYPE_STOS,
163		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
164	},
165	[0xAB] = {
166		.op_byte = 0xAB,
167		.op_type = VIE_OP_TYPE_STOS,
168		.op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
169	},
170	[0xC6] = {
171		/* XXX Group 11 extended opcode - not just MOV */
172		.op_byte = 0xC6,
173		.op_type = VIE_OP_TYPE_MOV,
174		.op_flags = VIE_OP_F_IMM8,
175	},
176	[0xC7] = {
177		.op_byte = 0xC7,
178		.op_type = VIE_OP_TYPE_MOV,
179		.op_flags = VIE_OP_F_IMM,
180	},
181	[0x23] = {
182		.op_byte = 0x23,
183		.op_type = VIE_OP_TYPE_AND,
184	},
185	[0x80] = {
186		/* Group 1 extended opcode */
187		.op_byte = 0x80,
188		.op_type = VIE_OP_TYPE_GROUP1,
189		.op_flags = VIE_OP_F_IMM8,
190	},
191	[0x81] = {
192		/* Group 1 extended opcode */
193		.op_byte = 0x81,
194		.op_type = VIE_OP_TYPE_GROUP1,
195		.op_flags = VIE_OP_F_IMM,
196	},
197	[0x83] = {
198		/* Group 1 extended opcode */
199		.op_byte = 0x83,
200		.op_type = VIE_OP_TYPE_GROUP1,
201		.op_flags = VIE_OP_F_IMM8,
202	},
203	[0x8F] = {
204		/* XXX Group 1A extended opcode - not just POP */
205		.op_byte = 0x8F,
206		.op_type = VIE_OP_TYPE_POP,
207	},
208	[0xFF] = {
209		/* XXX Group 5 extended opcode - not just PUSH */
210		.op_byte = 0xFF,
211		.op_type = VIE_OP_TYPE_PUSH,
212	}
213};
214
215/* struct vie.mod */
216#define	VIE_MOD_INDIRECT		0
217#define	VIE_MOD_INDIRECT_DISP8		1
218#define	VIE_MOD_INDIRECT_DISP32		2
219#define	VIE_MOD_DIRECT			3
220
221/* struct vie.rm */
222#define	VIE_RM_SIB			4
223#define	VIE_RM_DISP32			5
224
225#define	GB				(1024 * 1024 * 1024)
226
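/*
 * Map the 4-bit register encoding (ModRM/SIB fields extended by the
 * REX.R/X/B bits) to the VMM register identifiers; e.g. encoding 0 is
 * %rax and encoding 12 is %r12.
 */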
227static enum vm_reg_name gpr_map[16] = {
228	VM_REG_GUEST_RAX,
229	VM_REG_GUEST_RCX,
230	VM_REG_GUEST_RDX,
231	VM_REG_GUEST_RBX,
232	VM_REG_GUEST_RSP,
233	VM_REG_GUEST_RBP,
234	VM_REG_GUEST_RSI,
235	VM_REG_GUEST_RDI,
236	VM_REG_GUEST_R8,
237	VM_REG_GUEST_R9,
238	VM_REG_GUEST_R10,
239	VM_REG_GUEST_R11,
240	VM_REG_GUEST_R12,
241	VM_REG_GUEST_R13,
242	VM_REG_GUEST_R14,
243	VM_REG_GUEST_R15
244};
245
246static uint64_t size2mask[] = {
247	[1] = 0xff,
248	[2] = 0xffff,
249	[4] = 0xffffffff,
250	[8] = 0xffffffffffffffff,
251};
252
253static int
254vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
255{
256	int error;
257
258	error = vm_get_register(vm, vcpuid, reg, rval);
259
260	return (error);
261}
262
263static void
264vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
265{
266	*lhbr = 0;
267	*reg = gpr_map[vie->reg];
268
269	/*
270	 * 64-bit mode imposes limitations on accessing legacy high byte
271	 * registers (lhbr).
272	 *
273	 * The legacy high-byte registers cannot be addressed if the REX
274	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
275	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
276	 *
277	 * If the REX prefix is not present then the values 4, 5, 6 and 7
278	 * of the 'ModRM:reg' field address the legacy high-byte registers,
279	 * %ah, %ch, %dh and %bh respectively.
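	 *
	 * For example, 'ModRM:reg' == 4 selects %spl when a REX prefix is
	 * present, but %ah (the high byte of gpr_map[4 & 0x3] == %rax) when
	 * it is not.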
280	 */
281	if (!vie->rex_present) {
282		if (vie->reg & 0x4) {
283			*lhbr = 1;
284			*reg = gpr_map[vie->reg & 0x3];
285		}
286	}
287}
288
289static int
290vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
291{
292	uint64_t val;
293	int error, lhbr;
294	enum vm_reg_name reg;
295
296	vie_calc_bytereg(vie, &reg, &lhbr);
297	error = vm_get_register(vm, vcpuid, reg, &val);
298
299	/*
300	 * To obtain the value of a legacy high byte register shift the
301	 * base register right by 8 bits (%ah = %rax >> 8).
302	 */
303	if (lhbr)
304		*rval = val >> 8;
305	else
306		*rval = val;
307	return (error);
308}
309
310static int
311vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
312{
313	uint64_t origval, val, mask;
314	int error, lhbr;
315	enum vm_reg_name reg;
316
317	vie_calc_bytereg(vie, &reg, &lhbr);
318	error = vm_get_register(vm, vcpuid, reg, &origval);
319	if (error == 0) {
320		val = byte;
321		mask = 0xff;
322		if (lhbr) {
323			/*
324			 * Shift left by 8 to store 'byte' in a legacy high
325			 * byte register.
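			 *
			 * For example, storing 0x5A into %ah when %rax holds
			 * 0x1122334455667788 produces 0x1122334455665a88.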
326			 */
327			val <<= 8;
328			mask <<= 8;
329		}
330		val |= origval & ~mask;
331		error = vm_set_register(vm, vcpuid, reg, val);
332	}
333	return (error);
334}
335
336int
337vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
338		    uint64_t val, int size)
339{
340	int error;
341	uint64_t origval;
342
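	/*
	 * 1- and 2-byte writes preserve the untouched high-order bytes of
	 * the destination register, while a 4-byte write zero-extends to 64
	 * bits, matching the architectural behavior of 32-bit destinations
	 * in long mode. For example, writing 0x1234 with size 2 into a
	 * register holding 0xaaaabbbbccccdddd yields 0xaaaabbbbcccc1234,
	 * whereas the same write with size 4 yields 0x0000000000001234.
	 */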
343	switch (size) {
344	case 1:
345	case 2:
346		error = vie_read_register(vm, vcpuid, reg, &origval);
347		if (error)
348			return (error);
349		val &= size2mask[size];
350		val |= origval & ~size2mask[size];
351		break;
352	case 4:
353		val &= 0xffffffffUL;
354		break;
355	case 8:
356		break;
357	default:
358		return (EINVAL);
359	}
360
361	error = vm_set_register(vm, vcpuid, reg, val);
362	return (error);
363}
364
365#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
366
367/*
368 * Return the status flags that would result from doing (x - y).
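 *
 * The GETCC(sz) macro below instantiates getcc8/16/32/64: each performs the
 * subtraction at its native width with inline assembly and captures the
 * resulting %rflags via pushfq/popq. For example, getcc8(0, 1) returns
 * flags with CF and SF set and ZF clear.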
369 */
370#define	GETCC(sz)							\
371static u_long								\
372getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
373{									\
374	u_long rflags;							\
375									\
376	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
377	    "=r" (rflags), "+r" (x) : "m" (y));				\
378	return (rflags);						\
379} struct __hack
380
381GETCC(8);
382GETCC(16);
383GETCC(32);
384GETCC(64);
385
386static u_long
387getcc(int opsize, uint64_t x, uint64_t y)
388{
389	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
390	    ("getcc: invalid operand size %d", opsize));
391
392	if (opsize == 1)
393		return (getcc8(x, y));
394	else if (opsize == 2)
395		return (getcc16(x, y));
396	else if (opsize == 4)
397		return (getcc32(x, y));
398	else
399		return (getcc64(x, y));
400}
401
402static int
403emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
404	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
405{
406	int error, size;
407	enum vm_reg_name reg;
408	uint8_t byte;
409	uint64_t val;
410
411	size = vie->opsize;
412	error = EINVAL;
413
414	switch (vie->op.op_byte) {
415	case 0x88:
416		/*
417		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
418		 * 88/r:	mov r/m8, r8
419		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
420		 */
421		size = 1;	/* override for byte operation */
422		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
423		if (error == 0)
424			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
425		break;
426	case 0x89:
427		/*
428		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
429		 * 89/r:	mov r/m16, r16
430		 * 89/r:	mov r/m32, r32
431		 * REX.W + 89/r:	mov r/m64, r64
432		 */
433		reg = gpr_map[vie->reg];
434		error = vie_read_register(vm, vcpuid, reg, &val);
435		if (error == 0) {
436			val &= size2mask[size];
437			error = memwrite(vm, vcpuid, gpa, val, size, arg);
438		}
439		break;
440	case 0x8A:
441		/*
442		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
443		 * 8A/r:	mov r8, r/m8
444		 * REX + 8A/r:	mov r8, r/m8
445		 */
446		size = 1;	/* override for byte operation */
447		error = memread(vm, vcpuid, gpa, &val, size, arg);
448		if (error == 0)
449			error = vie_write_bytereg(vm, vcpuid, vie, val);
450		break;
451	case 0x8B:
452		/*
453		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
454		 * 8B/r:	mov r16, r/m16
455		 * 8B/r:	mov r32, r/m32
456		 * REX.W + 8B/r:	mov r64, r/m64
457		 */
458		error = memread(vm, vcpuid, gpa, &val, size, arg);
459		if (error == 0) {
460			reg = gpr_map[vie->reg];
461			error = vie_update_register(vm, vcpuid, reg, val, size);
462		}
463		break;
464	case 0xA1:
465		/*
466		 * MOV from seg:moffset to AX/EAX/RAX
467		 * A1:		mov AX, moffs16
468		 * A1:		mov EAX, moffs32
469		 * REX.W + A1:	mov RAX, moffs64
470		 */
471		error = memread(vm, vcpuid, gpa, &val, size, arg);
472		if (error == 0) {
473			reg = VM_REG_GUEST_RAX;
474			error = vie_update_register(vm, vcpuid, reg, val, size);
475		}
476		break;
477	case 0xA3:
478		/*
479		 * MOV from AX/EAX/RAX to seg:moffset
480		 * A3:		mov moffs16, AX
481		 * A3:		mov moffs32, EAX
482		 * REX.W + A3:	mov moffs64, RAX
483		 */
484		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
485		if (error == 0) {
486			val &= size2mask[size];
487			error = memwrite(vm, vcpuid, gpa, val, size, arg);
488		}
489		break;
490	case 0xC6:
491		/*
492		 * MOV from imm8 to mem (ModRM:r/m)
493		 * C6/0		mov r/m8, imm8
494		 * REX + C6/0	mov r/m8, imm8
495		 */
496		size = 1;	/* override for byte operation */
497		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
498		break;
499	case 0xC7:
500		/*
501		 * MOV from imm16/imm32 to mem (ModRM:r/m)
502		 * C7/0		mov r/m16, imm16
503		 * C7/0		mov r/m32, imm32
504		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
505		 */
506		val = vie->immediate & size2mask[size];
507		error = memwrite(vm, vcpuid, gpa, val, size, arg);
508		break;
509	default:
510		break;
511	}
512
513	return (error);
514}
515
516static int
517emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
518	     mem_region_read_t memread, mem_region_write_t memwrite,
519	     void *arg)
520{
521	int error, size;
522	enum vm_reg_name reg;
523	uint64_t val;
524
525	size = vie->opsize;
526	error = EINVAL;
527
528	switch (vie->op.op_byte) {
529	case 0xB6:
530		/*
531		 * MOV and zero extend byte from mem (ModRM:r/m) to
532		 * reg (ModRM:reg).
533		 *
534		 * 0F B6/r		movzx r16, r/m8
535		 * 0F B6/r		movzx r32, r/m8
536		 * REX.W + 0F B6/r	movzx r64, r/m8
537		 */
538
539		/* get the first operand */
540		error = memread(vm, vcpuid, gpa, &val, 1, arg);
541		if (error)
542			break;
543
544		/* get the second operand */
545		reg = gpr_map[vie->reg];
546
547		/* zero-extend byte */
548		val = (uint8_t)val;
549
550		/* write the result */
551		error = vie_update_register(vm, vcpuid, reg, val, size);
552		break;
553	case 0xB7:
554		/*
555		 * MOV and zero extend word from mem (ModRM:r/m) to
556		 * reg (ModRM:reg).
557		 *
558		 * 0F B7/r		movzx r32, r/m16
559		 * REX.W + 0F B7/r	movzx r64, r/m16
560		 */
561		error = memread(vm, vcpuid, gpa, &val, 2, arg);
562		if (error)
563			return (error);
564
565		reg = gpr_map[vie->reg];
566
567		/* zero-extend word */
568		val = (uint16_t)val;
569
570		error = vie_update_register(vm, vcpuid, reg, val, size);
571		break;
572	case 0xBE:
573		/*
574		 * MOV and sign extend byte from mem (ModRM:r/m) to
575		 * reg (ModRM:reg).
576		 *
577		 * 0F BE/r		movsx r16, r/m8
578		 * 0F BE/r		movsx r32, r/m8
579		 * REX.W + 0F BE/r	movsx r64, r/m8
580		 */
581
582		/* get the first operand */
583		error = memread(vm, vcpuid, gpa, &val, 1, arg);
584		if (error)
585			break;
586
587		/* get the second operand */
588		reg = gpr_map[vie->reg];
589
590		/* sign extend byte */
591		val = (int8_t)val;
592
593		/* write the result */
594		error = vie_update_register(vm, vcpuid, reg, val, size);
595		break;
596	default:
597		break;
598	}
599	return (error);
600}
601
602/*
603 * Helper function to calculate and validate a linear address.
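 *
 * This returns 0 and sets '*fault' when an exception (#GP, #SS or #AC) has
 * been injected into the guest, so callers are expected to check both the
 * return value and '*fault' (e.g. "if (error || fault) goto done;").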
604 */
605static int
606get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
607    int opsize, int addrsize, int prot, enum vm_reg_name seg,
608    enum vm_reg_name gpr, uint64_t *gla, int *fault)
609{
610	struct seg_desc desc;
611	uint64_t cr0, val, rflags;
612	int error;
613
614	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
615	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
616
617	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
618	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
619
620	error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
621	KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
622	    __func__, error, seg));
623
624	error = vie_read_register(vm, vcpuid, gpr, &val);
625	KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
626	    error, gpr));
627
628	if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
629	    addrsize, prot, gla)) {
630		if (seg == VM_REG_GUEST_SS)
631			vm_inject_ss(vm, vcpuid, 0);
632		else
633			vm_inject_gp(vm, vcpuid);
634		goto guest_fault;
635	}
636
637	if (vie_canonical_check(paging->cpu_mode, *gla)) {
638		if (seg == VM_REG_GUEST_SS)
639			vm_inject_ss(vm, vcpuid, 0);
640		else
641			vm_inject_gp(vm, vcpuid);
642		goto guest_fault;
643	}
644
645	if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
646		vm_inject_ac(vm, vcpuid, 0);
647		goto guest_fault;
648	}
649
650	*fault = 0;
651	return (0);
652
653guest_fault:
654	*fault = 1;
655	return (0);
656}
657
658static int
659emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
660    struct vm_guest_paging *paging, mem_region_read_t memread,
661    mem_region_write_t memwrite, void *arg)
662{
663#ifdef _KERNEL
664	struct vm_copyinfo copyinfo[2];
665#else
666	struct iovec copyinfo[2];
667#endif
668	uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
669	uint64_t rcx, rdi, rsi, rflags;
670	int error, fault, opsize, seg, repeat;
671
672	opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
673	val = 0;
674	error = 0;
675
676	/*
677	 * XXX although the MOVS instruction is only supposed to be used with
678	 * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
679	 *
680	 * Empirically the "repnz" prefix has identical behavior to "rep"
681	 * and the zero flag does not make a difference.
682	 */
683	repeat = vie->repz_present | vie->repnz_present;
684
685	if (repeat) {
686		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
687		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
688
689		/*
690		 * The count register is %rcx, %ecx or %cx depending on the
691		 * address size of the instruction.
692		 */
693		if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
694			error = 0;
695			goto done;
696		}
697	}
698
699	/*
700	 *	Source		Destination	Comments
701	 *	--------------------------------------------
702	 * (1)  memory		memory		n/a
703	 * (2)  memory		mmio		emulated
704	 * (3)  mmio		memory		emulated
705	 * (4)  mmio		mmio		emulated
706	 *
707	 * At this point we don't have sufficient information to distinguish
708	 * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
709	 * out because it will succeed only when operating on regular memory.
710	 *
711	 * XXX the emulation doesn't properly handle the case where 'gpa'
712	 * is straddling the boundary between the normal memory and MMIO.
713	 */
714
715	seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
716	error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
717	    PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
718	if (error || fault)
719		goto done;
720
721	error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
722	    copyinfo, nitems(copyinfo), &fault);
723	if (error == 0) {
724		if (fault)
725			goto done;	/* Resume guest to handle fault */
726
727		/*
728		 * case (2): read from system memory and write to mmio.
729		 */
730		vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
731		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
732		error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
733		if (error)
734			goto done;
735	} else {
736		/*
737		 * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
738		 * if 'srcaddr' is in the mmio space.
739		 */
740
741		error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
742		    PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
743		    &fault);
744		if (error || fault)
745			goto done;
746
747		error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
748		    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
749		if (error == 0) {
750			if (fault)
751				goto done;    /* Resume guest to handle fault */
752
753			/*
754			 * case (3): read from MMIO and write to system memory.
755			 *
756			 * An MMIO read can have side-effects, so we
757			 * commit to it only after vm_copy_setup() is
758			 * successful. If a page-fault needs to be
759			 * injected into the guest then it will happen
760			 * before the MMIO read is attempted.
761			 */
762			error = memread(vm, vcpuid, gpa, &val, opsize, arg);
763			if (error)
764				goto done;
765
766			vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
767			vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
768		} else {
769			/*
770			 * Case (4): read from and write to mmio.
771			 *
772			 * Commit to the MMIO read/write (with potential
773			 * side-effects) only after we are sure that the
774			 * instruction is not going to be restarted due
775			 * to address translation faults.
776			 */
777			error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
778			    PROT_READ, &srcgpa, &fault);
779			if (error || fault)
780				goto done;
781
782			error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
783			   PROT_WRITE, &dstgpa, &fault);
784			if (error || fault)
785				goto done;
786
787			error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
788			if (error)
789				goto done;
790
791			error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
792			if (error)
793				goto done;
794		}
795	}
796
797	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
798	KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
799
800	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
801	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
802
803	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
804	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
805
806	if (rflags & PSL_D) {
807		rsi -= opsize;
808		rdi -= opsize;
809	} else {
810		rsi += opsize;
811		rdi += opsize;
812	}
813
814	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
815	    vie->addrsize);
816	KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
817
818	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
819	    vie->addrsize);
820	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
821
822	if (repeat) {
823		rcx = rcx - 1;
824		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
825		    rcx, vie->addrsize);
826		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
827
828		/*
829		 * Repeat the instruction if the count register is not zero.
830		 */
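		 *
		 * vm_restart_instruction() arranges for the guest to
		 * re-execute the MOVS, so a single element is moved per
		 * emulation pass until the count reaches zero.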
831		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
832			vm_restart_instruction(vm, vcpuid);
833	}
834done:
835	KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
836	    __func__, error));
837	return (error);
838}
839
840static int
841emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
842    struct vm_guest_paging *paging, mem_region_read_t memread,
843    mem_region_write_t memwrite, void *arg)
844{
845	int error, opsize, repeat;
846	uint64_t val;
847	uint64_t rcx, rdi, rflags;
848
849	opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
850	repeat = vie->repz_present | vie->repnz_present;
851
852	if (repeat) {
853		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
854		KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
855
856		/*
857		 * The count register is %rcx, %ecx or %cx depending on the
858		 * address size of the instruction.
859		 */
860		if ((rcx & vie_size2mask(vie->addrsize)) == 0)
861			return (0);
862	}
863
864	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
865	KASSERT(!error, ("%s: error %d getting rax", __func__, error));
866
867	error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
868	if (error)
869		return (error);
870
871	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
872	KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
873
874	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
875	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
876
877	if (rflags & PSL_D)
878		rdi -= opsize;
879	else
880		rdi += opsize;
881
882	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
883	    vie->addrsize);
884	KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
885
886	if (repeat) {
887		rcx = rcx - 1;
888		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
889		    rcx, vie->addrsize);
890		KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
891
892		/*
893		 * Repeat the instruction if the count register is not zero.
894		 */
895		if ((rcx & vie_size2mask(vie->addrsize)) != 0)
896			vm_restart_instruction(vm, vcpuid);
897	}
898
899	return (0);
900}
901
902static int
903emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
904	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
905{
906	int error, size;
907	enum vm_reg_name reg;
908	uint64_t result, rflags, rflags2, val1, val2;
909
910	size = vie->opsize;
911	error = EINVAL;
912
913	switch (vie->op.op_byte) {
914	case 0x23:
915		/*
916		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
917		 * result in reg.
918		 *
919		 * 23/r		and r16, r/m16
920		 * 23/r		and r32, r/m32
921		 * REX.W + 23/r	and r64, r/m64
922		 */
923
924		/* get the first operand */
925		reg = gpr_map[vie->reg];
926		error = vie_read_register(vm, vcpuid, reg, &val1);
927		if (error)
928			break;
929
930		/* get the second operand */
931		error = memread(vm, vcpuid, gpa, &val2, size, arg);
932		if (error)
933			break;
934
935		/* perform the operation and write the result */
936		result = val1 & val2;
937		error = vie_update_register(vm, vcpuid, reg, result, size);
938		break;
939	case 0x81:
940	case 0x83:
941		/*
942		 * AND mem (ModRM:r/m) with immediate and store the
943		 * result in mem.
944		 *
945		 * 81 /4		and r/m16, imm16
946		 * 81 /4		and r/m32, imm32
947		 * REX.W + 81 /4	and r/m64, imm32 sign-extended to 64
948		 *
949		 * 83 /4		and r/m16, imm8 sign-extended to 16
950		 * 83 /4		and r/m32, imm8 sign-extended to 32
951		 * REX.W + 83/4		and r/m64, imm8 sign-extended to 64
952		 */
953
954		/* get the first operand */
955		error = memread(vm, vcpuid, gpa, &val1, size, arg);
956		if (error)
957			break;
958
959		/*
960		 * perform the operation with the pre-fetched immediate
961		 * operand and write the result
962		 */
963		result = val1 & vie->immediate;
964		error = memwrite(vm, vcpuid, gpa, result, size, arg);
965		break;
966	default:
967		break;
968	}
969	if (error)
970		return (error);
971
972	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
973	if (error)
974		return (error);
975
976	/*
977	 * OF and CF are cleared; the SF, ZF and PF flags are set according
978	 * to the result; AF is undefined.
979	 *
980	 * The updated status flags are obtained by subtracting 0 from 'result'.
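	 *
	 * Since (result - 0) == result, getcc() computes ZF, SF and PF for
	 * 'result' directly; masking with PSL_PF | PSL_Z | PSL_N below then
	 * leaves CF and OF cleared, as the AND semantics require.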
981	 */
982	rflags2 = getcc(size, result, 0);
983	rflags &= ~RFLAGS_STATUS_BITS;
984	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
985
986	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
987	return (error);
988}
989
990static int
991emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
992	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
993{
994	int error, size;
995	uint64_t val1, result, rflags, rflags2;
996
997	size = vie->opsize;
998	error = EINVAL;
999
1000	switch (vie->op.op_byte) {
1001	case 0x81:
1002	case 0x83:
1003		/*
1004		 * OR mem (ModRM:r/m) with immediate and store the
1005		 * result in mem.
1006		 *
1007		 * 81 /1		or r/m16, imm16
1008		 * 81 /1		or r/m32, imm32
1009		 * REX.W + 81 /1	or r/m64, imm32 sign-extended to 64
1010		 *
1011		 * 83 /1		or r/m16, imm8 sign-extended to 16
1012		 * 83 /1		or r/m32, imm8 sign-extended to 32
1013		 * REX.W + 83/1		or r/m64, imm8 sign-extended to 64
1014		 */
1015
1016		/* get the first operand */
1017		error = memread(vm, vcpuid, gpa, &val1, size, arg);
1018		if (error)
1019			break;
1020
1021		/*
1022		 * perform the operation with the pre-fetched immediate
1023		 * operand and write the result
1024		 */
1025		result = val1 | vie->immediate;
1026		error = memwrite(vm, vcpuid, gpa, result, size, arg);
1027		break;
1028	default:
1029		break;
1030	}
1031	if (error)
1032		return (error);
1033
1034	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1035	if (error)
1036		return (error);
1037
1038	/*
1039	 * OF and CF are cleared; the SF, ZF and PF flags are set according
1040	 * to the result; AF is undefined.
1041	 *
1042	 * The updated status flags are obtained by subtracting 0 from 'result'.
1043	 */
1044	rflags2 = getcc(size, result, 0);
1045	rflags &= ~RFLAGS_STATUS_BITS;
1046	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
1047
1048	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1049	return (error);
1050}
1051
1052static int
1053emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1054	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1055{
1056	int error, size;
1057	uint64_t regop, memop, op1, op2, rflags, rflags2;
1058	enum vm_reg_name reg;
1059
1060	size = vie->opsize;
1061	switch (vie->op.op_byte) {
1062	case 0x39:
1063	case 0x3B:
1064		/*
1065		 * 39/r		CMP r/m16, r16
1066		 * 39/r		CMP r/m32, r32
1067		 * REX.W 39/r	CMP r/m64, r64
1068		 *
1069		 * 3B/r		CMP r16, r/m16
1070		 * 3B/r		CMP r32, r/m32
1071		 * REX.W + 3B/r	CMP r64, r/m64
1072		 *
1073		 * Compare the first operand with the second operand and
1074		 * set status flags in EFLAGS register. The comparison is
1075		 * performed by subtracting the second operand from the first
1076		 * operand and then setting the status flags.
1077		 */
1078
1079		/* Get the register operand */
1080		reg = gpr_map[vie->reg];
1081		error = vie_read_register(vm, vcpuid, reg, &regop);
1082		if (error)
1083			return (error);
1084
1085		/* Get the memory operand */
1086		error = memread(vm, vcpuid, gpa, &memop, size, arg);
1087		if (error)
1088			return (error);
1089
1090		if (vie->op.op_byte == 0x3B) {
1091			op1 = regop;
1092			op2 = memop;
1093		} else {
1094			op1 = memop;
1095			op2 = regop;
1096		}
1097		rflags2 = getcc(size, op1, op2);
1098		break;
1099	case 0x80:
1100	case 0x81:
1101	case 0x83:
1102		/*
1103		 * 80 /7		cmp r/m8, imm8
1104		 * REX + 80 /7		cmp r/m8, imm8
1105		 *
1106		 * 81 /7		cmp r/m16, imm16
1107		 * 81 /7		cmp r/m32, imm32
1108		 * REX.W + 81 /7	cmp r/m64, imm32 sign-extended to 64
1109		 *
1110		 * 83 /7		cmp r/m16, imm8 sign-extended to 16
1111		 * 83 /7		cmp r/m32, imm8 sign-extended to 32
1112		 * REX.W + 83 /7	cmp r/m64, imm8 sign-extended to 64
1113		 *
1114		 * Compare mem (ModRM:r/m) with immediate and set
1115		 * status flags according to the results.  The
1116		 * comparison is performed by subtracting the
1117		 * immediate from the first operand and then setting
1118		 * the status flags.
1119		 *
1120		 */
1121		if (vie->op.op_byte == 0x80)
1122			size = 1;
1123
1124		/* get the first operand */
1125		error = memread(vm, vcpuid, gpa, &op1, size, arg);
1126		if (error)
1127			return (error);
1128
1129		rflags2 = getcc(size, op1, vie->immediate);
1130		break;
1131	default:
1132		return (EINVAL);
1133	}
1134	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1135	if (error)
1136		return (error);
1137	rflags &= ~RFLAGS_STATUS_BITS;
1138	rflags |= rflags2 & RFLAGS_STATUS_BITS;
1139
1140	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1141	return (error);
1142}
1143
1144static int
1145emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1146	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
1147{
1148	int error, size;
1149	uint64_t nval, rflags, rflags2, val1, val2;
1150	enum vm_reg_name reg;
1151
1152	size = vie->opsize;
1153	error = EINVAL;
1154
1155	switch (vie->op.op_byte) {
1156	case 0x2B:
1157		/*
1158		 * SUB r/m from r and store the result in r
1159		 *
1160		 * 2B/r            SUB r16, r/m16
1161		 * 2B/r            SUB r32, r/m32
1162		 * REX.W + 2B/r    SUB r64, r/m64
1163		 */
1164
1165		/* get the first operand */
1166		reg = gpr_map[vie->reg];
1167		error = vie_read_register(vm, vcpuid, reg, &val1);
1168		if (error)
1169			break;
1170
1171		/* get the second operand */
1172		error = memread(vm, vcpuid, gpa, &val2, size, arg);
1173		if (error)
1174			break;
1175
1176		/* perform the operation and write the result */
1177		nval = val1 - val2;
1178		error = vie_update_register(vm, vcpuid, reg, nval, size);
1179		break;
1180	default:
1181		break;
1182	}
1183
1184	if (!error) {
1185		rflags2 = getcc(size, val1, val2);
1186		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1187		    &rflags);
1188		if (error)
1189			return (error);
1190
1191		rflags &= ~RFLAGS_STATUS_BITS;
1192		rflags |= rflags2 & RFLAGS_STATUS_BITS;
1193		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
1194		    rflags, 8);
1195	}
1196
1197	return (error);
1198}
1199
1200static int
1201emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1202    struct vm_guest_paging *paging, mem_region_read_t memread,
1203    mem_region_write_t memwrite, void *arg)
1204{
1205#ifdef _KERNEL
1206	struct vm_copyinfo copyinfo[2];
1207#else
1208	struct iovec copyinfo[2];
1209#endif
1210	struct seg_desc ss_desc;
1211	uint64_t cr0, rflags, rsp, stack_gla, val;
1212	int error, fault, size, stackaddrsize, pushop;
1213
1214	val = 0;
1215	size = vie->opsize;
1216	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
1217
1218	/*
1219	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
1220	 */
1221	if (paging->cpu_mode == CPU_MODE_REAL) {
1222		stackaddrsize = 2;
1223	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
1224		/*
1225		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
1226		 * - Stack pointer size is always 64-bits.
1227		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
1228		 * - 16-bit PUSH/POP is supported by using the operand size
1229		 *   override prefix (66H).
1230		 */
1231		stackaddrsize = 8;
1232		size = vie->opsize_override ? 2 : 8;
1233	} else {
1234		/*
1235		 * In protected or compatibility mode the 'B' flag in the
1236		 * stack-segment descriptor determines the size of the
1237		 * stack pointer.
1238		 */
1239		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
1240		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
1241		    __func__, error));
1242		if (SEG_DESC_DEF32(ss_desc.access))
1243			stackaddrsize = 4;
1244		else
1245			stackaddrsize = 2;
1246	}
1247
1248	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
1249	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
1250
1251	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1252	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1253
1254	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
1255	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
1256	if (pushop) {
1257		rsp -= size;
1258	}
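	/*
	 * For a push the data is written at the decremented %rsp computed
	 * above; for a pop the value is read at the current %rsp and the
	 * increment is applied below. In either case %rsp is written back
	 * only if the MMIO access succeeds.
	 */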
1259
1260	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
1261	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
1262	    &stack_gla)) {
1263		vm_inject_ss(vm, vcpuid, 0);
1264		return (0);
1265	}
1266
1267	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
1268		vm_inject_ss(vm, vcpuid, 0);
1269		return (0);
1270	}
1271
1272	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
1273		vm_inject_ac(vm, vcpuid, 0);
1274		return (0);
1275	}
1276
1277	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
1278	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
1279	    &fault);
1280	if (error || fault)
1281		return (error);
1282
1283	if (pushop) {
1284		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
1285		if (error == 0)
1286			vm_copyout(vm, vcpuid, &val, copyinfo, size);
1287	} else {
1288		vm_copyin(vm, vcpuid, copyinfo, &val, size);
1289		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
1290		rsp += size;
1291	}
1292	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1293
1294	if (error == 0) {
1295		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
1296		    stackaddrsize);
1297		KASSERT(error == 0, ("error %d updating rsp", error));
1298	}
1299	return (error);
1300}
1301
1302static int
1303emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1304    struct vm_guest_paging *paging, mem_region_read_t memread,
1305    mem_region_write_t memwrite, void *arg)
1306{
1307	int error;
1308
1309	/*
1310	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1311	 *
1312	 * PUSH is part of the group 5 extended opcodes and is identified
1313	 * by ModRM:reg = b110.
1314	 */
1315	if ((vie->reg & 7) != 6)
1316		return (EINVAL);
1317
1318	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1319	    memwrite, arg);
1320	return (error);
1321}
1322
1323static int
1324emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
1325    struct vm_guest_paging *paging, mem_region_read_t memread,
1326    mem_region_write_t memwrite, void *arg)
1327{
1328	int error;
1329
1330	/*
1331	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
1332	 *
1333	 * POP is part of the group 1A extended opcodes and is identified
1334	 * by ModRM:reg = b000.
1335	 */
1336	if ((vie->reg & 7) != 0)
1337		return (EINVAL);
1338
1339	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
1340	    memwrite, arg);
1341	return (error);
1342}
1343
1344static int
1345emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1346    struct vm_guest_paging *paging, mem_region_read_t memread,
1347    mem_region_write_t memwrite, void *memarg)
1348{
1349	int error;
1350
1351	switch (vie->reg & 7) {
1352	case 0x1:	/* OR */
1353		error = emulate_or(vm, vcpuid, gpa, vie,
1354		    memread, memwrite, memarg);
1355		break;
1356	case 0x4:	/* AND */
1357		error = emulate_and(vm, vcpuid, gpa, vie,
1358		    memread, memwrite, memarg);
1359		break;
1360	case 0x7:	/* CMP */
1361		error = emulate_cmp(vm, vcpuid, gpa, vie,
1362		    memread, memwrite, memarg);
1363		break;
1364	default:
1365		error = EINVAL;
1366		break;
1367	}
1368
1369	return (error);
1370}
1371
1372static int
1373emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1374    mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
1375{
1376	uint64_t val, rflags;
1377	int error, bitmask, bitoff;
1378
1379	/*
1380	 * 0F BA is a Group 8 extended opcode.
1381	 *
1382	 * Currently we only emulate the 'Bit Test' instruction which is
1383	 * identified by a ModR/M:reg encoding of 100b.
1384	 */
1385	if ((vie->reg & 7) != 4)
1386		return (EINVAL);
1387
1388	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
1389	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
1390
1391	error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
1392	if (error)
1393		return (error);
1394
1395	/*
1396	 * Intel SDM, Vol 2, Table 3-2:
1397	 * "Range of Bit Positions Specified by Bit Offset Operands"
1398	 */
1399	bitmask = vie->opsize * 8 - 1;
1400	bitoff = vie->immediate & bitmask;
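	/*
	 * For example, with a 4-byte operand an immediate of 35 is masked
	 * down to bit position 35 & 31 == 3.
	 */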
1401
1402	/* Copy the bit into the Carry flag in %rflags */
1403	if (val & (1UL << bitoff))
1404		rflags |= PSL_C;
1405	else
1406		rflags &= ~PSL_C;
1407
1408	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
1409	KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
1410
1411	return (0);
1412}
1413
1414int
1415vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
1416    struct vm_guest_paging *paging, mem_region_read_t memread,
1417    mem_region_write_t memwrite, void *memarg)
1418{
1419	int error;
1420
1421	if (!vie->decoded)
1422		return (EINVAL);
1423
1424	switch (vie->op.op_type) {
1425	case VIE_OP_TYPE_GROUP1:
1426		error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
1427		    memwrite, memarg);
1428		break;
1429	case VIE_OP_TYPE_POP:
1430		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
1431		    memwrite, memarg);
1432		break;
1433	case VIE_OP_TYPE_PUSH:
1434		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1435		    memwrite, memarg);
1436		break;
1437	case VIE_OP_TYPE_CMP:
1438		error = emulate_cmp(vm, vcpuid, gpa, vie,
1439				    memread, memwrite, memarg);
1440		break;
1441	case VIE_OP_TYPE_MOV:
1442		error = emulate_mov(vm, vcpuid, gpa, vie,
1443				    memread, memwrite, memarg);
1444		break;
1445	case VIE_OP_TYPE_MOVSX:
1446	case VIE_OP_TYPE_MOVZX:
1447		error = emulate_movx(vm, vcpuid, gpa, vie,
1448				     memread, memwrite, memarg);
1449		break;
1450	case VIE_OP_TYPE_MOVS:
1451		error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
1452		    memwrite, memarg);
1453		break;
1454	case VIE_OP_TYPE_STOS:
1455		error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
1456		    memwrite, memarg);
1457		break;
1458	case VIE_OP_TYPE_AND:
1459		error = emulate_and(vm, vcpuid, gpa, vie,
1460				    memread, memwrite, memarg);
1461		break;
1462	case VIE_OP_TYPE_OR:
1463		error = emulate_or(vm, vcpuid, gpa, vie,
1464				    memread, memwrite, memarg);
1465		break;
1466	case VIE_OP_TYPE_SUB:
1467		error = emulate_sub(vm, vcpuid, gpa, vie,
1468				    memread, memwrite, memarg);
1469		break;
1470	case VIE_OP_TYPE_BITTEST:
1471		error = emulate_bittest(vm, vcpuid, gpa, vie,
1472		    memread, memwrite, memarg);
1473		break;
1474	default:
1475		error = EINVAL;
1476		break;
1477	}
1478
1479	return (error);
1480}
1481
1482int
1483vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1484{
1485	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1486	    ("%s: invalid size %d", __func__, size));
1487	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1488
1489	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1490		return (0);
1491
1492	return ((gla & (size - 1)) ? 1 : 0);
1493}
1494
1495int
1496vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1497{
1498	uint64_t mask;
1499
1500	if (cpu_mode != CPU_MODE_64BIT)
1501		return (0);
1502
1503	/*
1504	 * The value of bit 47 in 'gla' must be replicated in the
1505	 * most significant 16 bits.
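	 *
	 * For example, 0xffff800000000000 is canonical while
	 * 0x0000800000000000 is not.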
1506	 */
1507	mask = ~((1UL << 48) - 1);
1508	if (gla & (1UL << 47))
1509		return ((gla & mask) != mask);
1510	else
1511		return ((gla & mask) != 0);
1512}
1513
1514uint64_t
1515vie_size2mask(int size)
1516{
1517	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1518	    ("vie_size2mask: invalid size %d", size));
1519	return (size2mask[size]);
1520}
1521
1522int
1523vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1524    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1525    int prot, uint64_t *gla)
1526{
1527	uint64_t firstoff, low_limit, high_limit, segbase;
1528	int glasize, type;
1529
1530	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1531	    ("%s: invalid segment %d", __func__, seg));
1532	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1533	    ("%s: invalid operand size %d", __func__, length));
1534	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1535	    ("%s: invalid prot %#x", __func__, prot));
1536
1537	firstoff = offset;
1538	if (cpu_mode == CPU_MODE_64BIT) {
1539		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1540		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1541		glasize = 8;
1542	} else {
1543		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1544		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1545		glasize = 4;
1546		/*
1547		 * If the segment selector is loaded with a NULL selector
1548		 * then the descriptor is unusable and attempting to use
1549		 * it results in a #GP(0).
1550		 */
1551		if (SEG_DESC_UNUSABLE(desc->access))
1552			return (-1);
1553
1554		/*
1555		 * The processor generates a #NP exception when a segment
1556		 * register is loaded with a selector that points to a
1557		 * descriptor that is not present. If this was the case then
1558		 * it would have been checked before the VM-exit.
1559		 */
1560		KASSERT(SEG_DESC_PRESENT(desc->access),
1561		    ("segment %d not present: %#x", seg, desc->access));
1562
1563		/*
1564		 * The descriptor type must indicate a code/data segment.
1565		 */
1566		type = SEG_DESC_TYPE(desc->access);
1567		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1568		    "descriptor type %#x", seg, type));
1569
1570		if (prot & PROT_READ) {
1571			/* #GP on a read access to an exec-only code segment */
1572			if ((type & 0xA) == 0x8)
1573				return (-1);
1574		}
1575
1576		if (prot & PROT_WRITE) {
1577			/*
1578			 * #GP on a write access to a code segment or a
1579			 * read-only data segment.
1580			 */
1581			if (type & 0x8)			/* code segment */
1582				return (-1);
1583
1584			if ((type & 0xA) == 0)		/* read-only data seg */
1585				return (-1);
1586		}
1587
1588		/*
1589		 * 'desc->limit' is fully expanded taking granularity into
1590		 * account.
1591		 */
1592		if ((type & 0xC) == 0x4) {
1593			/* expand-down data segment */
1594			low_limit = desc->limit + 1;
1595			high_limit = SEG_DESC_DEF32(desc->access) ?
1596			    0xffffffff : 0xffff;
1597		} else {
1598			/* code segment or expand-up data segment */
1599			low_limit = 0;
1600			high_limit = desc->limit;
1601		}
1602
1603		while (length > 0) {
1604			offset &= vie_size2mask(addrsize);
1605			if (offset < low_limit || offset > high_limit)
1606				return (-1);
1607			offset++;
1608			length--;
1609		}
1610	}
1611
1612	/*
1613	 * In 64-bit mode all segments except %fs and %gs have a segment
1614	 * base address of 0.
1615	 */
1616	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1617	    seg != VM_REG_GUEST_GS) {
1618		segbase = 0;
1619	} else {
1620		segbase = desc->base;
1621	}
1622
1623	/*
1624	 * Truncate 'firstoff' to the effective address size before adding
1625	 * it to the segment base.
1626	 */
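	 *
	 * For example, with a 2-byte address size an offset of 0x10005 is
	 * truncated to 0x0005 before the base is added, modeling 16-bit
	 * wraparound.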
1627	firstoff &= vie_size2mask(addrsize);
1628	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1629	return (0);
1630}
1631
1632#ifdef _KERNEL
1633void
1634vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1635{
1636	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1637	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1638
1639	bzero(vie, sizeof(struct vie));
1640
1641	vie->base_register = VM_REG_LAST;
1642	vie->index_register = VM_REG_LAST;
1643	vie->segment_register = VM_REG_LAST;
1644
1645	if (inst_length) {
1646		bcopy(inst_bytes, vie->inst, inst_length);
1647		vie->num_valid = inst_length;
1648	}
1649}
1650
1651static int
1652pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1653{
1654	int error_code = 0;
1655
1656	if (pte & PG_V)
1657		error_code |= PGEX_P;
1658	if (prot & VM_PROT_WRITE)
1659		error_code |= PGEX_W;
1660	if (usermode)
1661		error_code |= PGEX_U;
1662	if (rsvd)
1663		error_code |= PGEX_RSV;
1664	if (prot & VM_PROT_EXECUTE)
1665		error_code |= PGEX_I;
1666
1667	return (error_code);
1668}
1669
1670static void
1671ptp_release(void **cookie)
1672{
1673	if (*cookie != NULL) {
1674		vm_gpa_release(*cookie);
1675		*cookie = NULL;
1676	}
1677}
1678
1679static void *
1680ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
1681{
1682	void *ptr;
1683
1684	ptp_release(cookie);
1685	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
1686	return (ptr);
1687}
1688
1689int
1690vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1691    uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
1692{
1693	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
1694	u_int retries;
1695	uint64_t *ptpbase, ptpphys, pte, pgsize;
1696	uint32_t *ptpbase32, pte32;
1697	void *cookie;
1698
1699	*guest_fault = 0;
1700
1701	usermode = (paging->cpl == 3 ? 1 : 0);
1702	writable = prot & VM_PROT_WRITE;
1703	cookie = NULL;
1704	retval = 0;
1705	retries = 0;
1706restart:
1707	ptpphys = paging->cr3;		/* root of the page tables */
1708	ptp_release(&cookie);
1709	if (retries++ > 0)
1710		maybe_yield();
1711
1712	if (vie_canonical_check(paging->cpu_mode, gla)) {
1713		/*
1714		 * XXX assuming a non-stack reference, otherwise a stack fault
1715		 * should be generated.
1716		 */
1717		vm_inject_gp(vm, vcpuid);
1718		goto fault;
1719	}
1720
1721	if (paging->paging_mode == PAGING_MODE_FLAT) {
1722		*gpa = gla;
1723		goto done;
1724	}
1725
1726	if (paging->paging_mode == PAGING_MODE_32) {
1727		nlevels = 2;
1728		while (--nlevels >= 0) {
1729			/* Zero out the lower 12 bits. */
1730			ptpphys &= ~0xfff;
1731
1732			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1733
1734			if (ptpbase32 == NULL)
1735				goto error;
1736
1737			ptpshift = PAGE_SHIFT + nlevels * 10;
1738			ptpindex = (gla >> ptpshift) & 0x3FF;
1739			pgsize = 1UL << ptpshift;
1740
1741			pte32 = ptpbase32[ptpindex];
1742
1743			if ((pte32 & PG_V) == 0 ||
1744			    (usermode && (pte32 & PG_U) == 0) ||
1745			    (writable && (pte32 & PG_RW) == 0)) {
1746				pfcode = pf_error_code(usermode, prot, 0,
1747				    pte32);
1748				vm_inject_pf(vm, vcpuid, pfcode, gla);
1749				goto fault;
1750			}
1751
1752			/*
1753			 * Emulate the x86 MMU's management of the accessed
1754			 * and dirty flags. While the accessed flag is set
1755			 * at every level of the page table, the dirty flag
1756			 * is only set at the last level providing the guest
1757			 * physical address.
1758			 */
1759			if ((pte32 & PG_A) == 0) {
1760				if (atomic_cmpset_32(&ptpbase32[ptpindex],
1761				    pte32, pte32 | PG_A) == 0) {
1762					goto restart;
1763				}
1764			}
1765
1766			/* XXX must be ignored if CR4.PSE=0 */
1767			if (nlevels > 0 && (pte32 & PG_PS) != 0)
1768				break;
1769
1770			ptpphys = pte32;
1771		}
1772
1773		/* Set the dirty bit in the page table entry if necessary */
1774		if (writable && (pte32 & PG_M) == 0) {
1775			if (atomic_cmpset_32(&ptpbase32[ptpindex],
1776			    pte32, pte32 | PG_M) == 0) {
1777				goto restart;
1778			}
1779		}
1780
1781		/* Zero out the lower 'ptpshift' bits */
1782		pte32 >>= ptpshift; pte32 <<= ptpshift;
1783		*gpa = pte32 | (gla & (pgsize - 1));
1784		goto done;
1785	}
1786
1787	if (paging->paging_mode == PAGING_MODE_PAE) {
1788		/* Zero out the lower 5 bits and the upper 32 bits */
1789		ptpphys &= 0xffffffe0UL;
1790
1791		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
1792		if (ptpbase == NULL)
1793			goto error;
1794
1795		ptpindex = (gla >> 30) & 0x3;
1796
1797		pte = ptpbase[ptpindex];
1798
1799		if ((pte & PG_V) == 0) {
1800			pfcode = pf_error_code(usermode, prot, 0, pte);
1801			vm_inject_pf(vm, vcpuid, pfcode, gla);
1802			goto fault;
1803		}
1804
1805		ptpphys = pte;
1806
1807		nlevels = 2;
1808	} else
1809		nlevels = 4;
1810	while (--nlevels >= 0) {
1811		/* Zero out the lower 12 bits and the upper 12 bits */
1812		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
1813
1814		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1815		if (ptpbase == NULL)
1816			goto error;
1817
1818		ptpshift = PAGE_SHIFT + nlevels * 9;
1819		ptpindex = (gla >> ptpshift) & 0x1FF;
1820		pgsize = 1UL << ptpshift;
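		/*
		 * For example, nlevels == 1 gives ptpshift 21 (2MB pages) and
		 * nlevels == 2 gives ptpshift 30 (1GB pages); the PG_PS check
		 * below rejects any superpage larger than 1GB.
		 */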
1821
1822		pte = ptpbase[ptpindex];
1823
1824		if ((pte & PG_V) == 0 ||
1825		    (usermode && (pte & PG_U) == 0) ||
1826		    (writable && (pte & PG_RW) == 0)) {
1827			pfcode = pf_error_code(usermode, prot, 0, pte);
1828			vm_inject_pf(vm, vcpuid, pfcode, gla);
1829			goto fault;
1830		}
1831
1832		/* Set the accessed bit in the page table entry */
1833		if ((pte & PG_A) == 0) {
1834			if (atomic_cmpset_64(&ptpbase[ptpindex],
1835			    pte, pte | PG_A) == 0) {
1836				goto restart;
1837			}
1838		}
1839
1840		if (nlevels > 0 && (pte & PG_PS) != 0) {
1841			if (pgsize > 1 * GB) {
1842				pfcode = pf_error_code(usermode, prot, 1, pte);
1843				vm_inject_pf(vm, vcpuid, pfcode, gla);
1844				goto fault;
1845			}
1846			break;
1847		}
1848
1849		ptpphys = pte;
1850	}
1851
1852	/* Set the dirty bit in the page table entry if necessary */
1853	if (writable && (pte & PG_M) == 0) {
1854		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
1855			goto restart;
1856	}
1857
1858	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
1859	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
1860	*gpa = pte | (gla & (pgsize - 1));
1861done:
1862	ptp_release(&cookie);
1863	KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
1864	    __func__, retval));
1865	return (retval);
1866error:
1867	retval = EFAULT;
1868	goto done;
1869fault:
1870	*guest_fault = 1;
1871	goto done;
1872}
1873
1874int
1875vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1876    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
1877{
1878	struct vm_copyinfo copyinfo[2];
1879	int error, prot;
1880
1881	if (inst_length > VIE_INST_SIZE)
1882		panic("vmm_fetch_instruction: invalid length %d", inst_length);
1883
1884	prot = PROT_READ | PROT_EXEC;
1885	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
1886	    copyinfo, nitems(copyinfo), faultptr);
1887	if (error || *faultptr)
1888		return (error);
1889
1890	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
1891	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1892	vie->num_valid = inst_length;
1893	return (0);
1894}
1895
1896static int
1897vie_peek(struct vie *vie, uint8_t *x)
1898{
1899
1900	if (vie->num_processed < vie->num_valid) {
1901		*x = vie->inst[vie->num_processed];
1902		return (0);
1903	} else
1904		return (-1);
1905}
1906
1907static void
1908vie_advance(struct vie *vie)
1909{
1910
1911	vie->num_processed++;
1912}
1913
1914static bool
1915segment_override(uint8_t x, int *seg)
1916{
1917
1918	switch (x) {
1919	case 0x2E:
1920		*seg = VM_REG_GUEST_CS;
1921		break;
1922	case 0x36:
1923		*seg = VM_REG_GUEST_SS;
1924		break;
1925	case 0x3E:
1926		*seg = VM_REG_GUEST_DS;
1927		break;
1928	case 0x26:
1929		*seg = VM_REG_GUEST_ES;
1930		break;
1931	case 0x64:
1932		*seg = VM_REG_GUEST_FS;
1933		break;
1934	case 0x65:
1935		*seg = VM_REG_GUEST_GS;
1936		break;
1937	default:
1938		return (false);
1939	}
1940	return (true);
1941}
1942
1943static int
1944decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
1945{
1946	uint8_t x;
1947
1948	while (1) {
1949		if (vie_peek(vie, &x))
1950			return (-1);
1951
1952		if (x == 0x66)
1953			vie->opsize_override = 1;
1954		else if (x == 0x67)
1955			vie->addrsize_override = 1;
1956		else if (x == 0xF3)
1957			vie->repz_present = 1;
1958		else if (x == 0xF2)
1959			vie->repnz_present = 1;
1960		else if (segment_override(x, &vie->segment_register))
1961			vie->segment_override = 1;
1962		else
1963			break;
1964
1965		vie_advance(vie);
1966	}
1967
1968	/*
1969	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
1970	 * - Only one REX prefix is allowed per instruction.
1971	 * - The REX prefix must immediately precede the opcode byte or the
1972	 *   escape opcode byte.
1973	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
1974	 *   the mandatory prefix must come before the REX prefix.
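	 *
	 * For example, a REX byte of 0x48 (0100.1000b) sets only REX.W and
	 * selects a 64-bit operand size.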
1975	 */
1976	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
1977		vie->rex_present = 1;
1978		vie->rex_w = x & 0x8 ? 1 : 0;
1979		vie->rex_r = x & 0x4 ? 1 : 0;
1980		vie->rex_x = x & 0x2 ? 1 : 0;
1981		vie->rex_b = x & 0x1 ? 1 : 0;
1982		vie_advance(vie);
1983	}
1984
1985	/*
1986	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
1987	 */
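	 *
	 * Note that in 64-bit mode REX.W takes precedence over the 66H
	 * operand-size override: REX.W selects 64-bit operands even if 66H
	 * is also present.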
1988	if (cpu_mode == CPU_MODE_64BIT) {
1989		/*
1990		 * Default address size is 64-bits and default operand size
1991		 * is 32-bits.
1992		 */
1993		vie->addrsize = vie->addrsize_override ? 4 : 8;
1994		if (vie->rex_w)
1995			vie->opsize = 8;
1996		else if (vie->opsize_override)
1997			vie->opsize = 2;
1998		else
1999			vie->opsize = 4;
2000	} else if (cs_d) {
2001		/* Default address and operand sizes are 32-bits */
2002		vie->addrsize = vie->addrsize_override ? 2 : 4;
2003		vie->opsize = vie->opsize_override ? 2 : 4;
2004	} else {
2005		/* Default address and operand sizes are 16-bits */
2006		vie->addrsize = vie->addrsize_override ? 4 : 2;
2007		vie->opsize = vie->opsize_override ? 4 : 2;
2008	}
2009	return (0);
2010}
2011
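/*
 * Decode the second opcode byte of a two-byte (0x0F-escaped) instruction
 * using the 'two_byte_opcodes' table.
 */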
2012static int
2013decode_two_byte_opcode(struct vie *vie)
2014{
2015	uint8_t x;
2016
2017	if (vie_peek(vie, &x))
2018		return (-1);
2019
2020	vie->op = two_byte_opcodes[x];
2021
2022	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2023		return (-1);
2024
2025	vie_advance(vie);
2026	return (0);
2027}
2028
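/*
 * Decode the opcode byte using the 'one_byte_opcodes' table and fall
 * through to the two-byte decoder when the 0x0F escape byte is found.
 */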
2029static int
2030decode_opcode(struct vie *vie)
2031{
2032	uint8_t x;
2033
2034	if (vie_peek(vie, &x))
2035		return (-1);
2036
2037	vie->op = one_byte_opcodes[x];
2038
2039	if (vie->op.op_type == VIE_OP_TYPE_NONE)
2040		return (-1);
2041
2042	vie_advance(vie);
2043
2044	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
2045		return (decode_two_byte_opcode(vie));
2046
2047	return (0);
2048}
2049
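/*
 * Decode the ModRM byte: mod in bits 7:6, reg in bits 5:3 and r/m in
 * bits 2:0, with reg and r/m optionally extended by REX.R and REX.B.
 * For example, ModRM 0x48 (mod=1, reg=1, r/m=0) with no REX prefix selects
 * %rax as the base register followed by an 8-bit displacement.
 */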
2050static int
2051decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
2052{
2053	uint8_t x;
2054
2055	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
2056		return (0);
2057
2058	if (cpu_mode == CPU_MODE_REAL)
2059		return (-1);
2060
2061	if (vie_peek(vie, &x))
2062		return (-1);
2063
2064	vie->mod = (x >> 6) & 0x3;
2065	vie->rm =  (x >> 0) & 0x7;
2066	vie->reg = (x >> 3) & 0x7;
2067
2068	/*
2069	 * A direct addressing mode makes no sense in the context of an EPT
2070	 * fault. There has to be a memory access involved to cause the
2071	 * EPT fault.
2072	 */
2073	if (vie->mod == VIE_MOD_DIRECT)
2074		return (-1);
2075
2076	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
2077	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
2078		/*
2079		 * Table 2-5: Special Cases of REX Encodings
2080		 *
2081		 * mod=0, r/m=5 is used in the compatibility mode to
2082		 * indicate a disp32 without a base register.
2083		 *
2084		 * mod!=3, r/m=4 is used in the compatibility mode to
2085		 * indicate that the SIB byte is present.
2086		 *
2087		 * The 'b' bit in the REX prefix is a don't-care in
2088		 * this case.
2089		 */
2090	} else {
2091		vie->rm |= (vie->rex_b << 3);
2092	}
2093
2094	vie->reg |= (vie->rex_r << 3);
2095
2096	/* SIB */
2097	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
2098		goto done;
2099
2100	vie->base_register = gpr_map[vie->rm];
2101
2102	switch (vie->mod) {
2103	case VIE_MOD_INDIRECT_DISP8:
2104		vie->disp_bytes = 1;
2105		break;
2106	case VIE_MOD_INDIRECT_DISP32:
2107		vie->disp_bytes = 4;
2108		break;
2109	case VIE_MOD_INDIRECT:
2110		if (vie->rm == VIE_RM_DISP32) {
2111			vie->disp_bytes = 4;
2112			/*
2113			 * Table 2-7. RIP-Relative Addressing
2114			 *
2115			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
2116			 * whereas in compatibility mode it just implies disp32.
2117			 */
2118
2119			if (cpu_mode == CPU_MODE_64BIT)
2120				vie->base_register = VM_REG_GUEST_RIP;
2121			else
2122				vie->base_register = VM_REG_LAST;
2123		}
2124		break;
2125	}
2126
2127done:
2128	vie_advance(vie);
2129
2130	return (0);
2131}
2132
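/*
 * Decode the SIB byte: scale in bits 7:6, index in bits 5:3 and base in
 * bits 2:0, with index and base optionally extended by REX.X and REX.B.
 * For example, SIB 0x98 (ss=2, index=3, base=0) with no REX prefix selects
 * %rax as the base, %rbx as the index and a scale factor of 4.
 */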
2133static int
2134decode_sib(struct vie *vie)
2135{
2136	uint8_t x;
2137
2138	/* Proceed only if SIB byte is present */
2139	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
2140		return (0);
2141
2142	if (vie_peek(vie, &x))
2143		return (-1);
2144
2145	/* De-construct the SIB byte */
2146	vie->ss = (x >> 6) & 0x3;
2147	vie->index = (x >> 3) & 0x7;
2148	vie->base = (x >> 0) & 0x7;
2149
2150	/* Apply the REX prefix modifiers */
2151	vie->index |= vie->rex_x << 3;
2152	vie->base |= vie->rex_b << 3;
2153
2154	switch (vie->mod) {
2155	case VIE_MOD_INDIRECT_DISP8:
2156		vie->disp_bytes = 1;
2157		break;
2158	case VIE_MOD_INDIRECT_DISP32:
2159		vie->disp_bytes = 4;
2160		break;
2161	}
2162
2163	if (vie->mod == VIE_MOD_INDIRECT &&
2164	    (vie->base == 5 || vie->base == 13)) {
2165		/*
2166		 * Special case: the base register is unused when mod = 0
2167		 * and base = %rbp or %r13; a disp32 follows instead.
2168		 *
2169		 * Documented in:
2170		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2171		 * Table 2-5: Special Cases of REX Encodings
2172		 */
2173		vie->disp_bytes = 4;
2174	} else {
2175		vie->base_register = gpr_map[vie->base];
2176	}
2177
2178	/*
2179	 * All encodings of 'index' are valid except for %rsp (4).
2180	 *
2181	 * Documented in:
2182	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
2183	 * Table 2-5: Special Cases of REX Encodings
2184	 */
2185	if (vie->index != 4)
2186		vie->index_register = gpr_map[vie->index];
2187
2188	/* 'scale' makes sense only in the context of an index register */
2189	if (vie->index_register < VM_REG_LAST)
2190		vie->scale = 1 << vie->ss;
2191
2192	vie_advance(vie);
2193
2194	return (0);
2195}
2196
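/*
 * Decode the 1- or 4-byte displacement selected by the ModRM/SIB decoding
 * and sign-extend it into 'vie->displacement'.
 */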
2197static int
2198decode_displacement(struct vie *vie)
2199{
2200	int n, i;
2201	uint8_t x;
2202
2203	union {
2204		char	buf[4];
2205		int8_t	signed8;
2206		int32_t	signed32;
2207	} u;
2208
2209	if ((n = vie->disp_bytes) == 0)
2210		return (0);
2211
2212	if (n != 1 && n != 4)
2213		panic("decode_displacement: invalid disp_bytes %d", n);
2214
2215	for (i = 0; i < n; i++) {
2216		if (vie_peek(vie, &x))
2217			return (-1);
2218
2219		u.buf[i] = x;
2220		vie_advance(vie);
2221	}
2222
2223	if (n == 1)
2224		vie->displacement = u.signed8;		/* sign-extended */
2225	else
2226		vie->displacement = u.signed32;		/* sign-extended */
2227
2228	return (0);
2229}
2230
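/*
 * Decode the immediate operand, if any, and sign-extend it into
 * 'vie->immediate'.  A VIE_OP_F_IMM immediate is 4 bytes when the operand
 * size is 4 or 8 bytes and 2 bytes otherwise; a VIE_OP_F_IMM8 immediate is
 * always a single byte.
 */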
2231static int
2232decode_immediate(struct vie *vie)
2233{
2234	int i, n;
2235	uint8_t x;
2236	union {
2237		char	buf[4];
2238		int8_t	signed8;
2239		int16_t	signed16;
2240		int32_t	signed32;
2241	} u;
2242
2243	/* Figure out immediate operand size (if any) */
2244	if (vie->op.op_flags & VIE_OP_F_IMM) {
2245		/*
2246		 * Section 2.2.1.5 "Immediates", Intel SDM:
2247		 * In 64-bit mode the typical size of immediate operands
2248		 * remains 32-bits. When the operand size is 64-bits, the
2249		 * processor sign-extends all immediates to 64-bits prior
2250		 * to their use.
2251		 */
2252		if (vie->opsize == 4 || vie->opsize == 8)
2253			vie->imm_bytes = 4;
2254		else
2255			vie->imm_bytes = 2;
2256	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
2257		vie->imm_bytes = 1;
2258	}
2259
2260	if ((n = vie->imm_bytes) == 0)
2261		return (0);
2262
2263	KASSERT(n == 1 || n == 2 || n == 4,
2264	    ("%s: invalid number of immediate bytes: %d", __func__, n));
2265
2266	for (i = 0; i < n; i++) {
2267		if (vie_peek(vie, &x))
2268			return (-1);
2269
2270		u.buf[i] = x;
2271		vie_advance(vie);
2272	}
2273
2274	/* sign-extend the immediate value before use */
2275	if (n == 1)
2276		vie->immediate = u.signed8;
2277	else if (n == 2)
2278		vie->immediate = u.signed16;
2279	else
2280		vie->immediate = u.signed32;
2281
2282	return (0);
2283}
2284
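/*
 * Decode the direct memory offset of a moffset-style MOV.  The offset is
 * 'vie->addrsize' bytes long and is stored, zero-extended, in
 * 'vie->displacement'.
 */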
2285static int
2286decode_moffset(struct vie *vie)
2287{
2288	int i, n;
2289	uint8_t x;
2290	union {
2291		char	buf[8];
2292		uint64_t u64;
2293	} u;
2294
2295	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
2296		return (0);
2297
2298	/*
2299	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
2300	 * The memory offset size follows the address-size of the instruction.
2301	 */
2302	n = vie->addrsize;
2303	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
2304
2305	u.u64 = 0;
2306	for (i = 0; i < n; i++) {
2307		if (vie_peek(vie, &x))
2308			return (-1);
2309
2310		u.buf[i] = x;
2311		vie_advance(vie);
2312	}
2313	vie->displacement = u.u64;
2314	return (0);
2315}
2316
2317/*
2318 * Verify that the 'guest linear address' provided as collateral of the nested
2319 * page table fault matches our instruction decoding.
2320 */
2321static int
2322verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
2323{
2324	int error;
2325	uint64_t base, idx, gla2;
2326
2327	/* Skip 'gla' verification */
2328	if (gla == VIE_INVALID_GLA)
2329		return (0);
2330
2331	base = 0;
2332	if (vie->base_register != VM_REG_LAST) {
2333		error = vm_get_register(vm, cpuid, vie->base_register, &base);
2334		if (error) {
2335			printf("verify_gla: error %d getting base reg %d\n",
2336				error, vie->base_register);
2337			return (-1);
2338		}
2339
2340		/*
2341		 * RIP-relative addressing starts from the following
2342		 * instruction
2343		 */
2344		if (vie->base_register == VM_REG_GUEST_RIP)
2345			base += vie->num_processed;
2346	}
2347
2348	idx = 0;
2349	if (vie->index_register != VM_REG_LAST) {
2350		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
2351		if (error) {
2352			printf("verify_gla: error %d getting index reg %d\n",
2353				error, vie->index_register);
2354			return (-1);
2355		}
2356	}
2357
2358	/* XXX assuming that the base address of the segment is 0 */
2359	gla2 = base + vie->scale * idx + vie->displacement;
2360	gla2 &= size2mask[vie->addrsize];
2361	if (gla != gla2) {
2362		printf("verify_gla mismatch: "
2363		       "base(0x%0lx), scale(%d), index(0x%0lx), "
2364		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
2365		       base, vie->scale, idx, vie->displacement, gla, gla2);
2366		return (-1);
2367	}
2368
2369	return (0);
2370}
2371
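/*
 * Decode the instruction bytes fetched by vmm_fetch_instruction() in the
 * order the components appear in the encoding: prefixes, opcode, ModRM,
 * SIB, displacement, immediate and memory offset.  Unless the opcode opts
 * out, the decoded addressing mode is cross-checked against the guest
 * linear address reported with the nested page table fault.
 */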
2372int
2373vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
2374		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
2375{
2376
2377	if (decode_prefixes(vie, cpu_mode, cs_d))
2378		return (-1);
2379
2380	if (decode_opcode(vie))
2381		return (-1);
2382
2383	if (decode_modrm(vie, cpu_mode))
2384		return (-1);
2385
2386	if (decode_sib(vie))
2387		return (-1);
2388
2389	if (decode_displacement(vie))
2390		return (-1);
2391
2392	if (decode_immediate(vie))
2393		return (-1);
2394
2395	if (decode_moffset(vie))
2396		return (-1);
2397
2398	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
2399		if (verify_gla(vm, cpuid, gla, vie))
2400			return (-1);
2401	}
2402
2403	vie->decoded = 1;	/* success */
2404
2405	return (0);
2406}
2407#endif	/* _KERNEL */
2408