vmm_instruction_emul.c revision 276403
1/*-
2 * Copyright (c) 2012 Sandvine, Inc.
3 * Copyright (c) 2012 NetApp, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 *
27 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_instruction_emul.c 276403 2014-12-30 08:24:14Z neel $
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_instruction_emul.c 276403 2014-12-30 08:24:14Z neel $");
32
33#ifdef _KERNEL
34#include <sys/param.h>
35#include <sys/pcpu.h>
36#include <sys/systm.h>
37#include <sys/proc.h>
38
39#include <vm/vm.h>
40#include <vm/pmap.h>
41
42#include <machine/vmparam.h>
43#include <machine/vmm.h>
44#else	/* !_KERNEL */
45#include <sys/types.h>
46#include <sys/errno.h>
47#include <sys/_iovec.h>
48
49#include <machine/vmm.h>
50
51#include <assert.h>
52#include <vmmapi.h>
53#define	KASSERT(exp,msg)	assert((exp))
54#endif	/* _KERNEL */
55
56#include <machine/vmm_instruction_emul.h>
57#include <x86/psl.h>
58#include <x86/specialreg.h>
59
60/* struct vie_op.op_type */
61enum {
62	VIE_OP_TYPE_NONE = 0,
63	VIE_OP_TYPE_MOV,
64	VIE_OP_TYPE_MOVSX,
65	VIE_OP_TYPE_MOVZX,
66	VIE_OP_TYPE_AND,
67	VIE_OP_TYPE_OR,
68	VIE_OP_TYPE_SUB,
69	VIE_OP_TYPE_TWO_BYTE,
70	VIE_OP_TYPE_PUSH,
71	VIE_OP_TYPE_CMP,
72	VIE_OP_TYPE_POP,
73	VIE_OP_TYPE_LAST
74};
75
76/* struct vie_op.op_flags */
77#define	VIE_OP_F_IMM		(1 << 0)  /* 16/32-bit immediate operand */
78#define	VIE_OP_F_IMM8		(1 << 1)  /* 8-bit immediate operand */
79#define	VIE_OP_F_MOFFSET	(1 << 2)  /* 16/32/64-bit immediate moffset */
80#define	VIE_OP_F_NO_MODRM	(1 << 3)
81
82static const struct vie_op two_byte_opcodes[256] = {
83	[0xB6] = {
84		.op_byte = 0xB6,
85		.op_type = VIE_OP_TYPE_MOVZX,
86	},
87	[0xB7] = {
88		.op_byte = 0xB7,
89		.op_type = VIE_OP_TYPE_MOVZX,
90	},
91	[0xBE] = {
92		.op_byte = 0xBE,
93		.op_type = VIE_OP_TYPE_MOVSX,
94	},
95};
96
97static const struct vie_op one_byte_opcodes[256] = {
98	[0x0F] = {
99		.op_byte = 0x0F,
100		.op_type = VIE_OP_TYPE_TWO_BYTE
101	},
102	[0x2B] = {
103		.op_byte = 0x2B,
104		.op_type = VIE_OP_TYPE_SUB,
105	},
106	[0x3B] = {
107		.op_byte = 0x3B,
108		.op_type = VIE_OP_TYPE_CMP,
109	},
110	[0x88] = {
111		.op_byte = 0x88,
112		.op_type = VIE_OP_TYPE_MOV,
113	},
114	[0x89] = {
115		.op_byte = 0x89,
116		.op_type = VIE_OP_TYPE_MOV,
117	},
118	[0x8A] = {
119		.op_byte = 0x8A,
120		.op_type = VIE_OP_TYPE_MOV,
121	},
122	[0x8B] = {
123		.op_byte = 0x8B,
124		.op_type = VIE_OP_TYPE_MOV,
125	},
126	[0xA1] = {
127		.op_byte = 0xA1,
128		.op_type = VIE_OP_TYPE_MOV,
129		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
130	},
131	[0xA3] = {
132		.op_byte = 0xA3,
133		.op_type = VIE_OP_TYPE_MOV,
134		.op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
135	},
136	[0xC6] = {
137		/* XXX Group 11 extended opcode - not just MOV */
138		.op_byte = 0xC6,
139		.op_type = VIE_OP_TYPE_MOV,
140		.op_flags = VIE_OP_F_IMM8,
141	},
142	[0xC7] = {
143		.op_byte = 0xC7,
144		.op_type = VIE_OP_TYPE_MOV,
145		.op_flags = VIE_OP_F_IMM,
146	},
147	[0x23] = {
148		.op_byte = 0x23,
149		.op_type = VIE_OP_TYPE_AND,
150	},
151	[0x81] = {
152		/* XXX Group 1 extended opcode - not just AND */
153		.op_byte = 0x81,
154		.op_type = VIE_OP_TYPE_AND,
155		.op_flags = VIE_OP_F_IMM,
156	},
157	[0x83] = {
158		/* XXX Group 1 extended opcode - not just OR */
159		.op_byte = 0x83,
160		.op_type = VIE_OP_TYPE_OR,
161		.op_flags = VIE_OP_F_IMM8,
162	},
163	[0x8F] = {
164		/* XXX Group 1A extended opcode - not just POP */
165		.op_byte = 0x8F,
166		.op_type = VIE_OP_TYPE_POP,
167	},
168	[0xFF] = {
169		/* XXX Group 5 extended opcode - not just PUSH */
170		.op_byte = 0xFF,
171		.op_type = VIE_OP_TYPE_PUSH,
172	}
173};
174
175/* struct vie.mod */
176#define	VIE_MOD_INDIRECT		0
177#define	VIE_MOD_INDIRECT_DISP8		1
178#define	VIE_MOD_INDIRECT_DISP32		2
179#define	VIE_MOD_DIRECT			3
180
181/* struct vie.rm */
182#define	VIE_RM_SIB			4
183#define	VIE_RM_DISP32			5
184
185#define	GB				(1024 * 1024 * 1024)
186
187static enum vm_reg_name gpr_map[16] = {
188	VM_REG_GUEST_RAX,
189	VM_REG_GUEST_RCX,
190	VM_REG_GUEST_RDX,
191	VM_REG_GUEST_RBX,
192	VM_REG_GUEST_RSP,
193	VM_REG_GUEST_RBP,
194	VM_REG_GUEST_RSI,
195	VM_REG_GUEST_RDI,
196	VM_REG_GUEST_R8,
197	VM_REG_GUEST_R9,
198	VM_REG_GUEST_R10,
199	VM_REG_GUEST_R11,
200	VM_REG_GUEST_R12,
201	VM_REG_GUEST_R13,
202	VM_REG_GUEST_R14,
203	VM_REG_GUEST_R15
204};
205
206static uint64_t size2mask[] = {
207	[1] = 0xff,
208	[2] = 0xffff,
209	[4] = 0xffffffff,
210	[8] = 0xffffffffffffffff,
211};
212
213static int
214vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
215{
216	int error;
217
218	error = vm_get_register(vm, vcpuid, reg, rval);
219
220	return (error);
221}
222
223static void
224vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
225{
226	*lhbr = 0;
227	*reg = gpr_map[vie->reg];
228
229	/*
230	 * 64-bit mode imposes limitations on accessing legacy high byte
231	 * registers (lhbr).
232	 *
233	 * The legacy high-byte registers cannot be addressed if the REX
234	 * prefix is present. In this case the values 4, 5, 6 and 7 of the
235	 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
236	 *
237	 * If the REX prefix is not present then the values 4, 5, 6 and 7
238	 * of the 'ModRM:reg' field address the legacy high-byte registers,
239	 * %ah, %ch, %dh and %bh respectively.
240	 */
241	if (!vie->rex_present) {
242		if (vie->reg & 0x4) {
243			*lhbr = 1;
244			*reg = gpr_map[vie->reg & 0x3];
245		}
246	}
247}
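/*
 * Illustrative example for vie_calc_bytereg(): with ModRM:reg = 5 and no
 * REX prefix the byte operand is %ch, so the base register becomes
 * gpr_map[5 & 0x3] = %rcx with lhbr = 1.  If any REX prefix is present the
 * same encoding selects %bpl, i.e. gpr_map[5] = %rbp with lhbr = 0.
 */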
248
249static int
250vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
251{
252	uint64_t val;
253	int error, lhbr;
254	enum vm_reg_name reg;
255
256	vie_calc_bytereg(vie, &reg, &lhbr);
257	error = vm_get_register(vm, vcpuid, reg, &val);
258
259	/*
260	 * To obtain the value of a legacy high byte register shift the
261	 * base register right by 8 bits (%ah = %rax >> 8).
262	 */
263	if (lhbr)
264		*rval = val >> 8;
265	else
266		*rval = val;
267	return (error);
268}
269
270static int
271vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
272{
273	uint64_t origval, val, mask;
274	int error, lhbr;
275	enum vm_reg_name reg;
276
277	vie_calc_bytereg(vie, &reg, &lhbr);
278	error = vm_get_register(vm, vcpuid, reg, &origval);
279	if (error == 0) {
280		val = byte;
281		mask = 0xff;
282		if (lhbr) {
283			/*
284			 * Shift left by 8 to store 'byte' in a legacy high
285			 * byte register.
286			 */
287			val <<= 8;
288			mask <<= 8;
289		}
290		val |= origval & ~mask;
291		error = vm_set_register(vm, vcpuid, reg, val);
292	}
293	return (error);
294}
295
296int
297vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
298		    uint64_t val, int size)
299{
300	int error;
301	uint64_t origval;
302
303	switch (size) {
304	case 1:
305	case 2:
306		error = vie_read_register(vm, vcpuid, reg, &origval);
307		if (error)
308			return (error);
309		val &= size2mask[size];
310		val |= origval & ~size2mask[size];
311		break;
312	case 4:
313		val &= 0xffffffffUL;
314		break;
315	case 8:
316		break;
317	default:
318		return (EINVAL);
319	}
320
321	error = vm_set_register(vm, vcpuid, reg, val);
322	return (error);
323}
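/*
 * Example of vie_update_register() semantics: a 2-byte write of 0x1234 into
 * %rax = 0xffffffffffffffff leaves 0xffffffffffff1234 (the upper bits are
 * preserved), whereas a 4-byte write of the same value yields
 * 0x0000000000001234, matching the x86-64 rule that 32-bit results are
 * zero-extended to 64 bits.
 */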
324
325#define	RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
326
327/*
328 * Return the status flags that would result from doing (x - y).
329 */
330#define	GETCC(sz)							\
331static u_long								\
332getcc##sz(uint##sz##_t x, uint##sz##_t y)				\
333{									\
334	u_long rflags;							\
335									\
336	__asm __volatile("sub %2,%1; pushfq; popq %0" :			\
337	    "=r" (rflags), "+r" (x) : "m" (y));				\
338	return (rflags);						\
339} struct __hack
340
341GETCC(8);
342GETCC(16);
343GETCC(32);
344GETCC(64);
345
346static u_long
347getcc(int opsize, uint64_t x, uint64_t y)
348{
349	KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
350	    ("getcc: invalid operand size %d", opsize));
351
352	if (opsize == 1)
353		return (getcc8(x, y));
354	else if (opsize == 2)
355		return (getcc16(x, y));
356	else if (opsize == 4)
357		return (getcc32(x, y));
358	else
359		return (getcc64(x, y));
360}
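/*
 * For example, getcc(4, 1, 2) performs a 32-bit subtraction of 2 from 1 on
 * the host and returns the resulting RFLAGS: CF and SF are set and ZF is
 * clear.  Callers mask the return value with RFLAGS_STATUS_BITS (or a
 * subset of it) before merging it into the guest's %rflags.
 */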
361
362static int
363emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
364	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
365{
366	int error, size;
367	enum vm_reg_name reg;
368	uint8_t byte;
369	uint64_t val;
370
371	size = vie->opsize;
372	error = EINVAL;
373
374	switch (vie->op.op_byte) {
375	case 0x88:
376		/*
377		 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
378		 * 88/r:	mov r/m8, r8
379		 * REX + 88/r:	mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
380		 */
381		size = 1;	/* override for byte operation */
382		error = vie_read_bytereg(vm, vcpuid, vie, &byte);
383		if (error == 0)
384			error = memwrite(vm, vcpuid, gpa, byte, size, arg);
385		break;
386	case 0x89:
387		/*
388		 * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
389		 * 89/r:	mov r/m16, r16
390		 * 89/r:	mov r/m32, r32
391		 * REX.W + 89/r	mov r/m64, r64
392		 */
393		reg = gpr_map[vie->reg];
394		error = vie_read_register(vm, vcpuid, reg, &val);
395		if (error == 0) {
396			val &= size2mask[size];
397			error = memwrite(vm, vcpuid, gpa, val, size, arg);
398		}
399		break;
400	case 0x8A:
401		/*
402		 * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
403		 * 8A/r:	mov r8, r/m8
404		 * REX + 8A/r:	mov r8, r/m8
405		 */
406		size = 1;	/* override for byte operation */
407		error = memread(vm, vcpuid, gpa, &val, size, arg);
408		if (error == 0)
409			error = vie_write_bytereg(vm, vcpuid, vie, val);
410		break;
411	case 0x8B:
412		/*
413		 * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
414		 * 8B/r:	mov r16, r/m16
415		 * 8B/r:	mov r32, r/m32
416		 * REX.W 8B/r:	mov r64, r/m64
417		 */
418		error = memread(vm, vcpuid, gpa, &val, size, arg);
419		if (error == 0) {
420			reg = gpr_map[vie->reg];
421			error = vie_update_register(vm, vcpuid, reg, val, size);
422		}
423		break;
424	case 0xA1:
425		/*
426		 * MOV from seg:moffset to AX/EAX/RAX
427		 * A1:		mov AX, moffs16
428		 * A1:		mov EAX, moffs32
429		 * REX.W + A1:	mov RAX, moffs64
430		 */
431		error = memread(vm, vcpuid, gpa, &val, size, arg);
432		if (error == 0) {
433			reg = VM_REG_GUEST_RAX;
434			error = vie_update_register(vm, vcpuid, reg, val, size);
435		}
436		break;
437	case 0xA3:
438		/*
439		 * MOV from AX/EAX/RAX to seg:moffset
440		 * A3:		mov moffs16, AX
441		 * A3:		mov moffs32, EAX
442		 * REX.W + A3:	mov moffs64, RAX
443		 */
444		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
445		if (error == 0) {
446			val &= size2mask[size];
447			error = memwrite(vm, vcpuid, gpa, val, size, arg);
448		}
449		break;
450	case 0xC6:
451		/*
452		 * MOV from imm8 to mem (ModRM:r/m)
453		 * C6/0		mov r/m8, imm8
454		 * REX + C6/0	mov r/m8, imm8
455		 */
456		size = 1;	/* override for byte operation */
457		error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
458		break;
459	case 0xC7:
460		/*
461		 * MOV from imm16/imm32 to mem (ModRM:r/m)
462		 * C7/0		mov r/m16, imm16
463		 * C7/0		mov r/m32, imm32
464		 * REX.W + C7/0	mov r/m64, imm32 (sign-extended to 64-bits)
465		 */
466		val = vie->immediate & size2mask[size];
467		error = memwrite(vm, vcpuid, gpa, val, size, arg);
468		break;
469	default:
470		break;
471	}
472
473	return (error);
474}
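/*
 * Decode example (illustrative): the instruction bytes 88 0A encode
 * "mov %cl,(%rdx)".  When such a store hits emulated (MMIO) memory,
 * emulate_mov() takes the 0x88 case, forces the size to 1, reads %cl with
 * vie_read_bytereg() and writes the byte to the faulting guest physical
 * address via memwrite().
 */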
475
476static int
477emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
478	     mem_region_read_t memread, mem_region_write_t memwrite,
479	     void *arg)
480{
481	int error, size;
482	enum vm_reg_name reg;
483	uint64_t val;
484
485	size = vie->opsize;
486	error = EINVAL;
487
488	switch (vie->op.op_byte) {
489	case 0xB6:
490		/*
491		 * MOV and zero extend byte from mem (ModRM:r/m) to
492		 * reg (ModRM:reg).
493		 *
494		 * 0F B6/r		movzx r16, r/m8
495		 * 0F B6/r		movzx r32, r/m8
496		 * REX.W + 0F B6/r	movzx r64, r/m8
497		 */
498
499		/* get the first operand */
500		error = memread(vm, vcpuid, gpa, &val, 1, arg);
501		if (error)
502			break;
503
504		/* get the second operand */
505		reg = gpr_map[vie->reg];
506
507		/* zero-extend byte */
508		val = (uint8_t)val;
509
510		/* write the result */
511		error = vie_update_register(vm, vcpuid, reg, val, size);
512		break;
513	case 0xB7:
514		/*
515		 * MOV and zero extend word from mem (ModRM:r/m) to
516		 * reg (ModRM:reg).
517		 *
518		 * 0F B7/r		movzx r32, r/m16
519		 * REX.W + 0F B7/r	movzx r64, r/m16
520		 */
521		error = memread(vm, vcpuid, gpa, &val, 2, arg);
522		if (error)
523			return (error);
524
525		reg = gpr_map[vie->reg];
526
527		/* zero-extend word */
528		val = (uint16_t)val;
529
530		error = vie_update_register(vm, vcpuid, reg, val, size);
531		break;
532	case 0xBE:
533		/*
534		 * MOV and sign extend byte from mem (ModRM:r/m) to
535		 * reg (ModRM:reg).
536		 *
537		 * 0F BE/r		movsx r16, r/m8
538		 * 0F BE/r		movsx r32, r/m8
539		 * REX.W + 0F BE/r	movsx r64, r/m8
540		 */
541
542		/* get the first operand */
543		error = memread(vm, vcpuid, gpa, &val, 1, arg);
544		if (error)
545			break;
546
547		/* get the second operand */
548		reg = gpr_map[vie->reg];
549
550		/* sign extend byte */
551		val = (int8_t)val;
552
553		/* write the result */
554		error = vie_update_register(vm, vcpuid, reg, val, size);
555		break;
556	default:
557		break;
558	}
559	return (error);
560}
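/*
 * Example: a byte value of 0x80 read from memory is zero-extended to
 * 0x0000000000000080 by the 0F B6 (movzx) case but sign-extended to
 * 0xffffffffffffff80 by the 0F BE (movsx) case before vie_update_register()
 * truncates it to the destination operand size.
 */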
561
562static int
563emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
564	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
565{
566	int error, size;
567	enum vm_reg_name reg;
568	uint64_t result, rflags, rflags2, val1, val2;
569
570	size = vie->opsize;
571	error = EINVAL;
572
573	switch (vie->op.op_byte) {
574	case 0x23:
575		/*
576		 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
577		 * result in reg.
578		 *
579		 * 23/r		and r16, r/m16
580		 * 23/r		and r32, r/m32
581		 * REX.W + 23/r	and r64, r/m64
582		 */
583
584		/* get the first operand */
585		reg = gpr_map[vie->reg];
586		error = vie_read_register(vm, vcpuid, reg, &val1);
587		if (error)
588			break;
589
590		/* get the second operand */
591		error = memread(vm, vcpuid, gpa, &val2, size, arg);
592		if (error)
593			break;
594
595		/* perform the operation and write the result */
596		result = val1 & val2;
597		error = vie_update_register(vm, vcpuid, reg, result, size);
598		break;
599	case 0x81:
600		/*
601		 * AND/OR mem (ModRM:r/m) with immediate and store the
602		 * result in mem.
603		 *
604		 * AND: i = 4
605		 * OR:  i = 1
606		 * 81 /i		op r/m16, imm16
607		 * 81 /i		op r/m32, imm32
608		 * REX.W + 81 /i	op r/m64, imm32 sign-extended to 64
609		 *
610		 */
611
612		/* get the first operand */
613                error = memread(vm, vcpuid, gpa, &val1, size, arg);
614                if (error)
615			break;
616
617                /*
618                 * perform the operation with the pre-fetched immediate
619                 * operand and write the result
620                 */
621		switch (vie->reg & 7) {
622		case 0x4:
623			/* modrm:reg == b100, AND */
624			result = val1 & vie->immediate;
625			break;
626		case 0x1:
627			/* modrm:reg == b001, OR */
628			result = val1 | vie->immediate;
629			break;
630		default:
631			error = EINVAL;
632			break;
633		}
634		if (error)
635			break;
636
637		error = memwrite(vm, vcpuid, gpa, result, size, arg);
638		break;
639	default:
640		break;
641	}
642	if (error)
643		return (error);
644
645	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
646	if (error)
647		return (error);
648
649	/*
650	 * OF and CF are cleared; the SF, ZF and PF flags are set according
651	 * to the result; AF is undefined.
652	 *
653	 * The updated status flags are obtained by subtracting 0 from 'result'.
654	 */
655	rflags2 = getcc(size, result, 0);
656	rflags &= ~RFLAGS_STATUS_BITS;
657	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
658
659	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
660	return (error);
661}
662
663static int
664emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
665	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
666{
667	int error, size;
668	uint64_t val1, result, rflags, rflags2;
669
670	size = vie->opsize;
671	error = EINVAL;
672
673	switch (vie->op.op_byte) {
674	case 0x83:
675		/*
676		 * OR mem (ModRM:r/m) with immediate and store the
677		 * result in mem.
678		 *
679		 * 83 /1		OR r/m16, imm8 sign-extended to 16
680		 * 83 /1		OR r/m32, imm8 sign-extended to 32
681		 * REX.W + 83/1		OR r/m64, imm8 sign-extended to 64
682		 *
683		 * Currently, only the OR operation of the 0x83 opcode
684		 * is implemented (ModRM:reg = b001).
685		 */
686		if ((vie->reg & 7) != 1)
687			break;
688
689		/* get the first operand */
690                error = memread(vm, vcpuid, gpa, &val1, size, arg);
691                if (error)
692			break;
693
694                /*
695		 * perform the operation with the pre-fetched immediate
696		 * operand and write the result
697		 */
698                result = val1 | vie->immediate;
699                error = memwrite(vm, vcpuid, gpa, result, size, arg);
700		break;
701	default:
702		break;
703	}
704	if (error)
705		return (error);
706
707	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
708	if (error)
709		return (error);
710
711	/*
712	 * OF and CF are cleared; the SF, ZF and PF flags are set according
713	 * to the result; AF is undefined.
714	 *
715	 * The updated status flags are obtained by subtracting 0 from 'result'.
716	 */
717	rflags2 = getcc(size, result, 0);
718	rflags &= ~RFLAGS_STATUS_BITS;
719	rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
720
721	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
722	return (error);
723}
724
725static int
726emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
727	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
728{
729	int error, size;
730	uint64_t op1, op2, rflags, rflags2;
731	enum vm_reg_name reg;
732
733	size = vie->opsize;
734	switch (vie->op.op_byte) {
735	case 0x3B:
736		/*
737		 * 3B/r		CMP r16, r/m16
738		 * 3B/r		CMP r32, r/m32
739		 * REX.W + 3B/r	CMP r64, r/m64
740		 *
741		 * Compare first operand (reg) with second operand (r/m) and
742		 * set status flags in EFLAGS register. The comparison is
743		 * performed by subtracting the second operand from the first
744		 * operand and then setting the status flags.
745		 */
746
747		/* Get the first operand */
748		reg = gpr_map[vie->reg];
749		error = vie_read_register(vm, vcpuid, reg, &op1);
750		if (error)
751			return (error);
752
753		/* Get the second operand */
754		error = memread(vm, vcpuid, gpa, &op2, size, arg);
755		if (error)
756			return (error);
757
758		break;
759	default:
760		return (EINVAL);
761	}
762	rflags2 = getcc(size, op1, op2);
763	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
764	if (error)
765		return (error);
766	rflags &= ~RFLAGS_STATUS_BITS;
767	rflags |= rflags2 & RFLAGS_STATUS_BITS;
768
769	error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
770	return (error);
771}
772
773static int
774emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
775	    mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
776{
777	int error, size;
778	uint64_t nval, rflags, rflags2, val1, val2;
779	enum vm_reg_name reg;
780
781	size = vie->opsize;
782	error = EINVAL;
783
784	switch (vie->op.op_byte) {
785	case 0x2B:
786		/*
787		 * SUB r/m from r and store the result in r
788		 *
789		 * 2B/r            SUB r16, r/m16
790		 * 2B/r            SUB r32, r/m32
791		 * REX.W + 2B/r    SUB r64, r/m64
792		 */
793
794		/* get the first operand */
795		reg = gpr_map[vie->reg];
796		error = vie_read_register(vm, vcpuid, reg, &val1);
797		if (error)
798			break;
799
800		/* get the second operand */
801		error = memread(vm, vcpuid, gpa, &val2, size, arg);
802		if (error)
803			break;
804
805		/* perform the operation and write the result */
806		nval = val1 - val2;
807		error = vie_update_register(vm, vcpuid, reg, nval, size);
808		break;
809	default:
810		break;
811	}
812
813	if (!error) {
814		rflags2 = getcc(size, val1, val2);
815		error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
816		    &rflags);
817		if (error)
818			return (error);
819
820		rflags &= ~RFLAGS_STATUS_BITS;
821		rflags |= rflags2 & RFLAGS_STATUS_BITS;
822		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
823		    rflags, 8);
824	}
825
826	return (error);
827}
828
829static int
830emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
831    struct vm_guest_paging *paging, mem_region_read_t memread,
832    mem_region_write_t memwrite, void *arg)
833{
834#ifdef _KERNEL
835	struct vm_copyinfo copyinfo[2];
836#else
837	struct iovec copyinfo[2];
838#endif
839	struct seg_desc ss_desc;
840	uint64_t cr0, rflags, rsp, stack_gla, val;
841	int error, size, stackaddrsize, pushop;
842
843	val = 0;
844	size = vie->opsize;
845	pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
846
847	/*
848	 * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
849	 */
850	if (paging->cpu_mode == CPU_MODE_REAL) {
851		stackaddrsize = 2;
852	} else if (paging->cpu_mode == CPU_MODE_64BIT) {
853		/*
854		 * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
855		 * - Stack pointer size is always 64-bits.
856		 * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
857		 * - 16-bit PUSH/POP is supported by using the operand size
858		 *   override prefix (66H).
859		 */
860		stackaddrsize = 8;
861		size = vie->opsize_override ? 2 : 8;
862	} else {
863		/*
864		 * In protected or compatibility mode the 'B' flag in the
865		 * stack-segment descriptor determines the size of the
866		 * stack pointer.
867		 */
868		error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
869		KASSERT(error == 0, ("%s: error %d getting SS descriptor",
870		    __func__, error));
871		if (SEG_DESC_DEF32(ss_desc.access))
872			stackaddrsize = 4;
873		else
874			stackaddrsize = 2;
875	}
876
877	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
878	KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
879
880	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
881	KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
882
883	error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
884	KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
885	if (pushop) {
886		rsp -= size;
887	}
888
889	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
890	    rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
891	    &stack_gla)) {
892		vm_inject_ss(vm, vcpuid, 0);
893		return (0);
894	}
895
896	if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
897		vm_inject_ss(vm, vcpuid, 0);
898		return (0);
899	}
900
901	if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
902		vm_inject_ac(vm, vcpuid, 0);
903		return (0);
904	}
905
906	error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
907	    pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo));
908	if (error == -1) {
909		/*
910		 * XXX cannot return a negative error value here because it
911		 * ends up being the return value of the VM_RUN() ioctl and
912		 * is interpreted as a pseudo-error (e.g. ERESTART).
913		 */
914		return (EFAULT);
915	} else if (error == 1) {
916		/* Resume guest execution to handle page fault */
917		return (0);
918	}
919
920	if (pushop) {
921		error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
922		if (error == 0)
923			vm_copyout(vm, vcpuid, &val, copyinfo, size);
924	} else {
925		vm_copyin(vm, vcpuid, copyinfo, &val, size);
926		error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
927		rsp += size;
928	}
929#ifdef _KERNEL
930	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
931#endif
932
933	if (error == 0) {
934		error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
935		    stackaddrsize);
936		KASSERT(error == 0, ("error %d updating rsp", error));
937	}
938	return (error);
939}
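/*
 * Illustrative PUSH example: a 64-bit guest pushing from an MMIO location
 * uses an operand size of 8 (or 2 with a 66h prefix), so %rsp is first
 * decremented by that size, the value is read from the MMIO address with
 * memread() and copied onto the guest stack, and finally %rsp is written
 * back using the full 8-byte stack address size.
 */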
940
941static int
942emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
943    struct vm_guest_paging *paging, mem_region_read_t memread,
944    mem_region_write_t memwrite, void *arg)
945{
946	int error;
947
948	/*
949	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
950	 *
951	 * PUSH is part of the group 5 extended opcodes and is identified
952	 * by ModRM:reg = b110.
953	 */
954	if ((vie->reg & 7) != 6)
955		return (EINVAL);
956
957	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
958	    memwrite, arg);
959	return (error);
960}
961
962static int
963emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
964    struct vm_guest_paging *paging, mem_region_read_t memread,
965    mem_region_write_t memwrite, void *arg)
966{
967	int error;
968
969	/*
970	 * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
971	 *
972	 * POP is part of the group 1A extended opcodes and is identified
973	 * by ModRM:reg = b000.
974	 */
975	if ((vie->reg & 7) != 0)
976		return (EINVAL);
977
978	error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
979	    memwrite, arg);
980	return (error);
981}
982
983int
984vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
985    struct vm_guest_paging *paging, mem_region_read_t memread,
986    mem_region_write_t memwrite, void *memarg)
987{
988	int error;
989
990	if (!vie->decoded)
991		return (EINVAL);
992
993	switch (vie->op.op_type) {
994	case VIE_OP_TYPE_POP:
995		error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
996		    memwrite, memarg);
997		break;
998	case VIE_OP_TYPE_PUSH:
999		error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
1000		    memwrite, memarg);
1001		break;
1002	case VIE_OP_TYPE_CMP:
1003		error = emulate_cmp(vm, vcpuid, gpa, vie,
1004				    memread, memwrite, memarg);
1005		break;
1006	case VIE_OP_TYPE_MOV:
1007		error = emulate_mov(vm, vcpuid, gpa, vie,
1008				    memread, memwrite, memarg);
1009		break;
1010	case VIE_OP_TYPE_MOVSX:
1011	case VIE_OP_TYPE_MOVZX:
1012		error = emulate_movx(vm, vcpuid, gpa, vie,
1013				     memread, memwrite, memarg);
1014		break;
1015	case VIE_OP_TYPE_AND:
1016		error = emulate_and(vm, vcpuid, gpa, vie,
1017				    memread, memwrite, memarg);
1018		break;
1019	case VIE_OP_TYPE_OR:
1020		error = emulate_or(vm, vcpuid, gpa, vie,
1021				    memread, memwrite, memarg);
1022		break;
1023	case VIE_OP_TYPE_SUB:
1024		error = emulate_sub(vm, vcpuid, gpa, vie,
1025				    memread, memwrite, memarg);
1026		break;
1027	default:
1028		error = EINVAL;
1029		break;
1030	}
1031
1032	return (error);
1033}
1034
1035int
1036vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
1037{
1038	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1039	    ("%s: invalid size %d", __func__, size));
1040	KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
1041
1042	if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
1043		return (0);
1044
1045	return ((gla & (size - 1)) ? 1 : 0);
1046}
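/*
 * Example: alignment checking is only armed when CPL is 3 and both CR0.AM
 * and RFLAGS.AC are set; in that case a 4-byte access to linear address
 * 0x1002 returns 1 (misaligned), while any access made at CPL < 3 returns 0.
 */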
1047
1048int
1049vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
1050{
1051	uint64_t mask;
1052
1053	if (cpu_mode != CPU_MODE_64BIT)
1054		return (0);
1055
1056	/*
1057	 * The value of bit 47 in the 'gla' should be replicated in the
1058	 * most significant 16 bits.
1059	 */
1060	mask = ~((1UL << 48) - 1);
1061	if (gla & (1UL << 47))
1062		return ((gla & mask) != mask);
1063	else
1064		return ((gla & mask) != 0);
1065}
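/*
 * Example: in 64-bit mode 0x00007fffffffffff and 0xffff800000000000 are
 * canonical (bits 63:48 replicate bit 47) and return 0, while
 * 0x0000800000000000 has bit 47 set with bits 63:48 clear and therefore
 * returns non-zero.
 */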
1066
1067uint64_t
1068vie_size2mask(int size)
1069{
1070	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
1071	    ("vie_size2mask: invalid size %d", size));
1072	return (size2mask[size]);
1073}
1074
1075int
1076vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
1077    struct seg_desc *desc, uint64_t offset, int length, int addrsize,
1078    int prot, uint64_t *gla)
1079{
1080	uint64_t firstoff, low_limit, high_limit, segbase;
1081	int glasize, type;
1082
1083	KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
1084	    ("%s: invalid segment %d", __func__, seg));
1085	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
1086	    ("%s: invalid operand size %d", __func__, length));
1087	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
1088	    ("%s: invalid prot %#x", __func__, prot));
1089
1090	firstoff = offset;
1091	if (cpu_mode == CPU_MODE_64BIT) {
1092		KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
1093		    "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
1094		glasize = 8;
1095	} else {
1096		KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
1097		    "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
1098		glasize = 4;
1099		/*
1100		 * If the segment selector is loaded with a NULL selector
1101		 * then the descriptor is unusable and attempting to use
1102		 * it results in a #GP(0).
1103		 */
1104		if (SEG_DESC_UNUSABLE(desc->access))
1105			return (-1);
1106
1107		/*
1108		 * The processor generates a #NP exception when a segment
1109		 * register is loaded with a selector that points to a
1110		 * descriptor that is not present. If this was the case then
1111		 * it would have been checked before the VM-exit.
1112		 */
1113		KASSERT(SEG_DESC_PRESENT(desc->access),
1114		    ("segment %d not present: %#x", seg, desc->access));
1115
1116		/*
1117		 * The descriptor type must indicate a code/data segment.
1118		 */
1119		type = SEG_DESC_TYPE(desc->access);
1120		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
1121		    "descriptor type %#x", seg, type));
1122
1123		if (prot & PROT_READ) {
1124			/* #GP on a read access to an exec-only code segment */
1125			if ((type & 0xA) == 0x8)
1126				return (-1);
1127		}
1128
1129		if (prot & PROT_WRITE) {
1130			/*
1131			 * #GP on a write access to a code segment or a
1132			 * read-only data segment.
1133			 */
1134			if (type & 0x8)			/* code segment */
1135				return (-1);
1136
1137			if ((type & 0xA) == 0)		/* read-only data seg */
1138				return (-1);
1139		}
1140
1141		/*
1142		 * 'desc->limit' is fully expanded taking granularity into
1143		 * account.
1144		 */
1145		if ((type & 0xC) == 0x4) {
1146			/* expand-down data segment */
1147			low_limit = desc->limit + 1;
1148			high_limit = SEG_DESC_DEF32(desc->access) ?
1149			    0xffffffff : 0xffff;
1150		} else {
1151			/* code segment or expand-up data segment */
1152			low_limit = 0;
1153			high_limit = desc->limit;
1154		}
1155
1156		while (length > 0) {
1157			offset &= vie_size2mask(addrsize);
1158			if (offset < low_limit || offset > high_limit)
1159				return (-1);
1160			offset++;
1161			length--;
1162		}
1163	}
1164
1165	/*
1166	 * In 64-bit mode all segments except %fs and %gs have a segment
1167	 * base address of 0.
1168	 */
1169	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
1170	    seg != VM_REG_GUEST_GS) {
1171		segbase = 0;
1172	} else {
1173		segbase = desc->base;
1174	}
1175
1176	/*
1177	 * Truncate 'firstoff' to the effective address size before adding
1178	 * it to the segment base.
1179	 */
1180	firstoff &= vie_size2mask(addrsize);
1181	*gla = (segbase + firstoff) & vie_size2mask(glasize);
1182	return (0);
1183}
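/*
 * Example (assuming the limit checks above pass): in 32-bit protected mode
 * a data segment with base 0x10000 and an effective address of 0x2000
 * yields a gla of 0x12000, truncated to 32 bits.  In 64-bit mode only %fs
 * and %gs contribute a non-zero segment base, so for the other segments the
 * gla is simply the offset truncated to the effective address size.
 */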
1184
1185#ifdef _KERNEL
1186void
1187vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
1188{
1189	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
1190	    ("%s: invalid instruction length (%d)", __func__, inst_length));
1191
1192	bzero(vie, sizeof(struct vie));
1193
1194	vie->base_register = VM_REG_LAST;
1195	vie->index_register = VM_REG_LAST;
1196
1197	if (inst_length) {
1198		bcopy(inst_bytes, vie->inst, inst_length);
1199		vie->num_valid = inst_length;
1200	}
1201}
1202
1203static int
1204pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
1205{
1206	int error_code = 0;
1207
1208	if (pte & PG_V)
1209		error_code |= PGEX_P;
1210	if (prot & VM_PROT_WRITE)
1211		error_code |= PGEX_W;
1212	if (usermode)
1213		error_code |= PGEX_U;
1214	if (rsvd)
1215		error_code |= PGEX_RSV;
1216	if (prot & VM_PROT_EXECUTE)
1217		error_code |= PGEX_I;
1218
1219	return (error_code);
1220}
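/*
 * Example: a user-mode write that faults on a present but read-only mapping
 * produces PGEX_P | PGEX_W | PGEX_U, i.e. the architectural error code 0x7
 * that would be pushed for such a #PF.
 */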
1221
1222static void
1223ptp_release(void **cookie)
1224{
1225	if (*cookie != NULL) {
1226		vm_gpa_release(*cookie);
1227		*cookie = NULL;
1228	}
1229}
1230
1231static void *
1232ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
1233{
1234	void *ptr;
1235
1236	ptp_release(cookie);
1237	ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
1238	return (ptr);
1239}
1240
1241int
1242vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1243    uint64_t gla, int prot, uint64_t *gpa)
1244{
1245	int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
1246	u_int retries;
1247	uint64_t *ptpbase, ptpphys, pte, pgsize;
1248	uint32_t *ptpbase32, pte32;
1249	void *cookie;
1250
1251	usermode = (paging->cpl == 3 ? 1 : 0);
1252	writable = prot & VM_PROT_WRITE;
1253	cookie = NULL;
1254	retval = 0;
1255	retries = 0;
1256restart:
1257	ptpphys = paging->cr3;		/* root of the page tables */
1258	ptp_release(&cookie);
1259	if (retries++ > 0)
1260		maybe_yield();
1261
1262	if (vie_canonical_check(paging->cpu_mode, gla)) {
1263		/*
1264		 * XXX assuming a non-stack reference; otherwise a stack fault
1265		 * should be generated.
1266		 */
1267		vm_inject_gp(vm, vcpuid);
1268		goto fault;
1269	}
1270
1271	if (paging->paging_mode == PAGING_MODE_FLAT) {
1272		*gpa = gla;
1273		goto done;
1274	}
1275
1276	if (paging->paging_mode == PAGING_MODE_32) {
1277		nlevels = 2;
1278		while (--nlevels >= 0) {
1279			/* Zero out the lower 12 bits. */
1280			ptpphys &= ~0xfff;
1281
1282			ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1283
1284			if (ptpbase32 == NULL)
1285				goto error;
1286
1287			ptpshift = PAGE_SHIFT + nlevels * 10;
1288			ptpindex = (gla >> ptpshift) & 0x3FF;
1289			pgsize = 1UL << ptpshift;
1290
1291			pte32 = ptpbase32[ptpindex];
1292
1293			if ((pte32 & PG_V) == 0 ||
1294			    (usermode && (pte32 & PG_U) == 0) ||
1295			    (writable && (pte32 & PG_RW) == 0)) {
1296				pfcode = pf_error_code(usermode, prot, 0,
1297				    pte32);
1298				vm_inject_pf(vm, vcpuid, pfcode, gla);
1299				goto fault;
1300			}
1301
1302			/*
1303			 * Emulate the x86 MMU's management of the accessed
1304			 * and dirty flags. While the accessed flag is set
1305			 * at every level of the page table, the dirty flag
1306			 * is only set at the last level providing the guest
1307			 * physical address.
1308			 */
1309			if ((pte32 & PG_A) == 0) {
1310				if (atomic_cmpset_32(&ptpbase32[ptpindex],
1311				    pte32, pte32 | PG_A) == 0) {
1312					goto restart;
1313				}
1314			}
1315
1316			/* XXX must be ignored if CR4.PSE=0 */
1317			if (nlevels > 0 && (pte32 & PG_PS) != 0)
1318				break;
1319
1320			ptpphys = pte32;
1321		}
1322
1323		/* Set the dirty bit in the page table entry if necessary */
1324		if (writable && (pte32 & PG_M) == 0) {
1325			if (atomic_cmpset_32(&ptpbase32[ptpindex],
1326			    pte32, pte32 | PG_M) == 0) {
1327				goto restart;
1328			}
1329		}
1330
1331		/* Zero out the lower 'ptpshift' bits */
1332		pte32 >>= ptpshift; pte32 <<= ptpshift;
1333		*gpa = pte32 | (gla & (pgsize - 1));
1334		goto done;
1335	}
1336
1337	if (paging->paging_mode == PAGING_MODE_PAE) {
1338		/* Zero out the lower 5 bits and the upper 32 bits */
1339		ptpphys &= 0xffffffe0UL;
1340
1341		ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
1342		if (ptpbase == NULL)
1343			goto error;
1344
1345		ptpindex = (gla >> 30) & 0x3;
1346
1347		pte = ptpbase[ptpindex];
1348
1349		if ((pte & PG_V) == 0) {
1350			pfcode = pf_error_code(usermode, prot, 0, pte);
1351			vm_inject_pf(vm, vcpuid, pfcode, gla);
1352			goto fault;
1353		}
1354
1355		ptpphys = pte;
1356
1357		nlevels = 2;
1358	} else
1359		nlevels = 4;
1360	while (--nlevels >= 0) {
1361		/* Zero out the lower 12 bits and the upper 12 bits */
1362		ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
1363
1364		ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
1365		if (ptpbase == NULL)
1366			goto error;
1367
1368		ptpshift = PAGE_SHIFT + nlevels * 9;
1369		ptpindex = (gla >> ptpshift) & 0x1FF;
1370		pgsize = 1UL << ptpshift;
1371
1372		pte = ptpbase[ptpindex];
1373
1374		if ((pte & PG_V) == 0 ||
1375		    (usermode && (pte & PG_U) == 0) ||
1376		    (writable && (pte & PG_RW) == 0)) {
1377			pfcode = pf_error_code(usermode, prot, 0, pte);
1378			vm_inject_pf(vm, vcpuid, pfcode, gla);
1379			goto fault;
1380		}
1381
1382		/* Set the accessed bit in the page table entry */
1383		if ((pte & PG_A) == 0) {
1384			if (atomic_cmpset_64(&ptpbase[ptpindex],
1385			    pte, pte | PG_A) == 0) {
1386				goto restart;
1387			}
1388		}
1389
1390		if (nlevels > 0 && (pte & PG_PS) != 0) {
1391			if (pgsize > 1 * GB) {
1392				pfcode = pf_error_code(usermode, prot, 1, pte);
1393				vm_inject_pf(vm, vcpuid, pfcode, gla);
1394				goto fault;
1395			}
1396			break;
1397		}
1398
1399		ptpphys = pte;
1400	}
1401
1402	/* Set the dirty bit in the page table entry if necessary */
1403	if (writable && (pte & PG_M) == 0) {
1404		if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
1405			goto restart;
1406	}
1407
1408	/* Zero out the lower 'ptpshift' bits and the upper 12 bits */
1409	pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
1410	*gpa = pte | (gla & (pgsize - 1));
1411done:
1412	ptp_release(&cookie);
1413	return (retval);
1414error:
1415	retval = -1;
1416	goto done;
1417fault:
1418	retval = 1;
1419	goto done;
1420}
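/*
 * Return value convention for vmm_gla2gpa(): 0 means the translation
 * succeeded and '*gpa' is valid, 1 means an exception (#PF or #GP) was
 * injected into the guest and the caller should simply resume it, and -1
 * means a page-table page could not be accessed.
 */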
1421
1422int
1423vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
1424    uint64_t rip, int inst_length, struct vie *vie)
1425{
1426	struct vm_copyinfo copyinfo[2];
1427	int error, prot;
1428
1429	if (inst_length > VIE_INST_SIZE)
1430		panic("vmm_fetch_instruction: invalid length %d", inst_length);
1431
1432	prot = PROT_READ | PROT_EXEC;
1433	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
1434	    copyinfo, nitems(copyinfo));
1435	if (error == 0) {
1436		vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
1437		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
1438		vie->num_valid = inst_length;
1439	}
1440	return (error);
1441}
1442
1443static int
1444vie_peek(struct vie *vie, uint8_t *x)
1445{
1446
1447	if (vie->num_processed < vie->num_valid) {
1448		*x = vie->inst[vie->num_processed];
1449		return (0);
1450	} else
1451		return (-1);
1452}
1453
1454static void
1455vie_advance(struct vie *vie)
1456{
1457
1458	vie->num_processed++;
1459}
1460
1461static int
1462decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
1463{
1464	uint8_t x;
1465
1466	while (1) {
1467		if (vie_peek(vie, &x))
1468			return (-1);
1469
1470		if (x == 0x66)
1471			vie->opsize_override = 1;
1472		else if (x == 0x67)
1473			vie->addrsize_override = 1;
1474		else
1475			break;
1476
1477		vie_advance(vie);
1478	}
1479
1480	/*
1481	 * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
1482	 * - Only one REX prefix is allowed per instruction.
1483	 * - The REX prefix must immediately precede the opcode byte or the
1484	 *   escape opcode byte.
1485	 * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
1486	 *   the mandatory prefix must come before the REX prefix.
1487	 */
1488	if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
1489		vie->rex_present = 1;
1490		vie->rex_w = x & 0x8 ? 1 : 0;
1491		vie->rex_r = x & 0x4 ? 1 : 0;
1492		vie->rex_x = x & 0x2 ? 1 : 0;
1493		vie->rex_b = x & 0x1 ? 1 : 0;
1494		vie_advance(vie);
1495	}
1496
1497	/*
1498	 * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
1499	 */
1500	if (cpu_mode == CPU_MODE_64BIT) {
1501		/*
1502		 * Default address size is 64-bits and default operand size
1503		 * is 32-bits.
1504		 */
1505		vie->addrsize = vie->addrsize_override ? 4 : 8;
1506		if (vie->rex_w)
1507			vie->opsize = 8;
1508		else if (vie->opsize_override)
1509			vie->opsize = 2;
1510		else
1511			vie->opsize = 4;
1512	} else if (cs_d) {
1513		/* Default address and operand sizes are 32-bits */
1514		vie->addrsize = vie->addrsize_override ? 2 : 4;
1515		vie->opsize = vie->opsize_override ? 2 : 4;
1516	} else {
1517		/* Default address and operand sizes are 16-bits */
1518		vie->addrsize = vie->addrsize_override ? 4 : 2;
1519		vie->opsize = vie->opsize_override ? 4 : 2;
1520	}
1521	return (0);
1522}
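/*
 * Prefix example (illustrative): in 64-bit mode a lone 66 prefix selects a
 * 2-byte operand size, a REX.W prefix (40-4F with bit 3 set) selects an
 * 8-byte operand size, and when both appear (e.g. 66 48 ...) REX.W wins and
 * the operand size is 8 bytes; the address size stays 8 unless a 67 prefix
 * is present.
 */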
1523
1524static int
1525decode_two_byte_opcode(struct vie *vie)
1526{
1527	uint8_t x;
1528
1529	if (vie_peek(vie, &x))
1530		return (-1);
1531
1532	vie->op = two_byte_opcodes[x];
1533
1534	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1535		return (-1);
1536
1537	vie_advance(vie);
1538	return (0);
1539}
1540
1541static int
1542decode_opcode(struct vie *vie)
1543{
1544	uint8_t x;
1545
1546	if (vie_peek(vie, &x))
1547		return (-1);
1548
1549	vie->op = one_byte_opcodes[x];
1550
1551	if (vie->op.op_type == VIE_OP_TYPE_NONE)
1552		return (-1);
1553
1554	vie_advance(vie);
1555
1556	if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
1557		return (decode_two_byte_opcode(vie));
1558
1559	return (0);
1560}
1561
1562static int
1563decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
1564{
1565	uint8_t x;
1566
1567	if (cpu_mode == CPU_MODE_REAL)
1568		return (-1);
1569
1570	if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
1571		return (0);
1572
1573	if (vie_peek(vie, &x))
1574		return (-1);
1575
1576	vie->mod = (x >> 6) & 0x3;
1577	vie->rm =  (x >> 0) & 0x7;
1578	vie->reg = (x >> 3) & 0x7;
1579
1580	/*
1581	 * A direct addressing mode makes no sense in the context of an EPT
1582	 * fault. There has to be a memory access involved to cause the
1583	 * EPT fault.
1584	 */
1585	if (vie->mod == VIE_MOD_DIRECT)
1586		return (-1);
1587
1588	if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
1589	    (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
1590		/*
1591		 * Table 2-5: Special Cases of REX Encodings
1592		 *
1593		 * mod=0, r/m=5 is used in the compatibility mode to
1594		 * indicate a disp32 without a base register.
1595		 *
1596		 * mod!=3, r/m=4 is used in the compatibility mode to
1597		 * indicate that the SIB byte is present.
1598		 *
1599		 * The 'b' bit in the REX prefix is don't care in
1600		 * this case.
1601		 */
1602	} else {
1603		vie->rm |= (vie->rex_b << 3);
1604	}
1605
1606	vie->reg |= (vie->rex_r << 3);
1607
1608	/* SIB */
1609	if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
1610		goto done;
1611
1612	vie->base_register = gpr_map[vie->rm];
1613
1614	switch (vie->mod) {
1615	case VIE_MOD_INDIRECT_DISP8:
1616		vie->disp_bytes = 1;
1617		break;
1618	case VIE_MOD_INDIRECT_DISP32:
1619		vie->disp_bytes = 4;
1620		break;
1621	case VIE_MOD_INDIRECT:
1622		if (vie->rm == VIE_RM_DISP32) {
1623			vie->disp_bytes = 4;
1624			/*
1625			 * Table 2-7. RIP-Relative Addressing
1626			 *
1627			 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
1628			 * whereas in compatibility mode it just implies disp32.
1629			 */
1630
1631			if (cpu_mode == CPU_MODE_64BIT)
1632				vie->base_register = VM_REG_GUEST_RIP;
1633			else
1634				vie->base_register = VM_REG_LAST;
1635		}
1636		break;
1637	}
1638
1639done:
1640	vie_advance(vie);
1641
1642	return (0);
1643}
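/*
 * ModRM example (illustrative, no REX prefix): the bytes 89 88 44 33 22 11
 * encode "mov %ecx,0x11223344(%rax)".  The ModRM byte 0x88 decodes to
 * mod = 2 (VIE_MOD_INDIRECT_DISP32), reg = 1 (%rcx/%ecx) and rm = 0, so the
 * base register is %rax and disp_bytes is set to 4 for the trailing
 * little-endian displacement.
 */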
1644
1645static int
1646decode_sib(struct vie *vie)
1647{
1648	uint8_t x;
1649
1650	/* Proceed only if SIB byte is present */
1651	if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
1652		return (0);
1653
1654	if (vie_peek(vie, &x))
1655		return (-1);
1656
1657	/* De-construct the SIB byte */
1658	vie->ss = (x >> 6) & 0x3;
1659	vie->index = (x >> 3) & 0x7;
1660	vie->base = (x >> 0) & 0x7;
1661
1662	/* Apply the REX prefix modifiers */
1663	vie->index |= vie->rex_x << 3;
1664	vie->base |= vie->rex_b << 3;
1665
1666	switch (vie->mod) {
1667	case VIE_MOD_INDIRECT_DISP8:
1668		vie->disp_bytes = 1;
1669		break;
1670	case VIE_MOD_INDIRECT_DISP32:
1671		vie->disp_bytes = 4;
1672		break;
1673	}
1674
1675	if (vie->mod == VIE_MOD_INDIRECT &&
1676	    (vie->base == 5 || vie->base == 13)) {
1677		/*
1678		 * Special case when base register is unused if mod = 0
1679		 * and base = %rbp or %r13.
1680		 *
1681		 * Documented in:
1682		 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1683		 * Table 2-5: Special Cases of REX Encodings
1684		 */
1685		vie->disp_bytes = 4;
1686	} else {
1687		vie->base_register = gpr_map[vie->base];
1688	}
1689
1690	/*
1691	 * All encodings of 'index' are valid except for %rsp (4).
1692	 *
1693	 * Documented in:
1694	 * Table 2-3: 32-bit Addressing Forms with the SIB Byte
1695	 * Table 2-5: Special Cases of REX Encodings
1696	 */
1697	if (vie->index != 4)
1698		vie->index_register = gpr_map[vie->index];
1699
1700	/* 'scale' makes sense only in the context of an index register */
1701	if (vie->index_register < VM_REG_LAST)
1702		vie->scale = 1 << vie->ss;
1703
1704	vie_advance(vie);
1705
1706	return (0);
1707}
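/*
 * SIB example (illustrative): with mod = 2 a SIB byte of 0x98 decodes to
 * ss = 2 (scale 4), index = 3 (%rbx) and base = 0 (%rax), giving an
 * effective address of %rax + %rbx * 4 + disp32.  An index field of 4 means
 * "no index register", which is why index_register is left at VM_REG_LAST
 * in that case.
 */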
1708
1709static int
1710decode_displacement(struct vie *vie)
1711{
1712	int n, i;
1713	uint8_t x;
1714
1715	union {
1716		char	buf[4];
1717		int8_t	signed8;
1718		int32_t	signed32;
1719	} u;
1720
1721	if ((n = vie->disp_bytes) == 0)
1722		return (0);
1723
1724	if (n != 1 && n != 4)
1725		panic("decode_displacement: invalid disp_bytes %d", n);
1726
1727	for (i = 0; i < n; i++) {
1728		if (vie_peek(vie, &x))
1729			return (-1);
1730
1731		u.buf[i] = x;
1732		vie_advance(vie);
1733	}
1734
1735	if (n == 1)
1736		vie->displacement = u.signed8;		/* sign-extended */
1737	else
1738		vie->displacement = u.signed32;		/* sign-extended */
1739
1740	return (0);
1741}
1742
1743static int
1744decode_immediate(struct vie *vie)
1745{
1746	int i, n;
1747	uint8_t x;
1748	union {
1749		char	buf[4];
1750		int8_t	signed8;
1751		int16_t	signed16;
1752		int32_t	signed32;
1753	} u;
1754
1755	/* Figure out immediate operand size (if any) */
1756	if (vie->op.op_flags & VIE_OP_F_IMM) {
1757		/*
1758		 * Section 2.2.1.5 "Immediates", Intel SDM:
1759		 * In 64-bit mode the typical size of immediate operands
1760		 * remains 32-bits. When the operand size is 64-bits, the
1761		 * processor sign-extends all immediates to 64-bits prior
1762		 * to their use.
1763		 */
1764		if (vie->opsize == 4 || vie->opsize == 8)
1765			vie->imm_bytes = 4;
1766		else
1767			vie->imm_bytes = 2;
1768	} else if (vie->op.op_flags & VIE_OP_F_IMM8) {
1769		vie->imm_bytes = 1;
1770	}
1771
1772	if ((n = vie->imm_bytes) == 0)
1773		return (0);
1774
1775	KASSERT(n == 1 || n == 2 || n == 4,
1776	    ("%s: invalid number of immediate bytes: %d", __func__, n));
1777
1778	for (i = 0; i < n; i++) {
1779		if (vie_peek(vie, &x))
1780			return (-1);
1781
1782		u.buf[i] = x;
1783		vie_advance(vie);
1784	}
1785
1786	/* sign-extend the immediate value before use */
1787	if (n == 1)
1788		vie->immediate = u.signed8;
1789	else if (n == 2)
1790		vie->immediate = u.signed16;
1791	else
1792		vie->immediate = u.signed32;
1793
1794	return (0);
1795}
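/*
 * Example: opcode 0x83 carries a VIE_OP_F_IMM8 immediate, so a trailing
 * byte of 0xf0 is sign-extended and stored as -16 in 'immediate'; opcodes
 * flagged VIE_OP_F_IMM fetch 2 or 4 immediate bytes depending on the
 * operand size.
 */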
1796
1797static int
1798decode_moffset(struct vie *vie)
1799{
1800	int i, n;
1801	uint8_t x;
1802	union {
1803		char	buf[8];
1804		uint64_t u64;
1805	} u;
1806
1807	if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
1808		return (0);
1809
1810	/*
1811	 * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
1812	 * The memory offset size follows the address-size of the instruction.
1813	 */
1814	n = vie->addrsize;
1815	KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
1816
1817	u.u64 = 0;
1818	for (i = 0; i < n; i++) {
1819		if (vie_peek(vie, &x))
1820			return (-1);
1821
1822		u.buf[i] = x;
1823		vie_advance(vie);
1824	}
1825	vie->displacement = u.u64;
1826	return (0);
1827}
1828
1829/*
1830 * Verify that all the bytes in the instruction buffer were consumed.
1831 */
1832static int
1833verify_inst_length(struct vie *vie)
1834{
1835
1836	if (vie->num_processed)
1837		return (0);
1838	else
1839		return (-1);
1840}
1841
1842/*
1843 * Verify that the 'guest linear address' provided as collateral of the nested
1844 * page table fault matches our instruction decoding.
1845 */
1846static int
1847verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
1848{
1849	int error;
1850	uint64_t base, idx, gla2;
1851
1852	/* Skip 'gla' verification */
1853	if (gla == VIE_INVALID_GLA)
1854		return (0);
1855
1856	base = 0;
1857	if (vie->base_register != VM_REG_LAST) {
1858		error = vm_get_register(vm, cpuid, vie->base_register, &base);
1859		if (error) {
1860			printf("verify_gla: error %d getting base reg %d\n",
1861				error, vie->base_register);
1862			return (-1);
1863		}
1864
1865		/*
1866		 * RIP-relative addressing starts from the following
1867		 * instruction
1868		 */
1869		if (vie->base_register == VM_REG_GUEST_RIP)
1870			base += vie->num_valid;
1871	}
1872
1873	idx = 0;
1874	if (vie->index_register != VM_REG_LAST) {
1875		error = vm_get_register(vm, cpuid, vie->index_register, &idx);
1876		if (error) {
1877			printf("verify_gla: error %d getting index reg %d\n",
1878				error, vie->index_register);
1879			return (-1);
1880		}
1881	}
1882
1883	/* XXX assuming that the base address of the segment is 0 */
1884	gla2 = base + vie->scale * idx + vie->displacement;
1885	gla2 &= size2mask[vie->addrsize];
1886	if (gla != gla2) {
1887		printf("verify_gla mismatch: "
1888		       "base(0x%0lx), scale(%d), index(0x%0lx), "
1889		       "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
1890		       base, vie->scale, idx, vie->displacement, gla, gla2);
1891		return (-1);
1892	}
1893
1894	return (0);
1895}
1896
1897int
1898vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
1899		       enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
1900{
1901
1902	if (decode_prefixes(vie, cpu_mode, cs_d))
1903		return (-1);
1904
1905	if (decode_opcode(vie))
1906		return (-1);
1907
1908	if (decode_modrm(vie, cpu_mode))
1909		return (-1);
1910
1911	if (decode_sib(vie))
1912		return (-1);
1913
1914	if (decode_displacement(vie))
1915		return (-1);
1916
1917	if (decode_immediate(vie))
1918		return (-1);
1919
1920	if (decode_moffset(vie))
1921		return (-1);
1922
1923	if (verify_inst_length(vie))
1924		return (-1);
1925
1926	if (verify_gla(vm, cpuid, gla, vie))
1927		return (-1);
1928
1929	vie->decoded = 1;	/* success */
1930
1931	return (0);
1932}
1933#endif	/* _KERNEL */
1934