/*-
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
26268777Sneel
27268777Sneel#include <sys/cdefs.h>
28268777Sneel__FBSDID("$FreeBSD$");
29268777Sneel
30268777Sneel#include <sys/param.h>
31268777Sneel#include <sys/_iovec.h>
32268777Sneel#include <sys/mman.h>
33268777Sneel
34268777Sneel#include <x86/psl.h>
35268777Sneel#include <x86/segments.h>
36268777Sneel#include <x86/specialreg.h>
37268777Sneel#include <machine/vmm.h>
38268777Sneel#include <machine/vmm_instruction_emul.h>
39268777Sneel
40268777Sneel#include <stdbool.h>
41268777Sneel#include <stdio.h>
42268777Sneel#include <stdlib.h>
43268777Sneel#include <assert.h>
44268777Sneel#include <errno.h>
45268777Sneel
46268777Sneel#include <vmmapi.h>
47268777Sneel
48268777Sneel#include "bhyverun.h"
49268777Sneel
/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;	/* previous task's TSS selector */
	uint16_t	rsvd1;
	/* Stack pointer/segment for privilege levels 0-2 */
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;	/* page table base */
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	/* General purpose registers */
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	/* Segment selectors */
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;	/* LDT selector */
	uint16_t	rsvd11;
	uint16_t	tss_trap;	/* debug trap flag (bit 0) */
	uint16_t	tss_iomap;	/* I/O permission bitmap offset */
};
/* The architectural 32-bit TSS is exactly 104 bytes */
CTASSERT(sizeof(struct tss32) == 104);
95268777Sneel
/*
 * A selector's low 3 bits hold RPL and the table indicator.  Masking them
 * off yields the byte offset of the descriptor within its table; OR'ing
 * them in yields the offset of the descriptor's last byte (for limit checks).
 */
#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
/* Bit 1 of a TSS descriptor type distinguishes busy from available */
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
99268777Sneel
/*
 * Fetch the value of register 'reg' for 'vcpu'.  A failure here indicates
 * a programming error so it simply aborts.
 */
static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t regval;
	int rc;

	rc = vm_get_register(ctx, vcpu, reg, &regval);
	assert(rc == 0);

	return (regval);
}
110268777Sneel
/*
 * Set register 'reg' of 'vcpu' to 'val'.  A failure here indicates a
 * programming error so it simply aborts.
 */
static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int rc;

	rc = vm_set_register(ctx, vcpu, reg, val);
	assert(rc == 0);
}
119268777Sneel
120268777Sneelstatic struct seg_desc
121268777Sneelusd_to_seg_desc(struct user_segment_descriptor *usd)
122268777Sneel{
123268777Sneel	struct seg_desc seg_desc;
124268777Sneel
125268777Sneel	seg_desc.base = (u_int)USD_GETBASE(usd);
126268777Sneel	if (usd->sd_gran)
127268777Sneel		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
128268777Sneel	else
129268777Sneel		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
130268777Sneel	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
131268777Sneel	seg_desc.access |= usd->sd_xx << 12;
132268777Sneel	seg_desc.access |= usd->sd_def32 << 14;
133268777Sneel	seg_desc.access |= usd->sd_gran << 15;
134268777Sneel
135268777Sneel	return (seg_desc);
136268777Sneel}
137268777Sneel
/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	uint16_t errcode;

	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	errcode = sel & ~0x3;
	if (ext)
		errcode |= 0x1;

	vm_inject_fault(ctx, vcpu, vector, 1, errcode);
}
168268777Sneel
169270159Sgrehan/*
170270159Sgrehan * Return 0 if the selector 'sel' in within the limits of the GDT/LDT
171270159Sgrehan * and non-zero otherwise.
172270159Sgrehan */
173268777Sneelstatic int
174268777Sneeldesc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
175268777Sneel{
176268777Sneel	uint64_t base;
177268777Sneel	uint32_t limit, access;
178268777Sneel	int error, reg;
179268777Sneel
180268777Sneel	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
181268777Sneel	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
182268777Sneel	assert(error == 0);
183268777Sneel
184268777Sneel	if (reg == VM_REG_GUEST_LDTR) {
185268777Sneel		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
186268777Sneel			return (-1);
187268777Sneel	}
188268777Sneel
189268777Sneel	if (limit < SEL_LIMIT(sel))
190268777Sneel		return (-1);
191268777Sneel	else
192268777Sneel		return (0);
193268777Sneel}
194268777Sneel
195270159Sgrehan/*
196270159Sgrehan * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
197270159Sgrehan * by the selector 'sel'.
198270159Sgrehan *
199270159Sgrehan * Returns 0 on success.
200270159Sgrehan * Returns 1 if an exception was injected into the guest.
201270159Sgrehan * Returns -1 otherwise.
202270159Sgrehan */
203268777Sneelstatic int
204268777Sneeldesc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
205268777Sneel    uint16_t sel, struct user_segment_descriptor *desc, bool doread)
206268777Sneel{
207268777Sneel	struct iovec iov[2];
208268777Sneel	uint64_t base;
209268777Sneel	uint32_t limit, access;
210268777Sneel	int error, reg;
211268777Sneel
212268777Sneel	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
213268777Sneel	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
214268777Sneel	assert(error == 0);
215268777Sneel	assert(limit >= SEL_LIMIT(sel));
216268777Sneel
217270159Sgrehan	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
218268777Sneel	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
219268777Sneel	if (error == 0) {
220268777Sneel		if (doread)
221268777Sneel			vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
222268777Sneel		else
223268777Sneel			vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
224268777Sneel	}
225268777Sneel	return (error);
226268777Sneel}
227268777Sneel
/* Read the GDT/LDT descriptor referenced by 'sel' into '*desc'. */
static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true));
}
234268777Sneel
/* Write '*desc' into the GDT/LDT slot referenced by 'sel'. */
static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false));
}
241268777Sneel
242270159Sgrehan/*
243270159Sgrehan * Read the TSS descriptor referenced by 'sel' into 'desc'.
244270159Sgrehan *
245270159Sgrehan * Returns 0 on success.
246270159Sgrehan * Returns 1 if an exception was injected into the guest.
247270159Sgrehan * Returns -1 otherwise.
248270159Sgrehan */
249268777Sneelstatic int
250268777Sneelread_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
251268777Sneel    uint16_t sel, struct user_segment_descriptor *desc)
252268777Sneel{
253268777Sneel	struct vm_guest_paging sup_paging;
254268777Sneel	int error;
255268777Sneel
256268777Sneel	assert(!ISLDT(sel));
257268777Sneel	assert(IDXSEL(sel) != 0);
258268777Sneel
259268777Sneel	/* Fetch the new TSS descriptor */
260268777Sneel	if (desc_table_limit_check(ctx, vcpu, sel)) {
261268777Sneel		if (ts->reason == TSR_IRET)
262268777Sneel			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
263268777Sneel		else
264268777Sneel			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
265270159Sgrehan		return (1);
266268777Sneel	}
267268777Sneel
268268777Sneel	sup_paging = ts->paging;
269268777Sneel	sup_paging.cpl = 0;		/* implicit supervisor mode */
270268777Sneel	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc);
271270159Sgrehan	return (error);
272268777Sneel}
273268777Sneel
/* True if 'sd_type' denotes a code descriptor (S=1, executable=1). */
static bool
code_desc(int sd_type)
{
	const int execbits = 0x18;

	return ((sd_type & execbits) == execbits);
}
280268777Sneel
/*
 * True if 'sd_type' denotes a writable data descriptor; the expand-down
 * bit (0x4) is don't-care.
 */
static bool
stack_desc(int sd_type)
{
	const int mask = 0x1A, want = 0x12;

	return ((sd_type & mask) == want);
}
287268777Sneel
/* True for a data descriptor or a readable code descriptor. */
static bool
data_desc(int sd_type)
{
	bool is_data, is_readable_code;

	is_data = (sd_type & 0x18) == 0x10;
	is_readable_code = (sd_type & 0x1A) == 0x1A;

	return (is_data || is_readable_code);
}
294268777Sneel
295268777Sneelstatic bool
296268777Sneelldt_desc(int sd_type)
297268777Sneel{
298268777Sneel
299268777Sneel	return (sd_type == SDT_SYSLDT);
300268777Sneel}
301268777Sneel
/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 *
 * The checks below follow the segment-selector checks described for task
 * switches in the Intel SDM; their ORDER is significant since it determines
 * which exception the guest sees first.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	/* Classify the segment register: the checks below differ per class */
	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		/* A NULL data/LDT segment is loaded as unusable */
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
	if (error)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/*
	 * Segment must be marked present.  The exception vector depends on
	 * which segment register is being loaded.
	 */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	/* Privilege checks use the CPL from the new CS selector's RPL */
	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	/* Stack segment: RPL and DPL must both equal CPL */
	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		/* Conforming code: DPL <= CPL; non-conforming: DPL == CPL */
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		/* Non-conforming data: both RPL and CPL must be <= DPL */
		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}
434268777Sneel
435268777Sneelstatic void
436268777Sneeltss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
437268777Sneel    uint32_t eip, struct tss32 *tss, struct iovec *iov)
438268777Sneel{
439268777Sneel
440268777Sneel	/* General purpose registers */
441268777Sneel	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
442268777Sneel	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
443268777Sneel	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
444268777Sneel	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
445268777Sneel	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
446268777Sneel	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
447268777Sneel	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
448268777Sneel	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
449268777Sneel
450268777Sneel	/* Segment selectors */
451268777Sneel	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
452268777Sneel	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
453268777Sneel	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
454268777Sneel	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
455268777Sneel	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
456268777Sneel	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
457268777Sneel
458268777Sneel	/* eflags and eip */
459268777Sneel	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
460268777Sneel	if (task_switch->reason == TSR_IRET)
461268777Sneel		tss->tss_eflags &= ~PSL_NT;
462268777Sneel	tss->tss_eip = eip;
463268777Sneel
464268777Sneel	/* Copy updated old TSS into guest memory */
465268777Sneel	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
466268777Sneel}
467268777Sneel
468268777Sneelstatic void
469268777Sneelupdate_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
470268777Sneel{
471268777Sneel	int error;
472268777Sneel
473268777Sneel	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
474268777Sneel	assert(error == 0);
475268777Sneel}
476268777Sneel
/*
 * Update the vcpu registers to reflect the state of the new task.
 *
 * 'ot_sel' is the old task's TSS selector, recorded in the new TSS's link
 * field for nested task switches.  'tss' is the new task's TSS contents and
 * 'iov' describes its location in guest memory.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	/* CALL, JMP-to-task-gate, exception and interrupt nest the tasks */
	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PBDR */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
			/* PAE CR3 is 32-byte aligned and points at 4 PDPTEs */
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					return (1);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
	if (error)
		return (error);
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	/* The new CPL is the RPL of the new task's CS selector */
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}
614268777Sneel
/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	/* CR0 and RFLAGS are needed for the alignment check below */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	/* Stack limit violation while pushing: inject #SS */
	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		return (1);
	}

	/* Misaligned store with alignment checking enabled: inject #AC */
	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		return (1);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov));
	if (error)
		return (error);

	/* Write the error code and only then commit the new stack pointer */
	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}
686268777Sneel
/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 *  0: success
 * +1: an exception was injected into the guest vcpu
 * -1: unrecoverable/programming error
 *
 * NOTE(review): 'x' is evaluated up to three times by this macro so it
 * must only be passed a side-effect-free expression (callers pass the
 * plain 'error' variable).
 */
#define	CHKERR(x)							\
	do {								\
		assert(((x) == 0) || ((x) == 1) || ((x) == -1));	\
		if ((x) == -1)						\
			return (VMEXIT_ABORT);				\
		else if ((x) == 1)					\
			return (VMEXIT_CONTINUE);			\
	} while (0)
702270159Sgrehan
703268777Sneelint
704268777Sneelvmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
705268777Sneel{
706268777Sneel	struct seg_desc nt;
707268777Sneel	struct tss32 oldtss, newtss;
708268777Sneel	struct vm_task_switch *task_switch;
709268777Sneel	struct vm_guest_paging *paging, sup_paging;
710268777Sneel	struct user_segment_descriptor nt_desc, ot_desc;
711268777Sneel	struct iovec nt_iov[2], ot_iov[2];
712268777Sneel	uint64_t cr0, ot_base;
713268777Sneel	uint32_t eip, ot_lim, access;
714268777Sneel	int error, ext, minlimit, nt_type, ot_type, vcpu;
715268777Sneel	enum task_switch_reason reason;
716268777Sneel	uint16_t nt_sel, ot_sel;
717268777Sneel
718268777Sneel	task_switch = &vmexit->u.task_switch;
719268777Sneel	nt_sel = task_switch->tsssel;
720268777Sneel	ext = vmexit->u.task_switch.ext;
721268777Sneel	reason = vmexit->u.task_switch.reason;
722268777Sneel	paging = &vmexit->u.task_switch.paging;
723268777Sneel	vcpu = *pvcpu;
724268777Sneel
725268777Sneel	assert(paging->cpu_mode == CPU_MODE_PROTECTED);
726268777Sneel
727268777Sneel	/*
728268777Sneel	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
729268777Sneel	 * The following page table accesses are implicitly supervisor mode:
730268777Sneel	 * - accesses to GDT or LDT to load segment descriptors
731268777Sneel	 * - accesses to the task state segment during task switch
732268777Sneel	 */
733268777Sneel	sup_paging = *paging;
734268777Sneel	sup_paging.cpl = 0;	/* implicit supervisor mode */
735268777Sneel
736268777Sneel	/* Fetch the new TSS descriptor */
737268777Sneel	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);
738270159Sgrehan	CHKERR(error);
739268777Sneel
740268777Sneel	nt = usd_to_seg_desc(&nt_desc);
741268777Sneel
742268777Sneel	/* Verify the type of the new TSS */
743268777Sneel	nt_type = SEG_DESC_TYPE(nt.access);
744268777Sneel	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
745268777Sneel	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
746268777Sneel		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
747270159Sgrehan		goto done;
748268777Sneel	}
749268777Sneel
750268777Sneel	/* TSS descriptor must have present bit set */
751268777Sneel	if (!SEG_DESC_PRESENT(nt.access)) {
752268777Sneel		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
753270159Sgrehan		goto done;
754268777Sneel	}
755268777Sneel
756268777Sneel	/*
757268777Sneel	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
758268777Sneel	 * 44 bytes for a 16-bit TSS.
759268777Sneel	 */
760268777Sneel	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
761268777Sneel		minlimit = 104 - 1;
762268777Sneel	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
763268777Sneel		minlimit = 44 - 1;
764268777Sneel	else
765268777Sneel		minlimit = 0;
766268777Sneel
767268777Sneel	assert(minlimit > 0);
768268777Sneel	if (nt.limit < minlimit) {
769268777Sneel		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
770270159Sgrehan		goto done;
771268777Sneel	}
772268777Sneel
773268777Sneel	/* TSS must be busy if task switch is due to IRET */
774268777Sneel	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
775268777Sneel		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
776270159Sgrehan		goto done;
777268777Sneel	}
778268777Sneel
779268777Sneel	/*
780268777Sneel	 * TSS must be available (not busy) if task switch reason is
781268777Sneel	 * CALL, JMP, exception or interrupt.
782268777Sneel	 */
783268777Sneel	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
784268777Sneel		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
785270159Sgrehan		goto done;
786268777Sneel	}
787268777Sneel
788268777Sneel	/* Fetch the new TSS */
789270159Sgrehan	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
790268777Sneel	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
791270159Sgrehan	CHKERR(error);
792270159Sgrehan	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
793268777Sneel
794268777Sneel	/* Get the old TSS selector from the guest's task register */
795268777Sneel	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
796268777Sneel	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
797268777Sneel		/*
798268777Sneel		 * This might happen if a task switch was attempted without
799268777Sneel		 * ever loading the task register with LTR. In this case the
800268777Sneel		 * TR would contain the values from power-on:
801268777Sneel		 * (sel = 0, base = 0, limit = 0xffff).
802268777Sneel		 */
803268777Sneel		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
804270159Sgrehan		goto done;
805268777Sneel	}
806268777Sneel
807268777Sneel	/* Get the old TSS base and limit from the guest's task register */
808268777Sneel	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
809268777Sneel	    &access);
810268777Sneel	assert(error == 0);
811268777Sneel	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
812268777Sneel	ot_type = SEG_DESC_TYPE(access);
813268777Sneel	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
814268777Sneel
815268777Sneel	/* Fetch the old TSS descriptor */
816270159Sgrehan	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc);
817270159Sgrehan	CHKERR(error);
818268777Sneel
819268777Sneel	/* Get the old TSS */
820270159Sgrehan	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
821268777Sneel	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
822270159Sgrehan	CHKERR(error);
823270159Sgrehan	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
824268777Sneel
825268777Sneel	/*
826268777Sneel	 * Clear the busy bit in the old TSS descriptor if the task switch
827268777Sneel	 * due to an IRET or JMP instruction.
828268777Sneel	 */
829268777Sneel	if (reason == TSR_IRET || reason == TSR_JMP) {
830268777Sneel		ot_desc.sd_type &= ~0x2;
831268777Sneel		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
832268777Sneel		    &ot_desc);
833270159Sgrehan		CHKERR(error);
834268777Sneel	}
835268777Sneel
836268777Sneel	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
837268777Sneel		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
838268777Sneel		return (VMEXIT_ABORT);
839268777Sneel	}
840268777Sneel
841268777Sneel	/* Save processor state in old TSS */
842268777Sneel	eip = vmexit->rip + vmexit->inst_length;
843268777Sneel	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
844268777Sneel
845268777Sneel	/*
846268777Sneel	 * If the task switch was triggered for any reason other than IRET
847268777Sneel	 * then set the busy bit in the new TSS descriptor.
848268777Sneel	 */
849268777Sneel	if (reason != TSR_IRET) {
850268777Sneel		nt_desc.sd_type |= 0x2;
851268777Sneel		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
852268777Sneel		    &nt_desc);
853270159Sgrehan		CHKERR(error);
854268777Sneel	}
855268777Sneel
856268777Sneel	/* Update task register to point at the new TSS */
857268777Sneel	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);
858268777Sneel
859268777Sneel	/* Update the hidden descriptor state of the task register */
860268777Sneel	nt = usd_to_seg_desc(&nt_desc);
861268777Sneel	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);
862268777Sneel
863268777Sneel	/* Set CR0.TS */
864268777Sneel	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
865268777Sneel	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
866268777Sneel
867268777Sneel	/*
868268777Sneel	 * We are now committed to the task switch. Any exceptions encountered
869268777Sneel	 * after this point will be handled in the context of the new task and
870268777Sneel	 * the saved instruction pointer will belong to the new task.
871268777Sneel	 */
872268777Sneel	vmexit->rip = newtss.tss_eip;
873268777Sneel	vmexit->inst_length = 0;
874268777Sneel
875268777Sneel	/* Load processor state from new TSS */
876268777Sneel	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
877270159Sgrehan	CHKERR(error);
878268777Sneel
879268777Sneel	/*
880268777Sneel	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
881268777Sneel	 * caused an error code to be generated, this error code is copied
882268777Sneel	 * to the stack of the new task.
883268777Sneel	 */
884268777Sneel	if (task_switch->errcode_valid) {
885268777Sneel		assert(task_switch->ext);
886268777Sneel		assert(task_switch->reason == TSR_IDT_GATE);
887268777Sneel		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
888268777Sneel		    task_switch->errcode);
889270159Sgrehan		CHKERR(error);
890268777Sneel	}
891268777Sneel
892268777Sneel	/*
893268777Sneel	 * Treatment of virtual-NMI blocking if NMI is delivered through
894268777Sneel	 * a task gate.
895268777Sneel	 *
896268777Sneel	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
897268777Sneel	 * If the virtual NMIs VM-execution control is 1, VM entry injects
898268777Sneel	 * an NMI, and delivery of the NMI causes a task switch that causes
899268777Sneel	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
900268777Sneel	 * commences.
901268777Sneel	 *
902268777Sneel	 * Thus, virtual-NMI blocking is in effect at the time of the task
903268777Sneel	 * switch VM exit.
904268777Sneel	 */
905268777Sneel
906268777Sneel	/*
907268777Sneel	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
908268777Sneel	 *
909268777Sneel	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
910268777Sneel	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
911268777Sneel	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
912268777Sneel	 *
913268777Sneel	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
914268777Sneel	 * VM exit.
915268777Sneel	 */
916268777Sneel
917268777Sneel	/*
918270159Sgrehan	 * If the task switch was triggered by an event delivered through
919270159Sgrehan	 * the IDT then extinguish the pending event from the vcpu's
920270159Sgrehan	 * exitintinfo.
921268777Sneel	 */
922270159Sgrehan	if (task_switch->reason == TSR_IDT_GATE) {
923270159Sgrehan		error = vm_set_intinfo(ctx, vcpu, 0);
924270159Sgrehan		assert(error == 0);
925270159Sgrehan	}
926268777Sneel
927268777Sneel	/*
928268777Sneel	 * XXX should inject debug exception if 'T' bit is 1
929268777Sneel	 */
930270159Sgrehandone:
931270159Sgrehan	return (VMEXIT_CONTINUE);
932268777Sneel}
933