/*-
 * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/task_switch.c 276349 2014-12-28 21:27:13Z neel $");

#include <sys/param.h>
#include <sys/_iovec.h>
#include <sys/mman.h>

#include <x86/psl.h>
#include <x86/segments.h>
#include <x86/specialreg.h>
#include <machine/vmm.h>
#include <machine/vmm_instruction_emul.h>

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <errno.h>

#include <vmmapi.h>

#include "bhyverun.h"

/*
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 */
struct tss32 {
	uint16_t	tss_link;
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
CTASSERT(sizeof(struct tss32) == 104);

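/*
 * Selector helpers: SEL_START() masks off the TI and RPL bits to yield the
 * byte offset of the descriptor within its table, SEL_LIMIT() is the offset
 * of the last byte of that 8-byte descriptor (used for limit checks) and
 * TSS_BUSY() tests the busy bit of a system-segment type.
 */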
#define	SEL_START(sel)	(((sel) & ~0x7))
#define	SEL_LIMIT(sel)	(((sel) | 0x7))
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)

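/*
 * Convenience wrappers around vm_get_register()/vm_set_register(). A
 * failure here indicates a programming error, so it is simply asserted.
 */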
static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}

static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}

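/*
 * Convert a descriptor table entry into the flat 'struct seg_desc'
 * representation (base, limit, access) used by vmm(4). Note that bit 16
 * of the access word, which marks a segment as unusable, is left clear
 * here; see the NULL selector handling in validate_seg_desc().
 */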
static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor *usd)
{
	struct seg_desc seg_desc;

	seg_desc.base = (u_int)USD_GETBASE(usd);
	if (usd->sd_gran)
		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
	else
		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
	seg_desc.access |= usd->sd_xx << 12;
	seg_desc.access |= usd->sd_def32 << 14;
	seg_desc.access |= usd->sd_gran << 15;

	return (seg_desc);
}

/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}

/*
 * Return 0 if the selector 'sel' is within the limits of the GDT/LDT
 * and non-zero otherwise.
 */
static int
desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
{
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);

	if (reg == VM_REG_GUEST_LDTR) {
		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
			return (-1);
	}

	if (limit < SEL_LIMIT(sel))
		return (-1);
	else
		return (0);
}

/*
 * Read or write the segment descriptor 'desc' from/into the GDT/LDT slot
 * referenced by the selector 'sel'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc, bool doread)
{
	struct iovec iov[2];
	uint64_t base;
	uint32_t limit, access;
	int error, reg;

	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
	assert(error == 0);
	assert(limit >= SEL_LIMIT(sel));

	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
	if (error == 0) {
		if (doread)
			vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
		else
			vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
	}
	return (error);
}

static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true));
}

static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false));
}

/*
 * Read the TSS descriptor referenced by 'sel' into 'desc'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	struct vm_guest_paging sup_paging;
	int error;

	assert(!ISLDT(sel));
	assert(IDXSEL(sel) != 0);

	/* Fetch the new TSS descriptor */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		if (ts->reason == TSR_IRET)
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		else
			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
		return (1);
	}

	sup_paging = ts->paging;
	sup_paging.cpl = 0;		/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc);
	return (error);
}

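/*
 * Helpers to classify the 5-bit segment type field, whose most significant
 * bit is the S (code/data) bit: bit 3 distinguishes code from data
 * descriptors, and bit 1 is the writable bit for data segments and the
 * readable bit for code segments.
 */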
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}

static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}

static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}

static bool
ldt_desc(int sd_type)
{

	return (sd_type == SDT_SYSLDT);
}

/*
 * Validate the descriptor 'seg_desc' associated with 'segment'.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    int segment, struct seg_desc *seg_desc)
{
	struct vm_guest_paging sup_paging;
	struct user_segment_descriptor usd;
	int error, idtvec;
	int cpl, dpl, rpl;
	uint16_t sel, cs;
	bool ldtseg, codeseg, stackseg, dataseg, conforming;

	ldtseg = codeseg = stackseg = dataseg = false;
	switch (segment) {
	case VM_REG_GUEST_LDTR:
		ldtseg = true;
		break;
	case VM_REG_GUEST_CS:
		codeseg = true;
		break;
	case VM_REG_GUEST_SS:
		stackseg = true;
		break;
	case VM_REG_GUEST_DS:
	case VM_REG_GUEST_ES:
	case VM_REG_GUEST_FS:
	case VM_REG_GUEST_GS:
		dataseg = true;
		break;
	default:
		assert(0);
	}

	/* Get the segment selector */
	sel = GETREG(ctx, vcpu, segment);

	/* LDT selector must point into the GDT */
	if (ldtseg && ISLDT(sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Descriptor table limit check */
	if (desc_table_limit_check(ctx, vcpu, sel)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* NULL selector */
	if (IDXSEL(sel) == 0) {
		/* Code and stack segment selectors cannot be NULL */
		if (codeseg || stackseg) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
		seg_desc->base = 0;
		seg_desc->limit = 0;
		seg_desc->access = 0x10000;	/* unusable */
		return (0);
	}

	/* Read the descriptor from the GDT/LDT */
	sup_paging = ts->paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */
	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
	if (error)
		return (error);

	/* Verify that the descriptor type is compatible with the segment */
	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
	    (codeseg && !code_desc(usd.sd_type)) ||
	    (dataseg && !data_desc(usd.sd_type)) ||
	    (stackseg && !stack_desc(usd.sd_type))) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	/* Segment must be marked present */
	if (!usd.sd_p) {
		if (ldtseg)
			idtvec = IDT_TS;
		else if (stackseg)
			idtvec = IDT_SS;
		else
			idtvec = IDT_NP;
		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
		return (1);
	}

	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	cpl = cs & SEL_RPL_MASK;
	rpl = sel & SEL_RPL_MASK;
	dpl = usd.sd_dpl;

	if (stackseg && (rpl != cpl || dpl != cpl)) {
		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
		return (1);
	}

	if (codeseg) {
		conforming = (usd.sd_type & 0x4) ? true : false;
		if ((conforming && (cpl < dpl)) ||
		    (!conforming && (cpl != dpl))) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}

	if (dataseg) {
		/*
		 * A data segment is always non-conforming except when its
		 * descriptor is a readable, conforming code segment.
		 */
		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
			conforming = true;
		else
			conforming = false;

		if (!conforming && (rpl > dpl || cpl > dpl)) {
			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
			return (1);
		}
	}
	*seg_desc = usd_to_seg_desc(&usd);
	return (0);
}

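/*
 * Save the dynamic state of the outgoing task into the old TSS image 'tss'
 * and copy the updated image back to guest memory described by 'iov'.
 */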
static void
tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
    uint32_t eip, struct tss32 *tss, struct iovec *iov)
{

	/* General purpose registers */
	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);

	/* Segment selectors */
	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);

	/* eflags and eip */
	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	if (task_switch->reason == TSR_IRET)
		tss->tss_eflags &= ~PSL_NT;
	tss->tss_eip = eip;

	/* Copy updated old TSS into guest memory */
	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
}

static void
update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
{
	int error;

	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
	assert(error == 0);
}

/*
 * Update the vcpu registers to reflect the state of the new task.
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
{
	struct seg_desc seg_desc, seg_desc2;
	uint64_t *pdpte, maxphyaddr, reserved;
	uint32_t eflags;
	int error, i;
	bool nested;

	nested = false;
	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
		tss->tss_link = ot_sel;
		nested = true;
	}

	eflags = tss->tss_eflags;
	if (nested)
		eflags |= PSL_NT;

	/* LDTR */
	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);

	/* PDBR (%cr3) */
	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
			/*
			 * XXX Assuming 36-bit MAXPHYADDR.
			 */
			maxphyaddr = (1UL << 36) - 1;
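			/*
			 * In PAE mode CR3 points to a 32-byte aligned table
			 * of four 8-byte PDPTEs; map it directly from guest
			 * physical memory.
			 */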
			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
			for (i = 0; i < 4; i++) {
				/* Check reserved bits if the PDPTE is valid */
				if (!(pdpte[i] & 0x1))
					continue;
				/*
				 * Bits 2:1, 8:5 and bits above the processor's
				 * maximum physical address are reserved.
				 */
				reserved = ~maxphyaddr | 0x1E6;
				if (pdpte[i] & reserved) {
					vm_inject_gp(ctx, vcpu);
					return (1);
				}
			}
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
		}
		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
		ts->paging.cr3 = tss->tss_cr3;
	}

	/* eflags and eip */
	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);

	/* General purpose registers */
	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);

	/* Segment selectors */
	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);

	/*
	 * If this is a nested task then write out the new TSS to update
	 * the previous link field.
	 */
	if (nested)
		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));

	/* Validate segment descriptors */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);

	/*
	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
	 *
	 * The SS and CS attribute checks on VM-entry are inter-dependent so
	 * we need to make sure that both segments are valid before updating
	 * either of them. This ensures that the VMCS state can pass the
	 * VM-entry checks so the guest can handle any exception injected
	 * during task switch emulation.
	 */
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
	if (error)
		return (error);
	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);

	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
	if (error)
		return (error);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);

	return (0);
}

/*
 * Push an error code on the stack of the new task. This is needed if the
 * task switch was triggered by a hardware exception that causes an error
 * code to be saved (e.g. #PF).
 *
 * Returns 0 on success.
 * Returns 1 if an exception was injected into the guest.
 * Returns -1 otherwise.
 */
static int
push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    int task_type, uint32_t errcode)
{
	struct iovec iov[2];
	struct seg_desc seg_desc;
	int stacksize, bytes, error;
	uint64_t gla, cr0, rflags;
	uint32_t esp;
	uint16_t stacksel;

	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);

	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
	    &seg_desc.limit, &seg_desc.access);
	assert(error == 0);

	/*
	 * Section "Error Code" in the Intel SDM vol 3: the error code is
	 * pushed on the stack as a doubleword or word (depending on the
	 * default interrupt, trap or task gate size).
	 */
	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
		bytes = 4;
	else
		bytes = 2;

	/*
	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
	 * stack-segment descriptor determines the size of the stack
	 * pointer outside of 64-bit mode.
	 */
	if (SEG_DESC_DEF32(seg_desc.access))
		stacksize = 4;
	else
		stacksize = 2;

	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
	esp -= bytes;

	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
		return (1);
	}

	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
		vm_inject_ac(ctx, vcpu, 1);
		return (1);
	}

	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
	    iov, nitems(iov));
	if (error)
		return (error);

	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
	return (0);
}

/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 *  0: success
 * +1: an exception was injected into the guest vcpu
 * -1: unrecoverable/programming error
 */
#define	CHKERR(x)							\
	do {								\
		assert(((x) == 0) || ((x) == 1) || ((x) == -1));	\
		if ((x) == -1)						\
			return (VMEXIT_ABORT);				\
		else if ((x) == 1)					\
			return (VMEXIT_CONTINUE);			\
	} while (0)

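/*
 * Emulate a task switch on behalf of the guest: validate the new TSS
 * descriptor, save the outgoing task's state into the old TSS, update the
 * busy bits and the task register, load the incoming task's state from the
 * new TSS and, if necessary, push an error code on the new task's stack.
 */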
int
vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
{
	struct seg_desc nt;
	struct tss32 oldtss, newtss;
	struct vm_task_switch *task_switch;
	struct vm_guest_paging *paging, sup_paging;
	struct user_segment_descriptor nt_desc, ot_desc;
	struct iovec nt_iov[2], ot_iov[2];
	uint64_t cr0, ot_base;
	uint32_t eip, ot_lim, access;
	int error, ext, minlimit, nt_type, ot_type, vcpu;
	enum task_switch_reason reason;
	uint16_t nt_sel, ot_sel;

	task_switch = &vmexit->u.task_switch;
	nt_sel = task_switch->tsssel;
	ext = vmexit->u.task_switch.ext;
	reason = vmexit->u.task_switch.reason;
	paging = &vmexit->u.task_switch.paging;
	vcpu = *pvcpu;

	assert(paging->cpu_mode == CPU_MODE_PROTECTED);

	/*
	 * Calculate the %eip to store in the old TSS before modifying the
	 * 'inst_length'.
	 */
	eip = vmexit->rip + vmexit->inst_length;

	/*
	 * Set the 'inst_length' to '0'.
	 *
	 * If an exception is triggered during emulation of the task switch
	 * then the exception handler should return to the instruction that
	 * caused the task switch as opposed to the subsequent instruction.
	 */
	vmexit->inst_length = 0;

	/*
	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
	 * The following page table accesses are implicitly supervisor mode:
	 * - accesses to GDT or LDT to load segment descriptors
	 * - accesses to the task state segment during task switch
	 */
	sup_paging = *paging;
	sup_paging.cpl = 0;	/* implicit supervisor mode */

	/* Fetch the new TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);
	CHKERR(error);

	nt = usd_to_seg_desc(&nt_desc);

	/* Verify the type of the new TSS */
	nt_type = SEG_DESC_TYPE(nt.access);
	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS descriptor must have present bit set */
	if (!SEG_DESC_PRESENT(nt.access)) {
		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
	 * 44 bytes for a 16-bit TSS.
	 */
	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
		minlimit = 104 - 1;
	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
		minlimit = 44 - 1;
	else
		minlimit = 0;

	assert(minlimit > 0);
	if (nt.limit < minlimit) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/* TSS must be busy if task switch is due to IRET */
	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
		goto done;
	}

	/*
	 * TSS must be available (not busy) if task switch reason is
	 * CALL, JMP, exception or interrupt.
	 */
	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
		goto done;
	}

	/* Fetch the new TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
	CHKERR(error);
	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);

	/* Get the old TSS selector from the guest's task register */
	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
		/*
		 * This might happen if a task switch was attempted without
		 * ever loading the task register with LTR. In this case the
		 * TR would contain the values from power-on:
		 * (sel = 0, base = 0, limit = 0xffff).
		 */
		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
		goto done;
	}

	/* Get the old TSS base and limit from the guest's task register */
	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
	    &access);
	assert(error == 0);
	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
	ot_type = SEG_DESC_TYPE(access);
	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);

	/* Fetch the old TSS descriptor */
	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc);
	CHKERR(error);

	/* Get the old TSS */
	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
	CHKERR(error);
	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);

	/*
	 * Clear the busy bit in the old TSS descriptor if the task switch
	 * was due to an IRET or JMP instruction.
	 */
	if (reason == TSR_IRET || reason == TSR_JMP) {
		ot_desc.sd_type &= ~0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
		    &ot_desc);
		CHKERR(error);
	}

	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
		return (VMEXIT_ABORT);
	}

	/* Save processor state in old TSS */
	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);

	/*
	 * If the task switch was triggered for any reason other than IRET
	 * then set the busy bit in the new TSS descriptor.
	 */
	if (reason != TSR_IRET) {
		nt_desc.sd_type |= 0x2;
		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
		    &nt_desc);
		CHKERR(error);
	}

	/* Update task register to point at the new TSS */
	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);

	/* Update the hidden descriptor state of the task register */
	nt = usd_to_seg_desc(&nt_desc);
	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);

	/* Set CR0.TS */
	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);

	/*
	 * We are now committed to the task switch. Any exceptions encountered
	 * after this point will be handled in the context of the new task and
	 * the saved instruction pointer will belong to the new task.
	 */
	vmexit->rip = newtss.tss_eip;
	assert(vmexit->inst_length == 0);

	/* Load processor state from new TSS */
	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
	CHKERR(error);

	/*
	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
	 * caused an error code to be generated, this error code is copied
	 * to the stack of the new task.
	 */
	if (task_switch->errcode_valid) {
		assert(task_switch->ext);
		assert(task_switch->reason == TSR_IDT_GATE);
		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
		    task_switch->errcode);
		CHKERR(error);
	}

	/*
	 * Treatment of virtual-NMI blocking if NMI is delivered through
	 * a task gate.
	 *
	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
	 * If the virtual NMIs VM-execution control is 1, VM entry injects
	 * an NMI, and delivery of the NMI causes a task switch that causes
	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
	 * commences.
	 *
	 * Thus, virtual-NMI blocking is in effect at the time of the task
	 * switch VM exit.
	 */

	/*
	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
	 *
	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
	 *
	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
	 * VM exit.
	 */

	/*
	 * If the task switch was triggered by an event delivered through
	 * the IDT then extinguish the pending event from the vcpu's
	 * exitintinfo.
	 */
	if (task_switch->reason == TSR_IDT_GATE) {
		error = vm_set_intinfo(ctx, vcpu, 0);
		assert(error == 0);
	}

	/*
	 * XXX should inject debug exception if 'T' bit is 1
	 */
done:
	return (VMEXIT_CONTINUE);
}