vmm_instruction_emul.c revision 268976
1/*- 2 * Copyright (c) 2012 Sandvine, Inc. 3 * Copyright (c) 2012 NetApp, Inc. 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 * 27 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_instruction_emul.c 268976 2014-07-22 04:39:16Z jhb $ 28 */ 29 30#include <sys/cdefs.h> 31__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_instruction_emul.c 268976 2014-07-22 04:39:16Z jhb $"); 32 33#ifdef _KERNEL 34#include <sys/param.h> 35#include <sys/pcpu.h> 36#include <sys/systm.h> 37#include <sys/proc.h> 38 39#include <vm/vm.h> 40#include <vm/pmap.h> 41 42#include <machine/vmparam.h> 43#include <machine/vmm.h> 44#else /* !_KERNEL */ 45#include <sys/types.h> 46#include <sys/errno.h> 47 48#include <machine/vmm.h> 49 50#include <assert.h> 51#include <vmmapi.h> 52#define KASSERT(exp,msg) assert((exp)) 53#endif /* _KERNEL */ 54 55#include <machine/vmm_instruction_emul.h> 56#include <x86/psl.h> 57#include <x86/specialreg.h> 58 59/* struct vie_op.op_type */ 60enum { 61 VIE_OP_TYPE_NONE = 0, 62 VIE_OP_TYPE_MOV, 63 VIE_OP_TYPE_MOVSX, 64 VIE_OP_TYPE_MOVZX, 65 VIE_OP_TYPE_AND, 66 VIE_OP_TYPE_OR, 67 VIE_OP_TYPE_TWO_BYTE, 68 VIE_OP_TYPE_LAST 69}; 70 71/* struct vie_op.op_flags */ 72#define VIE_OP_F_IMM (1 << 0) /* immediate operand present */ 73#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */ 74 75static const struct vie_op two_byte_opcodes[256] = { 76 [0xB6] = { 77 .op_byte = 0xB6, 78 .op_type = VIE_OP_TYPE_MOVZX, 79 }, 80 [0xBE] = { 81 .op_byte = 0xBE, 82 .op_type = VIE_OP_TYPE_MOVSX, 83 }, 84}; 85 86static const struct vie_op one_byte_opcodes[256] = { 87 [0x0F] = { 88 .op_byte = 0x0F, 89 .op_type = VIE_OP_TYPE_TWO_BYTE 90 }, 91 [0x88] = { 92 .op_byte = 0x88, 93 .op_type = VIE_OP_TYPE_MOV, 94 }, 95 [0x89] = { 96 .op_byte = 0x89, 97 .op_type = VIE_OP_TYPE_MOV, 98 }, 99 [0x8A] = { 100 .op_byte = 0x8A, 101 .op_type = VIE_OP_TYPE_MOV, 102 }, 103 [0x8B] = { 104 .op_byte = 0x8B, 105 .op_type = VIE_OP_TYPE_MOV, 106 }, 107 [0xC7] = { 108 .op_byte = 0xC7, 109 .op_type = VIE_OP_TYPE_MOV, 110 .op_flags = VIE_OP_F_IMM, 111 }, 112 [0x23] = { 113 .op_byte = 0x23, 114 .op_type = VIE_OP_TYPE_AND, 115 }, 116 [0x81] = { 117 /* XXX Group 1 extended opcode - not just AND */ 118 .op_byte = 0x81, 119 .op_type = VIE_OP_TYPE_AND, 120 .op_flags = VIE_OP_F_IMM, 121 }, 122 [0x83] = { 123 /* XXX Group 1 extended opcode - not just OR */ 124 .op_byte = 0x83, 125 .op_type = VIE_OP_TYPE_OR, 126 .op_flags = VIE_OP_F_IMM8, 127 }, 128}; 129 130/* struct vie.mod */ 131#define VIE_MOD_INDIRECT 0 132#define VIE_MOD_INDIRECT_DISP8 1 133#define VIE_MOD_INDIRECT_DISP32 2 134#define VIE_MOD_DIRECT 3 135 136/* struct vie.rm */ 137#define VIE_RM_SIB 4 138#define VIE_RM_DISP32 5 139 140#define GB (1024 * 1024 * 1024) 141 142static enum vm_reg_name gpr_map[16] = { 143 VM_REG_GUEST_RAX, 144 VM_REG_GUEST_RCX, 145 VM_REG_GUEST_RDX, 146 VM_REG_GUEST_RBX, 147 VM_REG_GUEST_RSP, 148 VM_REG_GUEST_RBP, 149 VM_REG_GUEST_RSI, 150 VM_REG_GUEST_RDI, 151 VM_REG_GUEST_R8, 152 VM_REG_GUEST_R9, 153 VM_REG_GUEST_R10, 154 VM_REG_GUEST_R11, 155 VM_REG_GUEST_R12, 156 VM_REG_GUEST_R13, 157 VM_REG_GUEST_R14, 158 VM_REG_GUEST_R15 159}; 160 161static uint64_t size2mask[] = { 162 [1] = 0xff, 163 [2] = 0xffff, 164 [4] = 0xffffffff, 165 [8] = 0xffffffffffffffff, 166}; 167 168static int 169vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) 170{ 171 int error; 172 173 error = vm_get_register(vm, vcpuid, reg, rval); 174 175 return (error); 176} 177 178static int 179vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) 180{ 181 uint64_t val; 182 int error, rshift; 183 enum vm_reg_name reg; 184 185 rshift = 0; 186 reg = gpr_map[vie->reg]; 187 188 /* 189 * 64-bit mode imposes limitations on accessing legacy byte registers. 190 * 191 * The legacy high-byte registers cannot be addressed if the REX 192 * prefix is present. In this case the values 4, 5, 6 and 7 of the 193 * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively. 194 * 195 * If the REX prefix is not present then the values 4, 5, 6 and 7 196 * of the 'ModRM:reg' field address the legacy high-byte registers, 197 * %ah, %ch, %dh and %bh respectively. 198 */ 199 if (!vie->rex_present) { 200 if (vie->reg & 0x4) { 201 /* 202 * Obtain the value of %ah by reading %rax and shifting 203 * right by 8 bits (same for %bh, %ch and %dh). 204 */ 205 rshift = 8; 206 reg = gpr_map[vie->reg & 0x3]; 207 } 208 } 209 210 error = vm_get_register(vm, vcpuid, reg, &val); 211 *rval = val >> rshift; 212 return (error); 213} 214 215int 216vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, 217 uint64_t val, int size) 218{ 219 int error; 220 uint64_t origval; 221 222 switch (size) { 223 case 1: 224 case 2: 225 error = vie_read_register(vm, vcpuid, reg, &origval); 226 if (error) 227 return (error); 228 val &= size2mask[size]; 229 val |= origval & ~size2mask[size]; 230 break; 231 case 4: 232 val &= 0xffffffffUL; 233 break; 234 case 8: 235 break; 236 default: 237 return (EINVAL); 238 } 239 240 error = vm_set_register(vm, vcpuid, reg, val); 241 return (error); 242} 243 244/* 245 * The following simplifying assumptions are made during emulation: 246 * 247 * - guest is in 64-bit mode 248 * - default address size is 64-bits 249 * - default operand size is 32-bits 250 * 251 * - operand size override is not supported 252 * 253 * - address size override is not supported 254 */ 255static int 256emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 257 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 258{ 259 int error, size; 260 enum vm_reg_name reg; 261 uint8_t byte; 262 uint64_t val; 263 264 size = 4; 265 error = EINVAL; 266 267 switch (vie->op.op_byte) { 268 case 0x88: 269 /* 270 * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m) 271 * 88/r: mov r/m8, r8 272 * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) 273 */ 274 size = 1; 275 error = vie_read_bytereg(vm, vcpuid, vie, &byte); 276 if (error == 0) 277 error = memwrite(vm, vcpuid, gpa, byte, size, arg); 278 break; 279 case 0x89: 280 /* 281 * MOV from reg (ModRM:reg) to mem (ModRM:r/m) 282 * 89/r: mov r/m32, r32 283 * REX.W + 89/r mov r/m64, r64 284 */ 285 if (vie->rex_w) 286 size = 8; 287 reg = gpr_map[vie->reg]; 288 error = vie_read_register(vm, vcpuid, reg, &val); 289 if (error == 0) { 290 val &= size2mask[size]; 291 error = memwrite(vm, vcpuid, gpa, val, size, arg); 292 } 293 break; 294 case 0x8A: 295 case 0x8B: 296 /* 297 * MOV from mem (ModRM:r/m) to reg (ModRM:reg) 298 * 8A/r: mov r/m8, r8 299 * REX + 8A/r: mov r/m8, r8 300 * 8B/r: mov r32, r/m32 301 * REX.W 8B/r: mov r64, r/m64 302 */ 303 if (vie->op.op_byte == 0x8A) 304 size = 1; 305 else if (vie->rex_w) 306 size = 8; 307 error = memread(vm, vcpuid, gpa, &val, size, arg); 308 if (error == 0) { 309 reg = gpr_map[vie->reg]; 310 error = vie_update_register(vm, vcpuid, reg, val, size); 311 } 312 break; 313 case 0xC7: 314 /* 315 * MOV from imm32 to mem (ModRM:r/m) 316 * C7/0 mov r/m32, imm32 317 * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) 318 */ 319 val = vie->immediate; /* already sign-extended */ 320 321 if (vie->rex_w) 322 size = 8; 323 324 if (size != 8) 325 val &= size2mask[size]; 326 327 error = memwrite(vm, vcpuid, gpa, val, size, arg); 328 break; 329 default: 330 break; 331 } 332 333 return (error); 334} 335 336/* 337 * The following simplifying assumptions are made during emulation: 338 * 339 * - guest is in 64-bit mode 340 * - default address size is 64-bits 341 * - default operand size is 32-bits 342 * 343 * - operand size override is not supported 344 * 345 * - address size override is not supported 346 */ 347static int 348emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 349 mem_region_read_t memread, mem_region_write_t memwrite, 350 void *arg) 351{ 352 int error, size; 353 enum vm_reg_name reg; 354 uint64_t val; 355 356 size = 4; 357 error = EINVAL; 358 359 switch (vie->op.op_byte) { 360 case 0xB6: 361 /* 362 * MOV and zero extend byte from mem (ModRM:r/m) to 363 * reg (ModRM:reg). 364 * 365 * 0F B6/r movzx r/m8, r32 366 * REX.W + 0F B6/r movzx r/m8, r64 367 */ 368 369 /* get the first operand */ 370 error = memread(vm, vcpuid, gpa, &val, 1, arg); 371 if (error) 372 break; 373 374 /* get the second operand */ 375 reg = gpr_map[vie->reg]; 376 377 if (vie->rex_w) 378 size = 8; 379 380 /* write the result */ 381 error = vie_update_register(vm, vcpuid, reg, val, size); 382 break; 383 case 0xBE: 384 /* 385 * MOV and sign extend byte from mem (ModRM:r/m) to 386 * reg (ModRM:reg). 387 * 388 * 0F BE/r movsx r/m8, r32 389 * REX.W + 0F BE/r movsx r/m8, r64 390 */ 391 392 /* get the first operand */ 393 error = memread(vm, vcpuid, gpa, &val, 1, arg); 394 if (error) 395 break; 396 397 /* get the second operand */ 398 reg = gpr_map[vie->reg]; 399 400 if (vie->rex_w) 401 size = 8; 402 403 /* sign extend byte */ 404 val = (int8_t)val; 405 406 /* write the result */ 407 error = vie_update_register(vm, vcpuid, reg, val, size); 408 break; 409 default: 410 break; 411 } 412 return (error); 413} 414 415static int 416emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 417 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 418{ 419 int error, size; 420 enum vm_reg_name reg; 421 uint64_t val1, val2; 422 423 size = 4; 424 error = EINVAL; 425 426 switch (vie->op.op_byte) { 427 case 0x23: 428 /* 429 * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the 430 * result in reg. 431 * 432 * 23/r and r32, r/m32 433 * REX.W + 23/r and r64, r/m64 434 */ 435 if (vie->rex_w) 436 size = 8; 437 438 /* get the first operand */ 439 reg = gpr_map[vie->reg]; 440 error = vie_read_register(vm, vcpuid, reg, &val1); 441 if (error) 442 break; 443 444 /* get the second operand */ 445 error = memread(vm, vcpuid, gpa, &val2, size, arg); 446 if (error) 447 break; 448 449 /* perform the operation and write the result */ 450 val1 &= val2; 451 error = vie_update_register(vm, vcpuid, reg, val1, size); 452 break; 453 case 0x81: 454 /* 455 * AND mem (ModRM:r/m) with immediate and store the 456 * result in mem. 457 * 458 * 81/ and r/m32, imm32 459 * REX.W + 81/ and r/m64, imm32 sign-extended to 64 460 * 461 * Currently, only the AND operation of the 0x81 opcode 462 * is implemented (ModRM:reg = b100). 463 */ 464 if ((vie->reg & 7) != 4) 465 break; 466 467 if (vie->rex_w) 468 size = 8; 469 470 /* get the first operand */ 471 error = memread(vm, vcpuid, gpa, &val1, size, arg); 472 if (error) 473 break; 474 475 /* 476 * perform the operation with the pre-fetched immediate 477 * operand and write the result 478 */ 479 val1 &= vie->immediate; 480 error = memwrite(vm, vcpuid, gpa, val1, size, arg); 481 break; 482 default: 483 break; 484 } 485 return (error); 486} 487 488static int 489emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 490 mem_region_read_t memread, mem_region_write_t memwrite, void *arg) 491{ 492 int error, size; 493 uint64_t val1; 494 495 size = 4; 496 error = EINVAL; 497 498 switch (vie->op.op_byte) { 499 case 0x83: 500 /* 501 * OR mem (ModRM:r/m) with immediate and store the 502 * result in mem. 503 * 504 * 83/ OR r/m32, imm8 sign-extended to 32 505 * REX.W + 83/ OR r/m64, imm8 sign-extended to 64 506 * 507 * Currently, only the OR operation of the 0x83 opcode 508 * is implemented (ModRM:reg = b001). 509 */ 510 if ((vie->reg & 7) != 1) 511 break; 512 513 if (vie->rex_w) 514 size = 8; 515 516 /* get the first operand */ 517 error = memread(vm, vcpuid, gpa, &val1, size, arg); 518 if (error) 519 break; 520 521 /* 522 * perform the operation with the pre-fetched immediate 523 * operand and write the result 524 */ 525 val1 |= vie->immediate; 526 error = memwrite(vm, vcpuid, gpa, val1, size, arg); 527 break; 528 default: 529 break; 530 } 531 return (error); 532} 533 534int 535vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, 536 mem_region_read_t memread, mem_region_write_t memwrite, 537 void *memarg) 538{ 539 int error; 540 541 if (!vie->decoded) 542 return (EINVAL); 543 544 switch (vie->op.op_type) { 545 case VIE_OP_TYPE_MOV: 546 error = emulate_mov(vm, vcpuid, gpa, vie, 547 memread, memwrite, memarg); 548 break; 549 case VIE_OP_TYPE_MOVSX: 550 case VIE_OP_TYPE_MOVZX: 551 error = emulate_movx(vm, vcpuid, gpa, vie, 552 memread, memwrite, memarg); 553 break; 554 case VIE_OP_TYPE_AND: 555 error = emulate_and(vm, vcpuid, gpa, vie, 556 memread, memwrite, memarg); 557 break; 558 case VIE_OP_TYPE_OR: 559 error = emulate_or(vm, vcpuid, gpa, vie, 560 memread, memwrite, memarg); 561 break; 562 default: 563 error = EINVAL; 564 break; 565 } 566 567 return (error); 568} 569 570int 571vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) 572{ 573 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 574 ("%s: invalid size %d", __func__, size)); 575 KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl)); 576 577 if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) 578 return (0); 579 580 return ((gla & (size - 1)) ? 1 : 0); 581} 582 583int 584vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla) 585{ 586 uint64_t mask; 587 588 if (cpu_mode != CPU_MODE_64BIT) 589 return (0); 590 591 /* 592 * The value of the bit 47 in the 'gla' should be replicated in the 593 * most significant 16 bits. 594 */ 595 mask = ~((1UL << 48) - 1); 596 if (gla & (1UL << 47)) 597 return ((gla & mask) != mask); 598 else 599 return ((gla & mask) != 0); 600} 601 602uint64_t 603vie_size2mask(int size) 604{ 605 KASSERT(size == 1 || size == 2 || size == 4 || size == 8, 606 ("vie_size2mask: invalid size %d", size)); 607 return (size2mask[size]); 608} 609 610int 611vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, 612 struct seg_desc *desc, uint64_t offset, int length, int addrsize, 613 int prot, uint64_t *gla) 614{ 615 uint64_t firstoff, low_limit, high_limit, segbase; 616 int glasize, type; 617 618 KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS, 619 ("%s: invalid segment %d", __func__, seg)); 620 KASSERT(length == 1 || length == 2 || length == 4 || length == 8, 621 ("%s: invalid operand size %d", __func__, length)); 622 KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0, 623 ("%s: invalid prot %#x", __func__, prot)); 624 625 firstoff = offset; 626 if (cpu_mode == CPU_MODE_64BIT) { 627 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address " 628 "size %d for cpu_mode %d", __func__, addrsize, cpu_mode)); 629 glasize = 8; 630 } else { 631 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address " 632 "size %d for cpu mode %d", __func__, addrsize, cpu_mode)); 633 glasize = 4; 634 /* 635 * If the segment selector is loaded with a NULL selector 636 * then the descriptor is unusable and attempting to use 637 * it results in a #GP(0). 638 */ 639 if (SEG_DESC_UNUSABLE(desc)) 640 return (-1); 641 642 /* 643 * The processor generates a #NP exception when a segment 644 * register is loaded with a selector that points to a 645 * descriptor that is not present. If this was the case then 646 * it would have been checked before the VM-exit. 647 */ 648 KASSERT(SEG_DESC_PRESENT(desc), ("segment %d not present: %#x", 649 seg, desc->access)); 650 651 /* 652 * The descriptor type must indicate a code/data segment. 653 */ 654 type = SEG_DESC_TYPE(desc); 655 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid " 656 "descriptor type %#x", seg, type)); 657 658 if (prot & PROT_READ) { 659 /* #GP on a read access to a exec-only code segment */ 660 if ((type & 0xA) == 0x8) 661 return (-1); 662 } 663 664 if (prot & PROT_WRITE) { 665 /* 666 * #GP on a write access to a code segment or a 667 * read-only data segment. 668 */ 669 if (type & 0x8) /* code segment */ 670 return (-1); 671 672 if ((type & 0xA) == 0) /* read-only data seg */ 673 return (-1); 674 } 675 676 /* 677 * 'desc->limit' is fully expanded taking granularity into 678 * account. 679 */ 680 if ((type & 0xC) == 0x4) { 681 /* expand-down data segment */ 682 low_limit = desc->limit + 1; 683 high_limit = SEG_DESC_DEF32(desc) ? 0xffffffff : 0xffff; 684 } else { 685 /* code segment or expand-up data segment */ 686 low_limit = 0; 687 high_limit = desc->limit; 688 } 689 690 while (length > 0) { 691 offset &= vie_size2mask(addrsize); 692 if (offset < low_limit || offset > high_limit) 693 return (-1); 694 offset++; 695 length--; 696 } 697 } 698 699 /* 700 * In 64-bit mode all segments except %fs and %gs have a segment 701 * base address of 0. 702 */ 703 if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && 704 seg != VM_REG_GUEST_GS) { 705 segbase = 0; 706 } else { 707 segbase = desc->base; 708 } 709 710 /* 711 * Truncate 'firstoff' to the effective address size before adding 712 * it to the segment base. 713 */ 714 firstoff &= vie_size2mask(addrsize); 715 *gla = (segbase + firstoff) & vie_size2mask(glasize); 716 return (0); 717} 718 719#ifdef _KERNEL 720void 721vie_init(struct vie *vie) 722{ 723 724 bzero(vie, sizeof(struct vie)); 725 726 vie->base_register = VM_REG_LAST; 727 vie->index_register = VM_REG_LAST; 728} 729 730static int 731pf_error_code(int usermode, int prot, int rsvd, uint64_t pte) 732{ 733 int error_code = 0; 734 735 if (pte & PG_V) 736 error_code |= PGEX_P; 737 if (prot & VM_PROT_WRITE) 738 error_code |= PGEX_W; 739 if (usermode) 740 error_code |= PGEX_U; 741 if (rsvd) 742 error_code |= PGEX_RSV; 743 if (prot & VM_PROT_EXECUTE) 744 error_code |= PGEX_I; 745 746 return (error_code); 747} 748 749static void 750ptp_release(void **cookie) 751{ 752 if (*cookie != NULL) { 753 vm_gpa_release(*cookie); 754 *cookie = NULL; 755 } 756} 757 758static void * 759ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) 760{ 761 void *ptr; 762 763 ptp_release(cookie); 764 ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); 765 return (ptr); 766} 767 768int 769vmm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, 770 uint64_t gla, int prot, uint64_t *gpa) 771{ 772 int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; 773 u_int retries; 774 uint64_t *ptpbase, ptpphys, pte, pgsize; 775 uint32_t *ptpbase32, pte32; 776 void *cookie; 777 778 usermode = (paging->cpl == 3 ? 1 : 0); 779 writable = prot & VM_PROT_WRITE; 780 cookie = NULL; 781 retval = 0; 782 retries = 0; 783restart: 784 ptpphys = paging->cr3; /* root of the page tables */ 785 ptp_release(&cookie); 786 if (retries++ > 0) 787 maybe_yield(); 788 789 if (vie_canonical_check(paging->cpu_mode, gla)) { 790 /* 791 * XXX assuming a non-stack reference otherwise a stack fault 792 * should be generated. 793 */ 794 vm_inject_gp(vm, vcpuid); 795 goto fault; 796 } 797 798 if (paging->paging_mode == PAGING_MODE_FLAT) { 799 *gpa = gla; 800 goto done; 801 } 802 803 if (paging->paging_mode == PAGING_MODE_32) { 804 nlevels = 2; 805 while (--nlevels >= 0) { 806 /* Zero out the lower 12 bits. */ 807 ptpphys &= ~0xfff; 808 809 ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); 810 811 if (ptpbase32 == NULL) 812 goto error; 813 814 ptpshift = PAGE_SHIFT + nlevels * 10; 815 ptpindex = (gla >> ptpshift) & 0x3FF; 816 pgsize = 1UL << ptpshift; 817 818 pte32 = ptpbase32[ptpindex]; 819 820 if ((pte32 & PG_V) == 0 || 821 (usermode && (pte32 & PG_U) == 0) || 822 (writable && (pte32 & PG_RW) == 0)) { 823 pfcode = pf_error_code(usermode, prot, 0, 824 pte32); 825 vm_inject_pf(vm, vcpuid, pfcode, gla); 826 goto fault; 827 } 828 829 /* 830 * Emulate the x86 MMU's management of the accessed 831 * and dirty flags. While the accessed flag is set 832 * at every level of the page table, the dirty flag 833 * is only set at the last level providing the guest 834 * physical address. 835 */ 836 if ((pte32 & PG_A) == 0) { 837 if (atomic_cmpset_32(&ptpbase32[ptpindex], 838 pte32, pte32 | PG_A) == 0) { 839 goto restart; 840 } 841 } 842 843 /* XXX must be ignored if CR4.PSE=0 */ 844 if (nlevels > 0 && (pte32 & PG_PS) != 0) 845 break; 846 847 ptpphys = pte32; 848 } 849 850 /* Set the dirty bit in the page table entry if necessary */ 851 if (writable && (pte32 & PG_M) == 0) { 852 if (atomic_cmpset_32(&ptpbase32[ptpindex], 853 pte32, pte32 | PG_M) == 0) { 854 goto restart; 855 } 856 } 857 858 /* Zero out the lower 'ptpshift' bits */ 859 pte32 >>= ptpshift; pte32 <<= ptpshift; 860 *gpa = pte32 | (gla & (pgsize - 1)); 861 goto done; 862 } 863 864 if (paging->paging_mode == PAGING_MODE_PAE) { 865 /* Zero out the lower 5 bits and the upper 32 bits */ 866 ptpphys &= 0xffffffe0UL; 867 868 ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); 869 if (ptpbase == NULL) 870 goto error; 871 872 ptpindex = (gla >> 30) & 0x3; 873 874 pte = ptpbase[ptpindex]; 875 876 if ((pte & PG_V) == 0) { 877 pfcode = pf_error_code(usermode, prot, 0, pte); 878 vm_inject_pf(vm, vcpuid, pfcode, gla); 879 goto fault; 880 } 881 882 ptpphys = pte; 883 884 nlevels = 2; 885 } else 886 nlevels = 4; 887 while (--nlevels >= 0) { 888 /* Zero out the lower 12 bits and the upper 12 bits */ 889 ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; 890 891 ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); 892 if (ptpbase == NULL) 893 goto error; 894 895 ptpshift = PAGE_SHIFT + nlevels * 9; 896 ptpindex = (gla >> ptpshift) & 0x1FF; 897 pgsize = 1UL << ptpshift; 898 899 pte = ptpbase[ptpindex]; 900 901 if ((pte & PG_V) == 0 || 902 (usermode && (pte & PG_U) == 0) || 903 (writable && (pte & PG_RW) == 0)) { 904 pfcode = pf_error_code(usermode, prot, 0, pte); 905 vm_inject_pf(vm, vcpuid, pfcode, gla); 906 goto fault; 907 } 908 909 /* Set the accessed bit in the page table entry */ 910 if ((pte & PG_A) == 0) { 911 if (atomic_cmpset_64(&ptpbase[ptpindex], 912 pte, pte | PG_A) == 0) { 913 goto restart; 914 } 915 } 916 917 if (nlevels > 0 && (pte & PG_PS) != 0) { 918 if (pgsize > 1 * GB) { 919 pfcode = pf_error_code(usermode, prot, 1, pte); 920 vm_inject_pf(vm, vcpuid, pfcode, gla); 921 goto fault; 922 } 923 break; 924 } 925 926 ptpphys = pte; 927 } 928 929 /* Set the dirty bit in the page table entry if necessary */ 930 if (writable && (pte & PG_M) == 0) { 931 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) 932 goto restart; 933 } 934 935 /* Zero out the lower 'ptpshift' bits and the upper 12 bits */ 936 pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12; 937 *gpa = pte | (gla & (pgsize - 1)); 938done: 939 ptp_release(&cookie); 940 return (retval); 941error: 942 retval = -1; 943 goto done; 944fault: 945 retval = 1; 946 goto done; 947} 948 949int 950vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *paging, 951 uint64_t rip, int inst_length, struct vie *vie) 952{ 953 int n, error, prot; 954 uint64_t gpa, off; 955 void *hpa, *cookie; 956 957 /* 958 * XXX cache previously fetched instructions using 'rip' as the tag 959 */ 960 961 prot = VM_PROT_READ | VM_PROT_EXECUTE; 962 if (inst_length > VIE_INST_SIZE) 963 panic("vmm_fetch_instruction: invalid length %d", inst_length); 964 965 /* Copy the instruction into 'vie' */ 966 while (vie->num_valid < inst_length) { 967 error = vmm_gla2gpa(vm, cpuid, paging, rip, prot, &gpa); 968 if (error) 969 return (error); 970 971 off = gpa & PAGE_MASK; 972 n = min(inst_length - vie->num_valid, PAGE_SIZE - off); 973 974 if ((hpa = vm_gpa_hold(vm, gpa, n, prot, &cookie)) == NULL) 975 break; 976 977 bcopy(hpa, &vie->inst[vie->num_valid], n); 978 979 vm_gpa_release(cookie); 980 981 rip += n; 982 vie->num_valid += n; 983 } 984 985 if (vie->num_valid == inst_length) 986 return (0); 987 else 988 return (-1); 989} 990 991static int 992vie_peek(struct vie *vie, uint8_t *x) 993{ 994 995 if (vie->num_processed < vie->num_valid) { 996 *x = vie->inst[vie->num_processed]; 997 return (0); 998 } else 999 return (-1); 1000} 1001 1002static void 1003vie_advance(struct vie *vie) 1004{ 1005 1006 vie->num_processed++; 1007} 1008 1009static int 1010decode_rex(struct vie *vie) 1011{ 1012 uint8_t x; 1013 1014 if (vie_peek(vie, &x)) 1015 return (-1); 1016 1017 if (x >= 0x40 && x <= 0x4F) { 1018 vie->rex_present = 1; 1019 1020 vie->rex_w = x & 0x8 ? 1 : 0; 1021 vie->rex_r = x & 0x4 ? 1 : 0; 1022 vie->rex_x = x & 0x2 ? 1 : 0; 1023 vie->rex_b = x & 0x1 ? 1 : 0; 1024 1025 vie_advance(vie); 1026 } 1027 1028 return (0); 1029} 1030 1031static int 1032decode_two_byte_opcode(struct vie *vie) 1033{ 1034 uint8_t x; 1035 1036 if (vie_peek(vie, &x)) 1037 return (-1); 1038 1039 vie->op = two_byte_opcodes[x]; 1040 1041 if (vie->op.op_type == VIE_OP_TYPE_NONE) 1042 return (-1); 1043 1044 vie_advance(vie); 1045 return (0); 1046} 1047 1048static int 1049decode_opcode(struct vie *vie) 1050{ 1051 uint8_t x; 1052 1053 if (vie_peek(vie, &x)) 1054 return (-1); 1055 1056 vie->op = one_byte_opcodes[x]; 1057 1058 if (vie->op.op_type == VIE_OP_TYPE_NONE) 1059 return (-1); 1060 1061 vie_advance(vie); 1062 1063 if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE) 1064 return (decode_two_byte_opcode(vie)); 1065 1066 return (0); 1067} 1068 1069static int 1070decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode) 1071{ 1072 uint8_t x; 1073 1074 if (vie_peek(vie, &x)) 1075 return (-1); 1076 1077 vie->mod = (x >> 6) & 0x3; 1078 vie->rm = (x >> 0) & 0x7; 1079 vie->reg = (x >> 3) & 0x7; 1080 1081 /* 1082 * A direct addressing mode makes no sense in the context of an EPT 1083 * fault. There has to be a memory access involved to cause the 1084 * EPT fault. 1085 */ 1086 if (vie->mod == VIE_MOD_DIRECT) 1087 return (-1); 1088 1089 if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) || 1090 (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) { 1091 /* 1092 * Table 2-5: Special Cases of REX Encodings 1093 * 1094 * mod=0, r/m=5 is used in the compatibility mode to 1095 * indicate a disp32 without a base register. 1096 * 1097 * mod!=3, r/m=4 is used in the compatibility mode to 1098 * indicate that the SIB byte is present. 1099 * 1100 * The 'b' bit in the REX prefix is don't care in 1101 * this case. 1102 */ 1103 } else { 1104 vie->rm |= (vie->rex_b << 3); 1105 } 1106 1107 vie->reg |= (vie->rex_r << 3); 1108 1109 /* SIB */ 1110 if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB) 1111 goto done; 1112 1113 vie->base_register = gpr_map[vie->rm]; 1114 1115 switch (vie->mod) { 1116 case VIE_MOD_INDIRECT_DISP8: 1117 vie->disp_bytes = 1; 1118 break; 1119 case VIE_MOD_INDIRECT_DISP32: 1120 vie->disp_bytes = 4; 1121 break; 1122 case VIE_MOD_INDIRECT: 1123 if (vie->rm == VIE_RM_DISP32) { 1124 vie->disp_bytes = 4; 1125 /* 1126 * Table 2-7. RIP-Relative Addressing 1127 * 1128 * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32 1129 * whereas in compatibility mode it just implies disp32. 1130 */ 1131 1132 if (cpu_mode == CPU_MODE_64BIT) 1133 vie->base_register = VM_REG_GUEST_RIP; 1134 else 1135 vie->base_register = VM_REG_LAST; 1136 } 1137 break; 1138 } 1139 1140done: 1141 vie_advance(vie); 1142 1143 return (0); 1144} 1145 1146static int 1147decode_sib(struct vie *vie) 1148{ 1149 uint8_t x; 1150 1151 /* Proceed only if SIB byte is present */ 1152 if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB) 1153 return (0); 1154 1155 if (vie_peek(vie, &x)) 1156 return (-1); 1157 1158 /* De-construct the SIB byte */ 1159 vie->ss = (x >> 6) & 0x3; 1160 vie->index = (x >> 3) & 0x7; 1161 vie->base = (x >> 0) & 0x7; 1162 1163 /* Apply the REX prefix modifiers */ 1164 vie->index |= vie->rex_x << 3; 1165 vie->base |= vie->rex_b << 3; 1166 1167 switch (vie->mod) { 1168 case VIE_MOD_INDIRECT_DISP8: 1169 vie->disp_bytes = 1; 1170 break; 1171 case VIE_MOD_INDIRECT_DISP32: 1172 vie->disp_bytes = 4; 1173 break; 1174 } 1175 1176 if (vie->mod == VIE_MOD_INDIRECT && 1177 (vie->base == 5 || vie->base == 13)) { 1178 /* 1179 * Special case when base register is unused if mod = 0 1180 * and base = %rbp or %r13. 1181 * 1182 * Documented in: 1183 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 1184 * Table 2-5: Special Cases of REX Encodings 1185 */ 1186 vie->disp_bytes = 4; 1187 } else { 1188 vie->base_register = gpr_map[vie->base]; 1189 } 1190 1191 /* 1192 * All encodings of 'index' are valid except for %rsp (4). 1193 * 1194 * Documented in: 1195 * Table 2-3: 32-bit Addressing Forms with the SIB Byte 1196 * Table 2-5: Special Cases of REX Encodings 1197 */ 1198 if (vie->index != 4) 1199 vie->index_register = gpr_map[vie->index]; 1200 1201 /* 'scale' makes sense only in the context of an index register */ 1202 if (vie->index_register < VM_REG_LAST) 1203 vie->scale = 1 << vie->ss; 1204 1205 vie_advance(vie); 1206 1207 return (0); 1208} 1209 1210static int 1211decode_displacement(struct vie *vie) 1212{ 1213 int n, i; 1214 uint8_t x; 1215 1216 union { 1217 char buf[4]; 1218 int8_t signed8; 1219 int32_t signed32; 1220 } u; 1221 1222 if ((n = vie->disp_bytes) == 0) 1223 return (0); 1224 1225 if (n != 1 && n != 4) 1226 panic("decode_displacement: invalid disp_bytes %d", n); 1227 1228 for (i = 0; i < n; i++) { 1229 if (vie_peek(vie, &x)) 1230 return (-1); 1231 1232 u.buf[i] = x; 1233 vie_advance(vie); 1234 } 1235 1236 if (n == 1) 1237 vie->displacement = u.signed8; /* sign-extended */ 1238 else 1239 vie->displacement = u.signed32; /* sign-extended */ 1240 1241 return (0); 1242} 1243 1244static int 1245decode_immediate(struct vie *vie) 1246{ 1247 int i, n; 1248 uint8_t x; 1249 union { 1250 char buf[4]; 1251 int8_t signed8; 1252 int32_t signed32; 1253 } u; 1254 1255 /* Figure out immediate operand size (if any) */ 1256 if (vie->op.op_flags & VIE_OP_F_IMM) 1257 vie->imm_bytes = 4; 1258 else if (vie->op.op_flags & VIE_OP_F_IMM8) 1259 vie->imm_bytes = 1; 1260 1261 if ((n = vie->imm_bytes) == 0) 1262 return (0); 1263 1264 if (n != 1 && n != 4) 1265 panic("decode_immediate: invalid imm_bytes %d", n); 1266 1267 for (i = 0; i < n; i++) { 1268 if (vie_peek(vie, &x)) 1269 return (-1); 1270 1271 u.buf[i] = x; 1272 vie_advance(vie); 1273 } 1274 1275 if (n == 1) 1276 vie->immediate = u.signed8; /* sign-extended */ 1277 else 1278 vie->immediate = u.signed32; /* sign-extended */ 1279 1280 return (0); 1281} 1282 1283/* 1284 * Verify that all the bytes in the instruction buffer were consumed. 1285 */ 1286static int 1287verify_inst_length(struct vie *vie) 1288{ 1289 1290 if (vie->num_processed == vie->num_valid) 1291 return (0); 1292 else 1293 return (-1); 1294} 1295 1296/* 1297 * Verify that the 'guest linear address' provided as collateral of the nested 1298 * page table fault matches with our instruction decoding. 1299 */ 1300static int 1301verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) 1302{ 1303 int error; 1304 uint64_t base, idx; 1305 1306 /* Skip 'gla' verification */ 1307 if (gla == VIE_INVALID_GLA) 1308 return (0); 1309 1310 base = 0; 1311 if (vie->base_register != VM_REG_LAST) { 1312 error = vm_get_register(vm, cpuid, vie->base_register, &base); 1313 if (error) { 1314 printf("verify_gla: error %d getting base reg %d\n", 1315 error, vie->base_register); 1316 return (-1); 1317 } 1318 1319 /* 1320 * RIP-relative addressing starts from the following 1321 * instruction 1322 */ 1323 if (vie->base_register == VM_REG_GUEST_RIP) 1324 base += vie->num_valid; 1325 } 1326 1327 idx = 0; 1328 if (vie->index_register != VM_REG_LAST) { 1329 error = vm_get_register(vm, cpuid, vie->index_register, &idx); 1330 if (error) { 1331 printf("verify_gla: error %d getting index reg %d\n", 1332 error, vie->index_register); 1333 return (-1); 1334 } 1335 } 1336 1337 if (base + vie->scale * idx + vie->displacement != gla) { 1338 printf("verify_gla mismatch: " 1339 "base(0x%0lx), scale(%d), index(0x%0lx), " 1340 "disp(0x%0lx), gla(0x%0lx)\n", 1341 base, vie->scale, idx, vie->displacement, gla); 1342 return (-1); 1343 } 1344 1345 return (0); 1346} 1347 1348int 1349vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, 1350 enum vm_cpu_mode cpu_mode, struct vie *vie) 1351{ 1352 1353 if (cpu_mode == CPU_MODE_64BIT) { 1354 if (decode_rex(vie)) 1355 return (-1); 1356 } 1357 1358 if (decode_opcode(vie)) 1359 return (-1); 1360 1361 if (decode_modrm(vie, cpu_mode)) 1362 return (-1); 1363 1364 if (decode_sib(vie)) 1365 return (-1); 1366 1367 if (decode_displacement(vie)) 1368 return (-1); 1369 1370 if (decode_immediate(vie)) 1371 return (-1); 1372 1373 if (verify_inst_length(vie)) 1374 return (-1); 1375 1376 if (verify_gla(vm, cpuid, gla, vie)) 1377 return (-1); 1378 1379 vie->decoded = 1; /* success */ 1380 1381 return (0); 1382} 1383#endif /* _KERNEL */ 1384