1/*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

/* Guest physical memory below this limit goes in the "lowmem" segment. */
#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
/* Guest physical address where the "highmem" segment begins. */
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

/* Create/destroy a named VM through the hw.vmm sysctl interface. */
#define	CREATE(x)  sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define	DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))

/*
 * Open the vmm(4) device node for VM 'name' (/dev/vmm/<name>).
 *
 * Returns the open file descriptor, or -1 with errno set by open(2).
 */
static int
vm_device_open(const char *name)
{
	int fd, len;
	char *vmfile;

	len = strlen("/dev/vmm/") + strlen(name) + 1;
	vmfile = malloc(len);
	assert(vmfile != NULL);
	snprintf(vmfile, len, "/dev/vmm/%s", name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	free(vmfile);
	return (fd);
}

/*
 * Create a new VM named 'name'.  Returns 0 on success or -1 with errno
 * set by the sysctl.
 */
int
vm_create(const char *name)
{
	/* Try to load vmm(4) module before creating a guest.
*/ 106 if (modfind("vmm") < 0) 107 kldload("vmm"); 108 return (CREATE(name)); 109} 110 111struct vmctx * 112vm_open(const char *name) 113{ 114 struct vmctx *vm; 115 int saved_errno; 116 117 vm = malloc(sizeof(struct vmctx) + strlen(name) + 1); 118 assert(vm != NULL); 119 120 vm->fd = -1; 121 vm->memflags = 0; 122 vm->name = (char *)(vm + 1); 123 strcpy(vm->name, name); 124 memset(vm->memsegs, 0, sizeof(vm->memsegs)); 125 126 if ((vm->fd = vm_device_open(vm->name)) < 0) 127 goto err; 128 129 return (vm); 130err: 131 saved_errno = errno; 132 free(vm); 133 errno = saved_errno; 134 return (NULL); 135} 136 137void 138vm_close(struct vmctx *vm) 139{ 140 assert(vm != NULL); 141 142 close(vm->fd); 143 free(vm); 144} 145 146void 147vm_destroy(struct vmctx *vm) 148{ 149 assert(vm != NULL); 150 151 if (vm->fd >= 0) 152 close(vm->fd); 153 DESTROY(vm->name); 154 155 free(vm); 156} 157 158struct vcpu * 159vm_vcpu_open(struct vmctx *ctx, int vcpuid) 160{ 161 struct vcpu *vcpu; 162 163 vcpu = malloc(sizeof(*vcpu)); 164 vcpu->ctx = ctx; 165 vcpu->vcpuid = vcpuid; 166 return (vcpu); 167} 168 169void 170vm_vcpu_close(struct vcpu *vcpu) 171{ 172 free(vcpu); 173} 174 175int 176vcpu_id(struct vcpu *vcpu) 177{ 178 return (vcpu->vcpuid); 179} 180 181int 182vm_parse_memsize(const char *opt, size_t *ret_memsize) 183{ 184 char *endptr; 185 size_t optval; 186 int error; 187 188 optval = strtoul(opt, &endptr, 0); 189 if (*opt != '\0' && *endptr == '\0') { 190 /* 191 * For the sake of backward compatibility if the memory size 192 * specified on the command line is less than a megabyte then 193 * it is interpreted as being in units of MB. 
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}

/* Highest guest physical address usable by the "lowmem" segment. */
uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

/* Set VM_MEM_F_* flags applied to subsequently created mappings. */
void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		/* Same gpa but different attributes is a conflict. */
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

/* Report the host base address and low/high segment sizes for 'ctx'. */
int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
	size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

/* Remove the guest mapping covering [gpa, gpa+len). */
int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

/*
 * Fetch the first memory mapping at or above *gpa.  On success all the
 * output parameters describe that mapping.
 */
int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		/* Equal when both are unnamed, or both names match. */
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

/*
 * Create memory segment 'segid' of 'len' bytes.  If the segment already
 * exists it must match 'len' and 'name' exactly, otherwise fail with
 * EINVAL.  Only device memory segments carry a name.
 */
static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

/*
 * Look up memory segment 'segid': its length is returned in *lenp and
 * its name copied into 'namebuf'.  Fails with ENAMETOOLONG if the name
 * does not fit in 'bufsize'.
 */
int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

/*
 * Map 'len' bytes of the SYSMEM segment at guest physical address 'gpa'
 * and mirror the mapping at 'base + gpa' in the host address space.
 */
static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len,
PROT_ALL); 398 if (error) 399 return (error); 400 401 flags = MAP_SHARED | MAP_FIXED; 402 if ((ctx->memflags & VM_MEM_F_INCORE) == 0) 403 flags |= MAP_NOCORE; 404 405 /* mmap into the process address space on the host */ 406 ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); 407 if (ptr == MAP_FAILED) 408 return (-1); 409 410 return (0); 411} 412 413int 414vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) 415{ 416 size_t objsize, len; 417 vm_paddr_t gpa; 418 char *baseaddr, *ptr; 419 int error; 420 421 assert(vms == VM_MMAP_ALL); 422 423 /* 424 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create 425 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder. 426 */ 427 if (memsize > VM_LOWMEM_LIMIT) { 428 ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT; 429 ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT; 430 objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size; 431 } else { 432 ctx->memsegs[VM_MEMSEG_LOW].size = memsize; 433 ctx->memsegs[VM_MEMSEG_HIGH].size = 0; 434 objsize = memsize; 435 } 436 437 error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); 438 if (error) 439 return (error); 440 441 /* 442 * Stake out a contiguous region covering the guest physical memory 443 * and the adjoining guard regions. 
444 */ 445 len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; 446 ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); 447 if (ptr == MAP_FAILED) 448 return (-1); 449 450 baseaddr = ptr + VM_MMAP_GUARD_SIZE; 451 if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) { 452 gpa = VM_HIGHMEM_BASE; 453 len = ctx->memsegs[VM_MEMSEG_HIGH].size; 454 error = setup_memory_segment(ctx, gpa, len, baseaddr); 455 if (error) 456 return (error); 457 } 458 459 if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) { 460 gpa = 0; 461 len = ctx->memsegs[VM_MEMSEG_LOW].size; 462 error = setup_memory_segment(ctx, gpa, len, baseaddr); 463 if (error) 464 return (error); 465 } 466 467 ctx->baseaddr = baseaddr; 468 469 return (0); 470} 471 472/* 473 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in 474 * the lowmem or highmem regions. 475 * 476 * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. 477 * The instruction emulation code depends on this behavior. 478 */ 479void * 480vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) 481{ 482 vm_size_t lowsize, highsize; 483 484 lowsize = ctx->memsegs[VM_MEMSEG_LOW].size; 485 if (lowsize > 0) { 486 if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize) 487 return (ctx->baseaddr + gaddr); 488 } 489 490 highsize = ctx->memsegs[VM_MEMSEG_HIGH].size; 491 if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) { 492 if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize && 493 gaddr + len <= VM_HIGHMEM_BASE + highsize) 494 return (ctx->baseaddr + gaddr); 495 } 496 497 return (NULL); 498} 499 500vm_paddr_t 501vm_rev_map_gpa(struct vmctx *ctx, void *addr) 502{ 503 vm_paddr_t offaddr; 504 vm_size_t lowsize, highsize; 505 506 offaddr = (char *)addr - ctx->baseaddr; 507 508 lowsize = ctx->memsegs[VM_MEMSEG_LOW].size; 509 if (lowsize > 0) 510 if (offaddr <= lowsize) 511 return (offaddr); 512 513 highsize = ctx->memsegs[VM_MEMSEG_HIGH].size; 514 if (highsize > 0) 515 if (offaddr >= 
VM_HIGHMEM_BASE && 516 offaddr < VM_HIGHMEM_BASE + highsize) 517 return (offaddr); 518 519 return ((vm_paddr_t)-1); 520} 521 522const char * 523vm_get_name(struct vmctx *ctx) 524{ 525 526 return (ctx->name); 527} 528 529size_t 530vm_get_lowmem_size(struct vmctx *ctx) 531{ 532 533 return (ctx->memsegs[VM_MEMSEG_LOW].size); 534} 535 536vm_paddr_t 537vm_get_highmem_base(struct vmctx *ctx __unused) 538{ 539 540 return (VM_HIGHMEM_BASE); 541} 542 543size_t 544vm_get_highmem_size(struct vmctx *ctx) 545{ 546 547 return (ctx->memsegs[VM_MEMSEG_HIGH].size); 548} 549 550void * 551vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) 552{ 553 char pathname[MAXPATHLEN]; 554 size_t len2; 555 char *base, *ptr; 556 int fd, error, flags; 557 558 fd = -1; 559 ptr = MAP_FAILED; 560 if (name == NULL || strlen(name) == 0) { 561 errno = EINVAL; 562 goto done; 563 } 564 565 error = vm_alloc_memseg(ctx, segid, len, name); 566 if (error) 567 goto done; 568 569 strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); 570 strlcat(pathname, ctx->name, sizeof(pathname)); 571 strlcat(pathname, ".", sizeof(pathname)); 572 strlcat(pathname, name, sizeof(pathname)); 573 574 fd = open(pathname, O_RDWR); 575 if (fd < 0) 576 goto done; 577 578 /* 579 * Stake out a contiguous region covering the device memory and the 580 * adjoining guard regions. 
581 */ 582 len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; 583 base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 584 0); 585 if (base == MAP_FAILED) 586 goto done; 587 588 flags = MAP_SHARED | MAP_FIXED; 589 if ((ctx->memflags & VM_MEM_F_INCORE) == 0) 590 flags |= MAP_NOCORE; 591 592 /* mmap the devmem region in the host address space */ 593 ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); 594done: 595 if (fd >= 0) 596 close(fd); 597 return (ptr); 598} 599 600int 601vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg) 602{ 603 /* 604 * XXX: fragile, handle with care 605 * Assumes that the first field of the ioctl data 606 * is the vcpuid. 607 */ 608 *(int *)arg = vcpu->vcpuid; 609 return (ioctl(vcpu->ctx->fd, cmd, arg)); 610} 611 612int 613vm_set_register(struct vcpu *vcpu, int reg, uint64_t val) 614{ 615 int error; 616 struct vm_register vmreg; 617 618 bzero(&vmreg, sizeof(vmreg)); 619 vmreg.regnum = reg; 620 vmreg.regval = val; 621 622 error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg); 623 return (error); 624} 625 626int 627vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val) 628{ 629 int error; 630 struct vm_register vmreg; 631 632 bzero(&vmreg, sizeof(vmreg)); 633 vmreg.regnum = reg; 634 635 error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg); 636 *ret_val = vmreg.regval; 637 return (error); 638} 639 640int 641vm_set_register_set(struct vcpu *vcpu, unsigned int count, 642 const int *regnums, uint64_t *regvals) 643{ 644 int error; 645 struct vm_register_set vmregset; 646 647 bzero(&vmregset, sizeof(vmregset)); 648 vmregset.count = count; 649 vmregset.regnums = regnums; 650 vmregset.regvals = regvals; 651 652 error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset); 653 return (error); 654} 655 656int 657vm_get_register_set(struct vcpu *vcpu, unsigned int count, 658 const int *regnums, uint64_t *regvals) 659{ 660 int error; 661 struct vm_register_set vmregset; 662 663 bzero(&vmregset, sizeof(vmregset)); 664 
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

/* Run the vcpu until it exits; exit details are returned in 'vmrun'. */
int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

/* Request a VM-wide suspend of type 'how'. */
int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

/* Translate a capability name to its VM_CAP_* value; -1 if unknown. */
int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

/* Inverse of vm_capability_name2type(); NULL if 'type' is out of range. */
const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

/*
 * Fetch all stats for 'vcpu' into a thread-local buffer that is reused
 * (and grown) across calls.  Returns the buffer and optionally the
 * entry count and sample timestamp, or NULL if no stats were retrieved.
 */
uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
    int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	/* The ioctl returns at most nitems(statbuf) entries per call. */
	for (index = 0;; index +=
	    nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		/* Grow the thread-local buffer to hold this batch. */
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		/* A short batch means there are no more entries. */
		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}

/*
 * Return the description of stat 'index'.  The string lives in static
 * storage and is overwritten by the next call.
 */
const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

#ifdef __amd64__
/* Fetch the page-table entries the kernel used to translate 'gpa'. */
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

/*
 * Translate guest linear address 'gla' to a guest physical address
 * under 'paging' with access type 'prot'.  On success *fault reports
 * whether the walk faulted and *gpa holds the translation.
 */
int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

/* Like vm_gla2gpa() but uses the no-fault variant of the ioctl. */
int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
/*
 * Build an iovec describing the host mapping of guest linear range
 * [gla, gla+len), one entry per guest page.  Returns 0 on success,
 * EFAULT if a page does not map guest memory, or the error/fault from
 * the address translation.
 */
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty. This is used by the instruction
	 * emulation code shared with the kernel. The in-kernel
	 * version of this is non-empty.
914 */ 915} 916 917void 918vm_copyin(struct iovec *iov, void *vp, size_t len) 919{ 920 const char *src; 921 char *dst; 922 size_t n; 923 924 dst = vp; 925 while (len) { 926 assert(iov->iov_len); 927 n = min(len, iov->iov_len); 928 src = iov->iov_base; 929 bcopy(src, dst, n); 930 931 iov++; 932 dst += n; 933 len -= n; 934 } 935} 936 937void 938vm_copyout(const void *vp, struct iovec *iov, size_t len) 939{ 940 const char *src; 941 char *dst; 942 size_t n; 943 944 src = vp; 945 while (len) { 946 assert(iov->iov_len); 947 n = min(len, iov->iov_len); 948 dst = iov->iov_base; 949 bcopy(src, dst, n); 950 951 iov++; 952 src += n; 953 len -= n; 954 } 955} 956 957static int 958vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) 959{ 960 struct vm_cpuset vm_cpuset; 961 int error; 962 963 bzero(&vm_cpuset, sizeof(struct vm_cpuset)); 964 vm_cpuset.which = which; 965 vm_cpuset.cpusetsize = sizeof(cpuset_t); 966 vm_cpuset.cpus = cpus; 967 968 error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); 969 return (error); 970} 971 972int 973vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) 974{ 975 976 return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); 977} 978 979int 980vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) 981{ 982 983 return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); 984} 985 986int 987vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) 988{ 989 990 return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); 991} 992 993int 994vm_activate_cpu(struct vcpu *vcpu) 995{ 996 struct vm_activate_cpu ac; 997 int error; 998 999 bzero(&ac, sizeof(struct vm_activate_cpu)); 1000 error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac); 1001 return (error); 1002} 1003 1004int 1005vm_suspend_all_cpus(struct vmctx *ctx) 1006{ 1007 struct vm_activate_cpu ac; 1008 int error; 1009 1010 bzero(&ac, sizeof(struct vm_activate_cpu)); 1011 ac.vcpuid = -1; 1012 error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); 1013 return (error); 1014} 1015 1016int 1017vm_suspend_cpu(struct vcpu *vcpu) 1018{ 1019 struct vm_activate_cpu 
	    ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

/* Resume a single suspended vcpu. */
int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

/* Resume every vcpu (vcpuid -1 means "all"). */
int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
/* Fetch the pending event-injection state (info1/info2) for 'vcpu'. */
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

/* Set the pending event-injection state for 'vcpu'. */
int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif

#ifdef WITH_VMMAPI_SNAPSHOT
/* Ask the kernel to restart the current instruction on 'vcpu'. */
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

/* Forward a snapshot request for the device described by 'meta'. */
int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

/*
 * Issue VM_RESTORE_TIME after a restore; presumably this resynchronizes
 * guest time state — the exact semantics are defined kernel-side.
 */
int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

1112int 1113vm_set_topology(struct vmctx *ctx, 1114 uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) 1115{ 1116 struct vm_cpu_topology topology; 1117 1118 bzero(&topology, sizeof (struct vm_cpu_topology)); 1119 topology.sockets = sockets; 1120 topology.cores = cores; 1121 topology.threads = threads; 1122 topology.maxcpus = maxcpus; 1123 return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); 1124} 1125 1126int 1127vm_get_topology(struct vmctx *ctx, 1128 uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) 1129{ 1130 struct vm_cpu_topology topology; 1131 int error; 1132 1133 bzero(&topology, sizeof (struct vm_cpu_topology)); 1134 error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); 1135 if (error == 0) { 1136 *sockets = topology.sockets; 1137 *cores = topology.cores; 1138 *threads = topology.threads; 1139 *maxcpus = topology.maxcpus; 1140 } 1141 return (error); 1142} 1143 1144int 1145vm_limit_rights(struct vmctx *ctx) 1146{ 1147 cap_rights_t rights; 1148 1149 cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); 1150 if (caph_rights_limit(ctx->fd, &rights) != 0) 1151 return (-1); 1152 if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0) 1153 return (-1); 1154 return (0); 1155} 1156 1157/* 1158 * Avoid using in new code. Operations on the fd should be wrapped here so that 1159 * capability rights can be kept in sync. 1160 */ 1161int 1162vm_get_device_fd(struct vmctx *ctx) 1163{ 1164 1165 return (ctx->fd); 1166} 1167 1168/* Legacy interface, do not use. */ 1169const cap_ioctl_t * 1170vm_get_ioctls(size_t *len) 1171{ 1172 cap_ioctl_t *cmds; 1173 size_t sz; 1174 1175 if (len == NULL) { 1176 sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]); 1177 cmds = malloc(sz); 1178 if (cmds == NULL) 1179 return (NULL); 1180 bcopy(vm_ioctl_cmds, cmds, sz); 1181 return (cmds); 1182 } 1183 1184 *len = vm_ioctl_ncmds; 1185 return (NULL); 1186} 1187