vmm_dev.c revision 262350
1/*- 2 * Copyright (c) 2011 NetApp, Inc. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 * $FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 262350 2014-02-23 00:46:05Z jhb $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/sys/amd64/vmm/vmm_dev.c 262350 2014-02-23 00:46:05Z jhb $"); 31 32#include <sys/param.h> 33#include <sys/kernel.h> 34#include <sys/queue.h> 35#include <sys/lock.h> 36#include <sys/mutex.h> 37#include <sys/malloc.h> 38#include <sys/conf.h> 39#include <sys/sysctl.h> 40#include <sys/libkern.h> 41#include <sys/ioccom.h> 42#include <sys/mman.h> 43#include <sys/uio.h> 44 45#include <vm/vm.h> 46#include <vm/pmap.h> 47#include <vm/vm_map.h> 48 49#include <machine/vmparam.h> 50#include <machine/vmm.h> 51#include <machine/vmm_dev.h> 52 53#include "vmm_lapic.h" 54#include "vmm_stat.h" 55#include "vmm_mem.h" 56#include "io/ppt.h" 57#include "io/vioapic.h" 58#include "io/vhpet.h" 59 60struct vmmdev_softc { 61 struct vm *vm; /* vm instance cookie */ 62 struct cdev *cdev; 63 SLIST_ENTRY(vmmdev_softc) link; 64 int flags; 65}; 66#define VSC_LINKED 0x01 67 68static SLIST_HEAD(, vmmdev_softc) head; 69 70static struct mtx vmmdev_mtx; 71 72static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev"); 73 74SYSCTL_DECL(_hw_vmm); 75 76static struct vmmdev_softc * 77vmmdev_lookup(const char *name) 78{ 79 struct vmmdev_softc *sc; 80 81#ifdef notyet /* XXX kernel is not compiled with invariants */ 82 mtx_assert(&vmmdev_mtx, MA_OWNED); 83#endif 84 85 SLIST_FOREACH(sc, &head, link) { 86 if (strcmp(name, vm_name(sc->vm)) == 0) 87 break; 88 } 89 90 return (sc); 91} 92 93static struct vmmdev_softc * 94vmmdev_lookup2(struct cdev *cdev) 95{ 96 97 return (cdev->si_drv1); 98} 99 100static int 101vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags) 102{ 103 int error, off, c, prot; 104 vm_paddr_t gpa; 105 void *hpa, *cookie; 106 struct vmmdev_softc *sc; 107 108 static char zerobuf[PAGE_SIZE]; 109 110 error = 0; 111 sc = vmmdev_lookup2(cdev); 112 if (sc == NULL) 113 error = ENXIO; 114 115 prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ); 116 while (uio->uio_resid > 0 && error == 0) { 117 gpa = uio->uio_offset; 118 off = gpa & PAGE_MASK; 119 c = min(uio->uio_resid, PAGE_SIZE - off); 120 121 /* 122 * The VM has a hole in its physical memory map. If we want to 123 * use 'dd' to inspect memory beyond the hole we need to 124 * provide bogus data for memory that lies in the hole. 125 * 126 * Since this device does not support lseek(2), dd(1) will 127 * read(2) blocks of data to simulate the lseek(2). 128 */ 129 hpa = vm_gpa_hold(sc->vm, gpa, c, prot, &cookie); 130 if (hpa == NULL) { 131 if (uio->uio_rw == UIO_READ) 132 error = uiomove(zerobuf, c, uio); 133 else 134 error = EFAULT; 135 } else { 136 error = uiomove(hpa, c, uio); 137 vm_gpa_release(cookie); 138 } 139 } 140 return (error); 141} 142 143static int 144vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag, 145 struct thread *td) 146{ 147 int error, vcpu, state_changed; 148 struct vmmdev_softc *sc; 149 struct vm_memory_segment *seg; 150 struct vm_register *vmreg; 151 struct vm_seg_desc *vmsegdesc; 152 struct vm_run *vmrun; 153 struct vm_event *vmevent; 154 struct vm_lapic_irq *vmirq; 155 struct vm_lapic_msi *vmmsi; 156 struct vm_ioapic_irq *ioapic_irq; 157 struct vm_capability *vmcap; 158 struct vm_pptdev *pptdev; 159 struct vm_pptdev_mmio *pptmmio; 160 struct vm_pptdev_msi *pptmsi; 161 struct vm_pptdev_msix *pptmsix; 162 struct vm_nmi *vmnmi; 163 struct vm_stats *vmstats; 164 struct vm_stat_desc *statdesc; 165 struct vm_x2apic *x2apic; 166 struct vm_gpa_pte *gpapte; 167 168 sc = vmmdev_lookup2(cdev); 169 if (sc == NULL) 170 return (ENXIO); 171 172 vcpu = -1; 173 state_changed = 0; 174 175 /* 176 * Some VMM ioctls can operate only on vcpus that are not running. 177 */ 178 switch (cmd) { 179 case VM_RUN: 180 case VM_GET_REGISTER: 181 case VM_SET_REGISTER: 182 case VM_GET_SEGMENT_DESCRIPTOR: 183 case VM_SET_SEGMENT_DESCRIPTOR: 184 case VM_INJECT_EVENT: 185 case VM_GET_CAPABILITY: 186 case VM_SET_CAPABILITY: 187 case VM_PPTDEV_MSI: 188 case VM_PPTDEV_MSIX: 189 case VM_SET_X2APIC_STATE: 190 /* 191 * XXX fragile, handle with care 192 * Assumes that the first field of the ioctl data is the vcpu. 193 */ 194 vcpu = *(int *)data; 195 if (vcpu < 0 || vcpu >= VM_MAXCPU) { 196 error = EINVAL; 197 goto done; 198 } 199 200 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN); 201 if (error) 202 goto done; 203 204 state_changed = 1; 205 break; 206 207 case VM_MAP_PPTDEV_MMIO: 208 case VM_BIND_PPTDEV: 209 case VM_UNBIND_PPTDEV: 210 case VM_MAP_MEMORY: 211 /* 212 * ioctls that operate on the entire virtual machine must 213 * prevent all vcpus from running. 214 */ 215 error = 0; 216 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { 217 error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN); 218 if (error) 219 break; 220 } 221 222 if (error) { 223 while (--vcpu >= 0) 224 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 225 goto done; 226 } 227 228 state_changed = 2; 229 break; 230 231 default: 232 break; 233 } 234 235 switch(cmd) { 236 case VM_RUN: 237 vmrun = (struct vm_run *)data; 238 error = vm_run(sc->vm, vmrun); 239 break; 240 case VM_STAT_DESC: { 241 statdesc = (struct vm_stat_desc *)data; 242 error = vmm_stat_desc_copy(statdesc->index, 243 statdesc->desc, sizeof(statdesc->desc)); 244 break; 245 } 246 case VM_STATS: { 247 CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); 248 vmstats = (struct vm_stats *)data; 249 getmicrotime(&vmstats->tv); 250 error = vmm_stat_copy(sc->vm, vmstats->cpuid, 251 &vmstats->num_entries, vmstats->statbuf); 252 break; 253 } 254 case VM_PPTDEV_MSI: 255 pptmsi = (struct vm_pptdev_msi *)data; 256 error = ppt_setup_msi(sc->vm, pptmsi->vcpu, 257 pptmsi->bus, pptmsi->slot, pptmsi->func, 258 pptmsi->addr, pptmsi->msg, 259 pptmsi->numvec); 260 break; 261 case VM_PPTDEV_MSIX: 262 pptmsix = (struct vm_pptdev_msix *)data; 263 error = ppt_setup_msix(sc->vm, pptmsix->vcpu, 264 pptmsix->bus, pptmsix->slot, 265 pptmsix->func, pptmsix->idx, 266 pptmsix->addr, pptmsix->msg, 267 pptmsix->vector_control); 268 break; 269 case VM_MAP_PPTDEV_MMIO: 270 pptmmio = (struct vm_pptdev_mmio *)data; 271 error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot, 272 pptmmio->func, pptmmio->gpa, pptmmio->len, 273 pptmmio->hpa); 274 break; 275 case VM_BIND_PPTDEV: 276 pptdev = (struct vm_pptdev *)data; 277 error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 278 pptdev->func); 279 break; 280 case VM_UNBIND_PPTDEV: 281 pptdev = (struct vm_pptdev *)data; 282 error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot, 283 pptdev->func); 284 break; 285 case VM_INJECT_EVENT: 286 vmevent = (struct vm_event *)data; 287 error = vm_inject_event(sc->vm, vmevent->cpuid, vmevent->type, 288 vmevent->vector, 289 vmevent->error_code, 290 vmevent->error_code_valid); 291 break; 292 case VM_INJECT_NMI: 293 vmnmi = (struct vm_nmi *)data; 294 error = vm_inject_nmi(sc->vm, vmnmi->cpuid); 295 break; 296 case VM_LAPIC_IRQ: 297 vmirq = (struct vm_lapic_irq *)data; 298 error = lapic_intr_edge(sc->vm, vmirq->cpuid, vmirq->vector); 299 break; 300 case VM_LAPIC_LOCAL_IRQ: 301 vmirq = (struct vm_lapic_irq *)data; 302 error = lapic_set_local_intr(sc->vm, vmirq->cpuid, 303 vmirq->vector); 304 break; 305 case VM_LAPIC_MSI: 306 vmmsi = (struct vm_lapic_msi *)data; 307 error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg); 308 break; 309 case VM_IOAPIC_ASSERT_IRQ: 310 ioapic_irq = (struct vm_ioapic_irq *)data; 311 error = vioapic_assert_irq(sc->vm, ioapic_irq->irq); 312 break; 313 case VM_IOAPIC_DEASSERT_IRQ: 314 ioapic_irq = (struct vm_ioapic_irq *)data; 315 error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq); 316 break; 317 case VM_IOAPIC_PULSE_IRQ: 318 ioapic_irq = (struct vm_ioapic_irq *)data; 319 error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq); 320 break; 321 case VM_MAP_MEMORY: 322 seg = (struct vm_memory_segment *)data; 323 error = vm_malloc(sc->vm, seg->gpa, seg->len); 324 break; 325 case VM_GET_MEMORY_SEG: 326 seg = (struct vm_memory_segment *)data; 327 seg->len = 0; 328 (void)vm_gpabase2memseg(sc->vm, seg->gpa, seg); 329 error = 0; 330 break; 331 case VM_GET_REGISTER: 332 vmreg = (struct vm_register *)data; 333 error = vm_get_register(sc->vm, vmreg->cpuid, vmreg->regnum, 334 &vmreg->regval); 335 break; 336 case VM_SET_REGISTER: 337 vmreg = (struct vm_register *)data; 338 error = vm_set_register(sc->vm, vmreg->cpuid, vmreg->regnum, 339 vmreg->regval); 340 break; 341 case VM_SET_SEGMENT_DESCRIPTOR: 342 vmsegdesc = (struct vm_seg_desc *)data; 343 error = vm_set_seg_desc(sc->vm, vmsegdesc->cpuid, 344 vmsegdesc->regnum, 345 &vmsegdesc->desc); 346 break; 347 case VM_GET_SEGMENT_DESCRIPTOR: 348 vmsegdesc = (struct vm_seg_desc *)data; 349 error = vm_get_seg_desc(sc->vm, vmsegdesc->cpuid, 350 vmsegdesc->regnum, 351 &vmsegdesc->desc); 352 break; 353 case VM_GET_CAPABILITY: 354 vmcap = (struct vm_capability *)data; 355 error = vm_get_capability(sc->vm, vmcap->cpuid, 356 vmcap->captype, 357 &vmcap->capval); 358 break; 359 case VM_SET_CAPABILITY: 360 vmcap = (struct vm_capability *)data; 361 error = vm_set_capability(sc->vm, vmcap->cpuid, 362 vmcap->captype, 363 vmcap->capval); 364 break; 365 case VM_SET_X2APIC_STATE: 366 x2apic = (struct vm_x2apic *)data; 367 error = vm_set_x2apic_state(sc->vm, 368 x2apic->cpuid, x2apic->state); 369 break; 370 case VM_GET_X2APIC_STATE: 371 x2apic = (struct vm_x2apic *)data; 372 error = vm_get_x2apic_state(sc->vm, 373 x2apic->cpuid, &x2apic->state); 374 break; 375 case VM_GET_GPA_PMAP: 376 gpapte = (struct vm_gpa_pte *)data; 377 pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)), 378 gpapte->gpa, gpapte->pte, &gpapte->ptenum); 379 error = 0; 380 break; 381 case VM_GET_HPET_CAPABILITIES: 382 error = vhpet_getcap((struct vm_hpet_cap *)data); 383 break; 384 default: 385 error = ENOTTY; 386 break; 387 } 388 389 if (state_changed == 1) { 390 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 391 } else if (state_changed == 2) { 392 for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) 393 vcpu_set_state(sc->vm, vcpu, VCPU_IDLE); 394 } 395 396done: 397 /* Make sure that no handler returns a bogus value like ERESTART */ 398 KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); 399 return (error); 400} 401 402static int 403vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, 404 vm_size_t size, struct vm_object **object, int nprot) 405{ 406 int error; 407 struct vmmdev_softc *sc; 408 409 sc = vmmdev_lookup2(cdev); 410 if (sc != NULL && (nprot & PROT_EXEC) == 0) 411 error = vm_get_memobj(sc->vm, *offset, size, offset, object); 412 else 413 error = EINVAL; 414 415 return (error); 416} 417 418static void 419vmmdev_destroy(void *arg) 420{ 421 422 struct vmmdev_softc *sc = arg; 423 424 if (sc->cdev != NULL) 425 destroy_dev(sc->cdev); 426 427 if (sc->vm != NULL) 428 vm_destroy(sc->vm); 429 430 if ((sc->flags & VSC_LINKED) != 0) { 431 mtx_lock(&vmmdev_mtx); 432 SLIST_REMOVE(&head, sc, vmmdev_softc, link); 433 mtx_unlock(&vmmdev_mtx); 434 } 435 436 free(sc, M_VMMDEV); 437} 438 439static int 440sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS) 441{ 442 int error; 443 char buf[VM_MAX_NAMELEN]; 444 struct vmmdev_softc *sc; 445 struct cdev *cdev; 446 447 strlcpy(buf, "beavis", sizeof(buf)); 448 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 449 if (error != 0 || req->newptr == NULL) 450 return (error); 451 452 mtx_lock(&vmmdev_mtx); 453 sc = vmmdev_lookup(buf); 454 if (sc == NULL || sc->cdev == NULL) { 455 mtx_unlock(&vmmdev_mtx); 456 return (EINVAL); 457 } 458 459 /* 460 * The 'cdev' will be destroyed asynchronously when 'si_threadcount' 461 * goes down to 0 so we should not do it again in the callback. 462 */ 463 cdev = sc->cdev; 464 sc->cdev = NULL; 465 mtx_unlock(&vmmdev_mtx); 466 467 /* 468 * Schedule the 'cdev' to be destroyed: 469 * 470 * - any new operations on this 'cdev' will return an error (ENXIO). 471 * 472 * - when the 'si_threadcount' dwindles down to zero the 'cdev' will 473 * be destroyed and the callback will be invoked in a taskqueue 474 * context. 475 */ 476 destroy_dev_sched_cb(cdev, vmmdev_destroy, sc); 477 478 return (0); 479} 480SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy, CTLTYPE_STRING | CTLFLAG_RW, 481 NULL, 0, sysctl_vmm_destroy, "A", NULL); 482 483static struct cdevsw vmmdevsw = { 484 .d_name = "vmmdev", 485 .d_version = D_VERSION, 486 .d_ioctl = vmmdev_ioctl, 487 .d_mmap_single = vmmdev_mmap_single, 488 .d_read = vmmdev_rw, 489 .d_write = vmmdev_rw, 490}; 491 492static int 493sysctl_vmm_create(SYSCTL_HANDLER_ARGS) 494{ 495 int error; 496 struct vm *vm; 497 struct cdev *cdev; 498 struct vmmdev_softc *sc, *sc2; 499 char buf[VM_MAX_NAMELEN]; 500 501 strlcpy(buf, "beavis", sizeof(buf)); 502 error = sysctl_handle_string(oidp, buf, sizeof(buf), req); 503 if (error != 0 || req->newptr == NULL) 504 return (error); 505 506 mtx_lock(&vmmdev_mtx); 507 sc = vmmdev_lookup(buf); 508 mtx_unlock(&vmmdev_mtx); 509 if (sc != NULL) 510 return (EEXIST); 511 512 error = vm_create(buf, &vm); 513 if (error != 0) 514 return (error); 515 516 sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO); 517 sc->vm = vm; 518 519 /* 520 * Lookup the name again just in case somebody sneaked in when we 521 * dropped the lock. 522 */ 523 mtx_lock(&vmmdev_mtx); 524 sc2 = vmmdev_lookup(buf); 525 if (sc2 == NULL) { 526 SLIST_INSERT_HEAD(&head, sc, link); 527 sc->flags |= VSC_LINKED; 528 } 529 mtx_unlock(&vmmdev_mtx); 530 531 if (sc2 != NULL) { 532 vmmdev_destroy(sc); 533 return (EEXIST); 534 } 535 536 error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, NULL, 537 UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf); 538 if (error != 0) { 539 vmmdev_destroy(sc); 540 return (error); 541 } 542 543 mtx_lock(&vmmdev_mtx); 544 sc->cdev = cdev; 545 sc->cdev->si_drv1 = sc; 546 mtx_unlock(&vmmdev_mtx); 547 548 return (0); 549} 550SYSCTL_PROC(_hw_vmm, OID_AUTO, create, CTLTYPE_STRING | CTLFLAG_RW, 551 NULL, 0, sysctl_vmm_create, "A", NULL); 552 553void 554vmmdev_init(void) 555{ 556 mtx_init(&vmmdev_mtx, "vmm device mutex", NULL, MTX_DEF); 557} 558 559int 560vmmdev_cleanup(void) 561{ 562 int error; 563 564 if (SLIST_EMPTY(&head)) 565 error = 0; 566 else 567 error = EBUSY; 568 569 return (error); 570} 571