/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/sysctl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/_iovec.h>
#include <sys/cpuset.h>

#include <capsicum_helpers.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#ifdef WITH_VMMAPI_SNAPSHOT
#include <machine/vmm_snapshot.h>
#endif

#include "vmmapi.h"
#include "internal.h"

#define	MB	(1024 * 1024UL)
#define	GB	(1024 * 1024 * 1024UL)

#ifdef __amd64__
#define	VM_LOWMEM_LIMIT	(3 * GB)
#else
#define	VM_LOWMEM_LIMIT	0
#endif
#define	VM_HIGHMEM_BASE	(4 * GB)

/*
 * Size of the guard region before and after the virtual address space
 * mapping the guest physical memory. This must be a multiple of the
 * superpage size for performance reasons.
 */
#define	VM_MMAP_GUARD_SIZE	(4 * MB)

#define	PROT_RW		(PROT_READ | PROT_WRITE)
#define	PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

#define	CREATE(x)  sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
#define	DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))

static int
vm_device_open(const char *name)
{
	int fd, len;
	char *vmfile;

	len = strlen("/dev/vmm/") + strlen(name) + 1;
	vmfile = malloc(len);
	assert(vmfile != NULL);
	snprintf(vmfile, len, "/dev/vmm/%s", name);

	/* Open the device file */
	fd = open(vmfile, O_RDWR, 0);

	free(vmfile);
	return (fd);
}

int
vm_create(const char *name)
{
	/* Try to load vmm(4) module before creating a guest. */
	if (modfind("vmm") < 0)
		kldload("vmm");
	return (CREATE(name));
}

struct vmctx *
vm_open(const char *name)
{
	struct vmctx *vm;
	int saved_errno;

	vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
	assert(vm != NULL);

	vm->fd = -1;
	vm->memflags = 0;
	vm->name = (char *)(vm + 1);
	strcpy(vm->name, name);
	memset(vm->memsegs, 0, sizeof(vm->memsegs));

	if ((vm->fd = vm_device_open(vm->name)) < 0)
		goto err;

	return (vm);
err:
	saved_errno = errno;
	free(vm);
	errno = saved_errno;
	return (NULL);
}
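/*
 * Illustrative sketch (not part of the library): the typical lifecycle a
 * caller such as bhyve(8) goes through with the functions above.  Error
 * handling is abbreviated and the guest name and 1 GB memory size are
 * arbitrary example values.
 *
 *	struct vmctx *ctx;
 *
 *	if (vm_create("exampleguest") != 0)
 *		err(1, "vm_create");
 *	if ((ctx = vm_open("exampleguest")) == NULL)
 *		err(1, "vm_open");
 *	if (vm_setup_memory(ctx, 1UL << 30, VM_MMAP_ALL) != 0)
 *		err(1, "vm_setup_memory");
 *	...
 *	vm_destroy(ctx);
 */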

void
vm_close(struct vmctx *vm)
{
	assert(vm != NULL);

	close(vm->fd);
	free(vm);
}

void
vm_destroy(struct vmctx *vm)
{
	assert(vm != NULL);

	if (vm->fd >= 0)
		close(vm->fd);
	DESTROY(vm->name);

	free(vm);
}

struct vcpu *
vm_vcpu_open(struct vmctx *ctx, int vcpuid)
{
	struct vcpu *vcpu;

	vcpu = malloc(sizeof(*vcpu));
	assert(vcpu != NULL);
	vcpu->ctx = ctx;
	vcpu->vcpuid = vcpuid;
	return (vcpu);
}

void
vm_vcpu_close(struct vcpu *vcpu)
{
	free(vcpu);
}

int
vcpu_id(struct vcpu *vcpu)
{
	return (vcpu->vcpuid);
}

int
vm_parse_memsize(const char *opt, size_t *ret_memsize)
{
	char *endptr;
	size_t optval;
	int error;

	optval = strtoul(opt, &endptr, 0);
	if (*opt != '\0' && *endptr == '\0') {
		/*
		 * For the sake of backward compatibility, if the memory size
		 * specified on the command line is less than a megabyte then
		 * it is interpreted as being in units of MB.
		 */
		if (optval < MB)
			optval *= MB;
		*ret_memsize = optval;
		error = 0;
	} else
		error = expand_number(opt, ret_memsize);

	return (error);
}
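/*
 * Worked examples of the parsing rules above (illustrative only):
 *
 *	"256"		-> 256 MB	(bare values below 1 MB scale to MB)
 *	"536870912"	-> 512 MB	(values of 1 MB or more are taken as bytes)
 *	"2G"		-> 2 GB		(suffixes are handled by expand_number(3))
 */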

uint32_t
vm_get_lowmem_limit(struct vmctx *ctx __unused)
{

	return (VM_LOWMEM_LIMIT);
}

void
vm_set_memflags(struct vmctx *ctx, int flags)
{

	ctx->memflags = flags;
}

int
vm_get_memflags(struct vmctx *ctx)
{

	return (ctx->memflags);
}

/*
 * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
 */
int
vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
    size_t len, int prot)
{
	struct vm_memmap memmap;
	int error, flags;

	memmap.gpa = gpa;
	memmap.segid = segid;
	memmap.segoff = off;
	memmap.len = len;
	memmap.prot = prot;
	memmap.flags = 0;

	if (ctx->memflags & VM_MEM_F_WIRED)
		memmap.flags |= VM_MEMMAP_F_WIRED;

	/*
	 * If this mapping already exists then don't create it again. This
	 * is the common case for SYSMEM mappings created by bhyveload(8).
	 */
	error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
	if (error == 0 && gpa == memmap.gpa) {
		if (segid != memmap.segid || off != memmap.segoff ||
		    prot != memmap.prot || flags != memmap.flags) {
			errno = EEXIST;
			return (-1);
		} else {
			return (0);
		}
	}

	error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
	return (error);
}

int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	*guest_baseaddr = ctx->baseaddr;
	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
	return (0);
}

int
vm_munmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
{
	struct vm_munmap munmap;
	int error;

	munmap.gpa = gpa;
	munmap.len = len;

	error = ioctl(ctx->fd, VM_MUNMAP_MEMSEG, &munmap);
	return (error);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
    vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
{
	struct vm_memmap memmap;
	int error;

	bzero(&memmap, sizeof(struct vm_memmap));
	memmap.gpa = *gpa;
	error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
	if (error == 0) {
		*gpa = memmap.gpa;
		*segid = memmap.segid;
		*segoff = memmap.segoff;
		*len = memmap.len;
		*prot = memmap.prot;
		*flags = memmap.flags;
	}
	return (error);
}

/*
 * Return 0 if the segments are identical and non-zero otherwise.
 *
 * This is slightly complicated by the fact that only device memory segments
 * are named.
 */
static int
cmpseg(size_t len, const char *str, size_t len2, const char *str2)
{

	if (len == len2) {
		if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
			return (0);
	}
	return (-1);
}

static int
vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	/*
	 * If the memory segment has already been created then just return.
	 * This is the usual case for the SYSMEM segment created by userspace
	 * loaders like bhyveload(8).
	 */
	error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
	    sizeof(memseg.name));
	if (error)
		return (error);

	if (memseg.len != 0) {
		if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
			errno = EINVAL;
			return (-1);
		} else {
			return (0);
		}
	}

	bzero(&memseg, sizeof(struct vm_memseg));
	memseg.segid = segid;
	memseg.len = len;
	if (name != NULL) {
		n = strlcpy(memseg.name, name, sizeof(memseg.name));
		if (n >= sizeof(memseg.name)) {
			errno = ENAMETOOLONG;
			return (-1);
		}
	}

	error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
	return (error);
}

int
vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
    size_t bufsize)
{
	struct vm_memseg memseg;
	size_t n;
	int error;

	bzero(&memseg, sizeof(memseg));
	memseg.segid = segid;
	error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
	if (error == 0) {
		*lenp = memseg.len;
		n = strlcpy(namebuf, memseg.name, bufsize);
		if (n >= bufsize) {
			errno = ENAMETOOLONG;
			error = -1;
		}
	}
	return (error);
}

static int
setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
{
	char *ptr;
	int error, flags;

	/* Map 'len' bytes starting at 'gpa' in the guest address space */
	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
	if (error)
		return (error);

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap into the process address space on the host */
	ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
	if (ptr == MAP_FAILED)
		return (-1);

	return (0);
}

int
vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
{
	size_t objsize, len;
	vm_paddr_t gpa;
	char *baseaddr, *ptr;
	int error;

	assert(vms == VM_MMAP_ALL);

	/*
	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
	 */
	if (memsize > VM_LOWMEM_LIMIT) {
		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
	} else {
		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
		objsize = memsize;
	}

	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
	if (error)
		return (error);

	/*
	 * Stake out a contiguous region covering the guest physical memory
	 * and the adjoining guard regions.
	 */
	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
	if (ptr == MAP_FAILED)
		return (-1);

	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
		gpa = VM_HIGHMEM_BASE;
		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
		gpa = 0;
		len = ctx->memsegs[VM_MEMSEG_LOW].size;
		error = setup_memory_segment(ctx, gpa, len, baseaddr);
		if (error)
			return (error);
	}

	ctx->baseaddr = baseaddr;

	return (0);
}
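/*
 * Illustrative example of the split performed above on amd64, where
 * VM_LOWMEM_LIMIT is 3 GB: a guest configured with 8 GB of memory gets a
 * 3 GB "lowmem" segment starting at guest physical address 0 and a 5 GB
 * "highmem" segment starting at VM_HIGHMEM_BASE (4 GB).  The system memory
 * segment object is therefore 4 GB + 5 GB = 9 GB long, and the [3 GB, 4 GB)
 * hole is never mapped, leaving that range available for device MMIO.
 */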

/*
 * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
 * the lowmem or highmem regions.
 *
 * In particular, return NULL if [gaddr, gaddr+len) falls within the guest
 * MMIO region.  The instruction emulation code depends on this behavior.
 */
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
	vm_size_t lowsize, highsize;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0) {
		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
			return (ctx->baseaddr + gaddr);
	}

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
			return (ctx->baseaddr + gaddr);
	}

	return (NULL);
}

vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t offaddr;
	vm_size_t lowsize, highsize;

	offaddr = (char *)addr - ctx->baseaddr;

	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
	if (lowsize > 0)
		if (offaddr <= lowsize)
			return (offaddr);

	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
	if (highsize > 0)
		if (offaddr >= VM_HIGHMEM_BASE &&
		    offaddr < VM_HIGHMEM_BASE + highsize)
			return (offaddr);

	return ((vm_paddr_t)-1);
}

const char *
vm_get_name(struct vmctx *ctx)
{

	return (ctx->name);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_LOW].size);
}

vm_paddr_t
vm_get_highmem_base(struct vmctx *ctx __unused)
{

	return (VM_HIGHMEM_BASE);
}

size_t
vm_get_highmem_size(struct vmctx *ctx)
{

	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
}

void *
vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
{
	char pathname[MAXPATHLEN];
	size_t len2;
	char *base, *ptr;
	int fd, error, flags;

	fd = -1;
	ptr = MAP_FAILED;
	if (name == NULL || strlen(name) == 0) {
		errno = EINVAL;
		goto done;
	}

	error = vm_alloc_memseg(ctx, segid, len, name);
	if (error)
		goto done;

	strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname));
	strlcat(pathname, ctx->name, sizeof(pathname));
	strlcat(pathname, ".", sizeof(pathname));
	strlcat(pathname, name, sizeof(pathname));

	fd = open(pathname, O_RDWR);
	if (fd < 0)
		goto done;

	/*
	 * Stake out a contiguous region covering the device memory and the
	 * adjoining guard regions.
	 */
	len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
	base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1,
	    0);
	if (base == MAP_FAILED)
		goto done;

	flags = MAP_SHARED | MAP_FIXED;
	if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
		flags |= MAP_NOCORE;

	/* mmap the devmem region in the host address space */
	ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
done:
	if (fd >= 0)
		close(fd);
	return (ptr);
}

int
vcpu_ioctl(struct vcpu *vcpu, u_long cmd, void *arg)
{
	/*
	 * XXX: Fragile; handle with care.  This assumes that the first
	 * field of the ioctl data is the vcpuid.
	 */
	*(int *)arg = vcpu->vcpuid;
	return (ioctl(vcpu->ctx->fd, cmd, arg));
}
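/*
 * Illustrative sketch of the convention vcpu_ioctl() relies on: each
 * per-vcpu ioctl request structure is expected to be laid out so that its
 * first field is an int holding the target vcpuid, e.g.
 *
 *	struct vm_foo {				(hypothetical request)
 *		int		cpuid;		written by vcpu_ioctl()
 *		...				remaining request fields
 *	};
 *
 * Callers therefore fill in only the request-specific fields and let
 * vcpu_ioctl() stamp the vcpuid, as vm_set_register() does below.
 */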

int
vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;
	vmreg.regval = val;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER, &vmreg);
	return (error);
}

int
vm_get_register(struct vcpu *vcpu, int reg, uint64_t *ret_val)
{
	int error;
	struct vm_register vmreg;

	bzero(&vmreg, sizeof(vmreg));
	vmreg.regnum = reg;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER, &vmreg);
	*ret_val = vmreg.regval;
	return (error);
}

int
vm_set_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_SET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_get_register_set(struct vcpu *vcpu, unsigned int count,
    const int *regnums, uint64_t *regvals)
{
	int error;
	struct vm_register_set vmregset;

	bzero(&vmregset, sizeof(vmregset));
	vmregset.count = count;
	vmregset.regnums = regnums;
	vmregset.regvals = regvals;

	error = vcpu_ioctl(vcpu, VM_GET_REGISTER_SET, &vmregset);
	return (error);
}

int
vm_run(struct vcpu *vcpu, struct vm_run *vmrun)
{
	return (vcpu_ioctl(vcpu, VM_RUN, vmrun));
}

int
vm_suspend(struct vmctx *ctx, enum vm_suspend_how how)
{
	struct vm_suspend vmsuspend;

	bzero(&vmsuspend, sizeof(vmsuspend));
	vmsuspend.how = how;
	return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend));
}

int
vm_reinit(struct vmctx *ctx)
{

	return (ioctl(ctx->fd, VM_REINIT, 0));
}

int
vm_capability_name2type(const char *capname)
{
	int i;

	for (i = 0; i < VM_CAP_MAX; i++) {
		if (vm_capstrmap[i] != NULL &&
		    strcmp(vm_capstrmap[i], capname) == 0)
			return (i);
	}

	return (-1);
}

const char *
vm_capability_type2name(int type)
{
	if (type >= 0 && type < VM_CAP_MAX)
		return (vm_capstrmap[type]);

	return (NULL);
}

int
vm_get_capability(struct vcpu *vcpu, enum vm_cap_type cap, int *retval)
{
	int error;
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;

	error = vcpu_ioctl(vcpu, VM_GET_CAPABILITY, &vmcap);
	*retval = vmcap.capval;
	return (error);
}

int
vm_set_capability(struct vcpu *vcpu, enum vm_cap_type cap, int val)
{
	struct vm_capability vmcap;

	bzero(&vmcap, sizeof(vmcap));
	vmcap.captype = cap;
	vmcap.capval = val;

	return (vcpu_ioctl(vcpu, VM_SET_CAPABILITY, &vmcap));
}

/*
 * Returns the vcpu statistics in a thread-local buffer that is reused and
 * resized by subsequent calls from the same thread; callers must consume
 * or copy the values before calling vm_get_stats() again.
 */
uint64_t *
vm_get_stats(struct vcpu *vcpu, struct timeval *ret_tv,
	     int *ret_entries)
{
	static _Thread_local uint64_t *stats_buf;
	static _Thread_local u_int stats_count;
	uint64_t *new_stats;
	struct vm_stats vmstats;
	u_int count, index;
	bool have_stats;

	have_stats = false;
	count = 0;
	for (index = 0;; index += nitems(vmstats.statbuf)) {
		vmstats.index = index;
		if (vcpu_ioctl(vcpu, VM_STATS, &vmstats) != 0)
			break;
		if (stats_count < index + vmstats.num_entries) {
			new_stats = realloc(stats_buf,
			    (index + vmstats.num_entries) * sizeof(uint64_t));
			if (new_stats == NULL) {
				errno = ENOMEM;
				return (NULL);
			}
			stats_count = index + vmstats.num_entries;
			stats_buf = new_stats;
		}
		memcpy(stats_buf + index, vmstats.statbuf,
		    vmstats.num_entries * sizeof(uint64_t));
		count += vmstats.num_entries;
		have_stats = true;

		if (vmstats.num_entries != nitems(vmstats.statbuf))
			break;
	}
	if (have_stats) {
		if (ret_entries)
			*ret_entries = count;
		if (ret_tv)
			*ret_tv = vmstats.tv;
		return (stats_buf);
	} else
		return (NULL);
}
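/*
 * Illustrative usage of vm_get_stats() together with vm_get_stat_desc()
 * below (error handling elided):
 *
 *	struct timeval tv;
 *	uint64_t *stats;
 *	int i, num;
 *
 *	stats = vm_get_stats(vcpu, &tv, &num);
 *	if (stats != NULL) {
 *		for (i = 0; i < num; i++)
 *			printf("%s: %ju\n", vm_get_stat_desc(ctx, i),
 *			    (uintmax_t)stats[i]);
 *	}
 */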

const char *
vm_get_stat_desc(struct vmctx *ctx, int index)
{
	static struct vm_stat_desc statdesc;

	statdesc.index = index;
	if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
		return (statdesc.desc);
	else
		return (NULL);
}

#ifdef __amd64__
int
vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
{
	int error, i;
	struct vm_gpa_pte gpapte;

	bzero(&gpapte, sizeof(gpapte));
	gpapte.gpa = gpa;

	error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);

	if (error == 0) {
		*num = gpapte.ptenum;
		for (i = 0; i < gpapte.ptenum; i++)
			pte[i] = gpapte.pte[i];
	}

	return (error);
}

int
vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}
#endif

int
vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, int prot, uint64_t *gpa, int *fault)
{
	struct vm_gla2gpa gg;
	int error;

	bzero(&gg, sizeof(struct vm_gla2gpa));
	gg.prot = prot;
	gg.gla = gla;
	gg.paging = *paging;

	error = vcpu_ioctl(vcpu, VM_GLA2GPA_NOFAULT, &gg);
	if (error == 0) {
		*fault = gg.fault;
		*gpa = gg.gpa;
	}
	return (error);
}

#ifndef min
#define	min(a,b)	(((a) < (b)) ? (a) : (b))
#endif

#ifdef __amd64__
int
vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
    uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt,
    int *fault)
{
	void *va;
	uint64_t gpa, off;
	int error, i, n;

	for (i = 0; i < iovcnt; i++) {
		iov[i].iov_base = 0;
		iov[i].iov_len = 0;
	}

	while (len) {
		assert(iovcnt > 0);
		error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
		if (error || *fault)
			return (error);

		off = gpa & PAGE_MASK;
		n = MIN(len, PAGE_SIZE - off);

		va = vm_map_gpa(vcpu->ctx, gpa, n);
		if (va == NULL)
			return (EFAULT);

		iov->iov_base = va;
		iov->iov_len = n;
		iov++;
		iovcnt--;

		gla += n;
		len -= n;
	}
	return (0);
}
#endif

void
vm_copy_teardown(struct iovec *iov __unused, int iovcnt __unused)
{
	/*
	 * Intentionally empty.  This is used by the instruction
	 * emulation code shared with the kernel.  The in-kernel
	 * version of this is non-empty.
	 */
}

void
vm_copyin(struct iovec *iov, void *vp, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	dst = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		src = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		dst += n;
		len -= n;
	}
}

void
vm_copyout(const void *vp, struct iovec *iov, size_t len)
{
	const char *src;
	char *dst;
	size_t n;

	src = vp;
	while (len) {
		assert(iov->iov_len);
		n = min(len, iov->iov_len);
		dst = iov->iov_base;
		bcopy(src, dst, n);

		iov++;
		src += n;
		len -= n;
	}
}
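/*
 * Illustrative sketch of how the copy helpers above are combined (amd64
 * only, since vm_copy_setup() is conditional above; error and fault
 * handling abbreviated): read 'len' bytes from guest linear address 'gla'
 * into a host buffer 'buf'.
 *
 *	struct iovec iov[4];
 *	int error, fault;
 *
 *	error = vm_copy_setup(vcpu, paging, gla, len, PROT_READ,
 *	    iov, nitems(iov), &fault);
 *	if (error == 0 && !fault)
 *		vm_copyin(iov, buf, len);
 */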

static int
vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus)
{
	struct vm_cpuset vm_cpuset;
	int error;

	bzero(&vm_cpuset, sizeof(struct vm_cpuset));
	vm_cpuset.which = which;
	vm_cpuset.cpusetsize = sizeof(cpuset_t);
	vm_cpuset.cpus = cpus;

	error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset);
	return (error);
}

int
vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus));
}

int
vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus));
}

int
vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus)
{

	return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus));
}

int
vm_activate_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_ACTIVATE_CPU, &ac);
	return (error);
}

int
vm_suspend_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_suspend_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_SUSPEND_CPU, &ac);
	return (error);
}

int
vm_resume_cpu(struct vcpu *vcpu)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	error = vcpu_ioctl(vcpu, VM_RESUME_CPU, &ac);
	return (error);
}

int
vm_resume_all_cpus(struct vmctx *ctx)
{
	struct vm_activate_cpu ac;
	int error;

	bzero(&ac, sizeof(struct vm_activate_cpu));
	ac.vcpuid = -1;
	error = ioctl(ctx->fd, VM_RESUME_CPU, &ac);
	return (error);
}

#ifdef __amd64__
int
vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	error = vcpu_ioctl(vcpu, VM_GET_INTINFO, &vmii);
	if (error == 0) {
		*info1 = vmii.info1;
		*info2 = vmii.info2;
	}
	return (error);
}

int
vm_set_intinfo(struct vcpu *vcpu, uint64_t info1)
{
	struct vm_intinfo vmii;
	int error;

	bzero(&vmii, sizeof(struct vm_intinfo));
	vmii.info1 = info1;
	error = vcpu_ioctl(vcpu, VM_SET_INTINFO, &vmii);
	return (error);
}
#endif

#ifdef WITH_VMMAPI_SNAPSHOT
int
vm_restart_instruction(struct vcpu *vcpu)
{
	int arg;

	return (vcpu_ioctl(vcpu, VM_RESTART_INSTRUCTION, &arg));
}

int
vm_snapshot_req(struct vmctx *ctx, struct vm_snapshot_meta *meta)
{

	if (ioctl(ctx->fd, VM_SNAPSHOT_REQ, meta) == -1) {
#ifdef SNAPSHOT_DEBUG
		fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
		    __func__, meta->dev_name, errno);
#endif
		return (-1);
	}
	return (0);
}

int
vm_restore_time(struct vmctx *ctx)
{
	int dummy;

	dummy = 0;
	return (ioctl(ctx->fd, VM_RESTORE_TIME, &dummy));
}
#endif

int
vm_set_topology(struct vmctx *ctx,
    uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
{
	struct vm_cpu_topology topology;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	topology.sockets = sockets;
	topology.cores = cores;
	topology.threads = threads;
	topology.maxcpus = maxcpus;
	return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology));
}

int
vm_get_topology(struct vmctx *ctx,
    uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus)
{
	struct vm_cpu_topology topology;
	int error;

	bzero(&topology, sizeof (struct vm_cpu_topology));
	error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology);
	if (error == 0) {
		*sockets = topology.sockets;
		*cores = topology.cores;
		*threads = topology.threads;
		*maxcpus = topology.maxcpus;
	}
	return (error);
}

int
vm_limit_rights(struct vmctx *ctx)
{
	cap_rights_t rights;

	cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW);
	if (caph_rights_limit(ctx->fd, &rights) != 0)
		return (-1);
	if (caph_ioctls_limit(ctx->fd, vm_ioctl_cmds, vm_ioctl_ncmds) != 0)
		return (-1);
	return (0);
}
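/*
 * Illustrative sketch (assumption: the caller sandboxes itself with
 * capsicum_helpers(3)): a process would typically call vm_limit_rights()
 * on each open VM context before entering capability mode, e.g.
 *
 *	if (vm_limit_rights(ctx) != 0)
 *		err(1, "vm_limit_rights");
 *	if (caph_enter() != 0)
 *		err(1, "caph_enter");
 */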

/*
 * Avoid using in new code.  Operations on the fd should be wrapped here so that
 * capability rights can be kept in sync.
 */
int
vm_get_device_fd(struct vmctx *ctx)
{

	return (ctx->fd);
}

/* Legacy interface, do not use. */
const cap_ioctl_t *
vm_get_ioctls(size_t *len)
{
	cap_ioctl_t *cmds;
	size_t sz;

	if (len == NULL) {
		sz = vm_ioctl_ncmds * sizeof(vm_ioctl_cmds[0]);
		cmds = malloc(sz);
		if (cmds == NULL)
			return (NULL);
		bcopy(vm_ioctl_cmds, cmds, sz);
		return (cmds);
	}

	*len = vm_ioctl_ncmds;
	return (NULL);
}