1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * tools/testing/selftests/kvm/lib/kvm_util.c
4 *
5 * Copyright (C) 2018, Google LLC.
6 */
7#include "test_util.h"
8#include "kvm_util.h"
9#include "processor.h"
10#include "ucall_common.h"
11
12#include <assert.h>
13#include <sched.h>
14#include <sys/mman.h>
15#include <sys/types.h>
16#include <sys/stat.h>
17#include <unistd.h>
18#include <linux/kernel.h>
19
20#define KVM_UTIL_MIN_PFN	2
21
22uint32_t guest_random_seed;
23struct guest_random_state guest_rng;
24
25static int vcpu_mmap_sz(void);
26
27int open_path_or_exit(const char *path, int flags)
28{
29	int fd;
30
31	fd = open(path, flags);
32	__TEST_REQUIRE(fd >= 0 || errno != ENOENT, "Cannot open %s: %s", path, strerror(errno));
33	TEST_ASSERT(fd >= 0, "Failed to open '%s'", path);
34
35	return fd;
36}
37
38/*
39 * Open KVM_DEV_PATH if available, otherwise exit the entire program.
40 *
41 * Input Args:
42 *   flags - The flags to pass when opening KVM_DEV_PATH.
43 *
44 * Return:
45 *   The opened file descriptor of /dev/kvm.
46 */
47static int _open_kvm_dev_path_or_exit(int flags)
48{
49	return open_path_or_exit(KVM_DEV_PATH, flags);
50}
51
52int open_kvm_dev_path_or_exit(void)
53{
54	return _open_kvm_dev_path_or_exit(O_RDONLY);
55}
56
57static ssize_t get_module_param(const char *module_name, const char *param,
58				void *buffer, size_t buffer_size)
59{
60	const int path_size = 128;
61	char path[path_size];
62	ssize_t bytes_read;
63	int fd, r;
64
65	r = snprintf(path, path_size, "/sys/module/%s/parameters/%s",
66		     module_name, param);
67	TEST_ASSERT(r < path_size,
68		    "Failed to construct sysfs path in %d bytes.", path_size);
69
70	fd = open_path_or_exit(path, O_RDONLY);
71
72	bytes_read = read(fd, buffer, buffer_size);
73	TEST_ASSERT(bytes_read > 0, "read(%s) returned %ld, wanted %ld bytes",
74		    path, bytes_read, buffer_size);
75
76	r = close(fd);
77	TEST_ASSERT(!r, "close(%s) failed", path);
78	return bytes_read;
79}
80
81static int get_module_param_integer(const char *module_name, const char *param)
82{
83	/*
	 * 16 bytes to hold a 64-bit value (1 byte per char), 1 byte for the
	 * NUL char, and 1 byte for the trailing newline the kernel appends
	 * when exposing the parameter via sysfs.
87	 */
88	char value[16 + 1 + 1];
89	ssize_t r;
90
91	memset(value, '\0', sizeof(value));
92
93	r = get_module_param(module_name, param, value, sizeof(value));
94	TEST_ASSERT(value[r - 1] == '\n',
95		    "Expected trailing newline, got char '%c'", value[r - 1]);
96
97	/*
98	 * Squash the newline, otherwise atoi_paranoid() will complain about
99	 * trailing non-NUL characters in the string.
100	 */
101	value[r - 1] = '\0';
102	return atoi_paranoid(value);
103}
104
105static bool get_module_param_bool(const char *module_name, const char *param)
106{
107	char value;
108	ssize_t r;
109
110	r = get_module_param(module_name, param, &value, sizeof(value));
111	TEST_ASSERT_EQ(r, 1);
112
113	if (value == 'Y')
114		return true;
115	else if (value == 'N')
116		return false;
117
118	TEST_FAIL("Unrecognized value '%c' for boolean module param", value);
119}
120
121bool get_kvm_param_bool(const char *param)
122{
123	return get_module_param_bool("kvm", param);
124}
125
126bool get_kvm_intel_param_bool(const char *param)
127{
128	return get_module_param_bool("kvm_intel", param);
129}
130
131bool get_kvm_amd_param_bool(const char *param)
132{
133	return get_module_param_bool("kvm_amd", param);
134}
135
136int get_kvm_param_integer(const char *param)
137{
138	return get_module_param_integer("kvm", param);
139}
140
141int get_kvm_intel_param_integer(const char *param)
142{
143	return get_module_param_integer("kvm_intel", param);
144}
145
146int get_kvm_amd_param_integer(const char *param)
147{
148	return get_module_param_integer("kvm_amd", param);
149}
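
/*
 * Illustrative sketch (not part of the library): tests typically use these
 * helpers to gate themselves on module parameters.  The parameter names
 * below are only examples; substitute whatever the test actually depends on.
 *
 *	__TEST_REQUIRE(get_kvm_param_bool("enable_vmware_backdoor"),
 *		       "VMware backdoor emulation not enabled");
 *	pr_info("halt_poll_ns = %d\n", get_kvm_param_integer("halt_poll_ns"));
 */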
150
151/*
152 * Capability
153 *
154 * Input Args:
155 *   cap - Capability
156 *
157 * Output Args: None
158 *
159 * Return:
 *   On success, the value corresponding to the capability (KVM_CAP_*)
 *   specified by cap.  On failure, a TEST_ASSERT failure is produced.
163 *
164 * Looks up and returns the value corresponding to the capability
165 * (KVM_CAP_*) given by cap.
166 */
167unsigned int kvm_check_cap(long cap)
168{
169	int ret;
170	int kvm_fd;
171
172	kvm_fd = open_kvm_dev_path_or_exit();
173	ret = __kvm_ioctl(kvm_fd, KVM_CHECK_EXTENSION, (void *)cap);
174	TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_CHECK_EXTENSION, ret));
175
176	close(kvm_fd);
177
178	return (unsigned int)ret;
179}
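
/*
 * Illustrative sketch: typical capability usage in a test.  kvm_has_cap()
 * (a thin header wrapper around kvm_check_cap()) suits boolean
 * capabilities, while the raw return value is useful for limits:
 *
 *	TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY));
 *	max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
 */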
180
181void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
182{
183	if (vm_check_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL))
184		vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING_ACQ_REL, ring_size);
185	else
186		vm_enable_cap(vm, KVM_CAP_DIRTY_LOG_RING, ring_size);
187	vm->dirty_ring_size = ring_size;
188}
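
/*
 * Illustrative sketch: the ring size is in bytes and holds an array of
 * struct kvm_dirty_gfn entries, so callers typically size it as an entry
 * count (0x10000 below is just an example) times the entry size, and later
 * map each vCPU's ring with vcpu_map_dirty_ring():
 *
 *	vm_enable_dirty_ring(vm, 0x10000 * sizeof(struct kvm_dirty_gfn));
 */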
189
190static void vm_open(struct kvm_vm *vm)
191{
192	vm->kvm_fd = _open_kvm_dev_path_or_exit(O_RDWR);
193
194	TEST_REQUIRE(kvm_has_cap(KVM_CAP_IMMEDIATE_EXIT));
195
196	vm->fd = __kvm_ioctl(vm->kvm_fd, KVM_CREATE_VM, (void *)vm->type);
197	TEST_ASSERT(vm->fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_VM, vm->fd));
198}
199
200const char *vm_guest_mode_string(uint32_t i)
201{
202	static const char * const strings[] = {
203		[VM_MODE_P52V48_4K]	= "PA-bits:52,  VA-bits:48,  4K pages",
204		[VM_MODE_P52V48_16K]	= "PA-bits:52,  VA-bits:48, 16K pages",
205		[VM_MODE_P52V48_64K]	= "PA-bits:52,  VA-bits:48, 64K pages",
206		[VM_MODE_P48V48_4K]	= "PA-bits:48,  VA-bits:48,  4K pages",
207		[VM_MODE_P48V48_16K]	= "PA-bits:48,  VA-bits:48, 16K pages",
208		[VM_MODE_P48V48_64K]	= "PA-bits:48,  VA-bits:48, 64K pages",
209		[VM_MODE_P40V48_4K]	= "PA-bits:40,  VA-bits:48,  4K pages",
210		[VM_MODE_P40V48_16K]	= "PA-bits:40,  VA-bits:48, 16K pages",
211		[VM_MODE_P40V48_64K]	= "PA-bits:40,  VA-bits:48, 64K pages",
212		[VM_MODE_PXXV48_4K]	= "PA-bits:ANY, VA-bits:48,  4K pages",
213		[VM_MODE_P47V64_4K]	= "PA-bits:47,  VA-bits:64,  4K pages",
214		[VM_MODE_P44V64_4K]	= "PA-bits:44,  VA-bits:64,  4K pages",
215		[VM_MODE_P36V48_4K]	= "PA-bits:36,  VA-bits:48,  4K pages",
216		[VM_MODE_P36V48_16K]	= "PA-bits:36,  VA-bits:48, 16K pages",
217		[VM_MODE_P36V48_64K]	= "PA-bits:36,  VA-bits:48, 64K pages",
218		[VM_MODE_P36V47_16K]	= "PA-bits:36,  VA-bits:47, 16K pages",
219	};
220	_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
221		       "Missing new mode strings?");
222
223	TEST_ASSERT(i < NUM_VM_MODES, "Guest mode ID %d too big", i);
224
225	return strings[i];
226}
227
228const struct vm_guest_mode_params vm_guest_mode_params[] = {
229	[VM_MODE_P52V48_4K]	= { 52, 48,  0x1000, 12 },
230	[VM_MODE_P52V48_16K]	= { 52, 48,  0x4000, 14 },
231	[VM_MODE_P52V48_64K]	= { 52, 48, 0x10000, 16 },
232	[VM_MODE_P48V48_4K]	= { 48, 48,  0x1000, 12 },
233	[VM_MODE_P48V48_16K]	= { 48, 48,  0x4000, 14 },
234	[VM_MODE_P48V48_64K]	= { 48, 48, 0x10000, 16 },
235	[VM_MODE_P40V48_4K]	= { 40, 48,  0x1000, 12 },
236	[VM_MODE_P40V48_16K]	= { 40, 48,  0x4000, 14 },
237	[VM_MODE_P40V48_64K]	= { 40, 48, 0x10000, 16 },
238	[VM_MODE_PXXV48_4K]	= {  0,  0,  0x1000, 12 },
239	[VM_MODE_P47V64_4K]	= { 47, 64,  0x1000, 12 },
240	[VM_MODE_P44V64_4K]	= { 44, 64,  0x1000, 12 },
241	[VM_MODE_P36V48_4K]	= { 36, 48,  0x1000, 12 },
242	[VM_MODE_P36V48_16K]	= { 36, 48,  0x4000, 14 },
243	[VM_MODE_P36V48_64K]	= { 36, 48, 0x10000, 16 },
244	[VM_MODE_P36V47_16K]	= { 36, 47,  0x4000, 14 },
245};
246_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
247	       "Missing new mode params?");
248
249/*
250 * Initializes vm->vpages_valid to match the canonical VA space of the
251 * architecture.
252 *
253 * The default implementation is valid for architectures which split the
254 * range addressed by a single page table into a low and high region
255 * based on the MSB of the VA. On architectures with this behavior
 * the VA region spans [0, 2^(va_bits - 1)) and [-(2^(va_bits - 1)), -1].
257 */
258__weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm)
259{
260	sparsebit_set_num(vm->vpages_valid,
261		0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
262	sparsebit_set_num(vm->vpages_valid,
263		(~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift,
264		(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
265}
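
/*
 * Worked example (assuming va_bits = 48 and 4K pages): the low half covers
 * guest virtual addresses [0, 0x00007fffffffffff], i.e. page indices
 * [0, 2^35), and the high half covers [0xffff800000000000, 0xffffffffffffffff],
 * i.e. the top 2^35 page indices.  Other page sizes shift the indices
 * accordingly.
 */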
266
267struct kvm_vm *____vm_create(struct vm_shape shape)
268{
269	struct kvm_vm *vm;
270
271	vm = calloc(1, sizeof(*vm));
272	TEST_ASSERT(vm != NULL, "Insufficient Memory");
273
274	INIT_LIST_HEAD(&vm->vcpus);
275	vm->regions.gpa_tree = RB_ROOT;
276	vm->regions.hva_tree = RB_ROOT;
277	hash_init(vm->regions.slot_hash);
278
279	vm->mode = shape.mode;
280	vm->type = shape.type;
281
282	vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits;
283	vm->va_bits = vm_guest_mode_params[vm->mode].va_bits;
284	vm->page_size = vm_guest_mode_params[vm->mode].page_size;
285	vm->page_shift = vm_guest_mode_params[vm->mode].page_shift;
286
287	/* Setup mode specific traits. */
288	switch (vm->mode) {
289	case VM_MODE_P52V48_4K:
290		vm->pgtable_levels = 4;
291		break;
292	case VM_MODE_P52V48_64K:
293		vm->pgtable_levels = 3;
294		break;
295	case VM_MODE_P48V48_4K:
296		vm->pgtable_levels = 4;
297		break;
298	case VM_MODE_P48V48_64K:
299		vm->pgtable_levels = 3;
300		break;
301	case VM_MODE_P40V48_4K:
302	case VM_MODE_P36V48_4K:
303		vm->pgtable_levels = 4;
304		break;
305	case VM_MODE_P40V48_64K:
306	case VM_MODE_P36V48_64K:
307		vm->pgtable_levels = 3;
308		break;
309	case VM_MODE_P52V48_16K:
310	case VM_MODE_P48V48_16K:
311	case VM_MODE_P40V48_16K:
312	case VM_MODE_P36V48_16K:
313		vm->pgtable_levels = 4;
314		break;
315	case VM_MODE_P36V47_16K:
316		vm->pgtable_levels = 3;
317		break;
318	case VM_MODE_PXXV48_4K:
319#ifdef __x86_64__
320		kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
321		kvm_init_vm_address_properties(vm);
322		/*
		 * Ignore KVM support for 5-level paging (vm->va_bits == 57);
		 * it doesn't take effect unless CR4.LA57 is set, which it
325		 * isn't for this mode (48-bit virtual address space).
326		 */
327		TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57,
328			    "Linear address width (%d bits) not supported",
329			    vm->va_bits);
330		pr_debug("Guest physical address width detected: %d\n",
331			 vm->pa_bits);
332		vm->pgtable_levels = 4;
333		vm->va_bits = 48;
334#else
335		TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms");
336#endif
337		break;
338	case VM_MODE_P47V64_4K:
339		vm->pgtable_levels = 5;
340		break;
341	case VM_MODE_P44V64_4K:
342		vm->pgtable_levels = 5;
343		break;
344	default:
345		TEST_FAIL("Unknown guest mode: 0x%x", vm->mode);
346	}
347
348#ifdef __aarch64__
349	TEST_ASSERT(!vm->type, "ARM doesn't support test-provided types");
350	if (vm->pa_bits != 40)
351		vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
352#endif
353
354	vm_open(vm);
355
356	/* Limit to VA-bit canonical virtual addresses. */
357	vm->vpages_valid = sparsebit_alloc();
358	vm_vaddr_populate_bitmap(vm);
359
360	/* Limit physical addresses to PA-bits. */
361	vm->max_gfn = vm_compute_max_gfn(vm);
362
363	/* Allocate and setup memory for guest. */
364	vm->vpages_mapped = sparsebit_alloc();
365
366	return vm;
367}
368
369static uint64_t vm_nr_pages_required(enum vm_guest_mode mode,
370				     uint32_t nr_runnable_vcpus,
371				     uint64_t extra_mem_pages)
372{
373	uint64_t page_size = vm_guest_mode_params[mode].page_size;
374	uint64_t nr_pages;
375
376	TEST_ASSERT(nr_runnable_vcpus,
377		    "Use vm_create_barebones() for VMs that _never_ have vCPUs");
378
379	TEST_ASSERT(nr_runnable_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS),
380		    "nr_vcpus = %d too large for host, max-vcpus = %d",
381		    nr_runnable_vcpus, kvm_check_cap(KVM_CAP_MAX_VCPUS));
382
383	/*
	 * Arbitrarily allocate 512 pages (2 MiB with 4 KiB pages) for the
385	 * test code and other per-VM assets that will be loaded into memslot0.
386	 */
387	nr_pages = 512;
388
389	/* Account for the per-vCPU stacks on behalf of the test. */
390	nr_pages += nr_runnable_vcpus * DEFAULT_STACK_PGS;
391
392	/*
393	 * Account for the number of pages needed for the page tables.  The
394	 * maximum page table size for a memory region will be when the
395	 * smallest page size is used. Considering each page contains x page
396	 * table descriptors, the total extra size for page tables (for extra
397	 * N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller
398	 * than N/x*2.
399	 */
400	nr_pages += (nr_pages + extra_mem_pages) / PTES_PER_MIN_PAGE * 2;
401
402	/* Account for the number of pages needed by ucall. */
403	nr_pages += ucall_nr_pages_required(page_size);
404
405	return vm_adjust_num_guest_pages(mode, nr_pages);
406}
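
/*
 * Worked example of the page table bound above (assuming 4K pages with
 * 8-byte descriptors, i.e. x = 512): mapping N extra pages costs
 * N/512 + N/512^2 + ... < 2 * N/512 pages of page tables, so one million
 * extra data pages need fewer than ~4000 pages (about 16 MiB) of page
 * tables.
 */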
407
408struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus,
409			   uint64_t nr_extra_pages)
410{
411	uint64_t nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus,
412						 nr_extra_pages);
413	struct userspace_mem_region *slot0;
414	struct kvm_vm *vm;
415	int i;
416
417	pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__,
418		 vm_guest_mode_string(shape.mode), shape.type, nr_pages);
419
420	vm = ____vm_create(shape);
421
422	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0);
423	for (i = 0; i < NR_MEM_REGIONS; i++)
424		vm->memslots[i] = 0;
425
426	kvm_vm_elf_load(vm, program_invocation_name);
427
428	/*
429	 * TODO: Add proper defines to protect the library's memslots, and then
430	 * carve out memslot1 for the ucall MMIO address.  KVM treats writes to
431	 * read-only memslots as MMIO, and creating a read-only memslot for the
432	 * MMIO region would prevent silently clobbering the MMIO region.
433	 */
434	slot0 = memslot2region(vm, 0);
435	ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size);
436
437	pr_info("Random seed: 0x%x\n", guest_random_seed);
438	guest_rng = new_guest_random_state(guest_random_seed);
439	sync_global_to_guest(vm, guest_rng);
440
441	kvm_arch_vm_post_create(vm);
442
443	return vm;
444}
445
/*
 * VM Create with customized parameters
 *
 * Input Args:
 *   shape - VM shape (mode, e.g. VM_MODE_P52V48_4K, and type)
 *   nr_vcpus - vCPU count
 *   extra_mem_pages - Non-slot0 physical memory total size
 *   guest_code - Guest entry point
 *
 * Output Args:
 *   vcpus - Array that is filled with pointers to the created vCPUs
 *
 * Return:
 *   Pointer to opaque structure that describes the created VM.
 *
 * Creates a VM with the mode and type specified by shape.  extra_mem_pages
 * is used only to size the page tables; this function does not allocate
 * memory for non-slot0 regions.
 */
465struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus,
466				      uint64_t extra_mem_pages,
467				      void *guest_code, struct kvm_vcpu *vcpus[])
468{
469	struct kvm_vm *vm;
470	int i;
471
472	TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array");
473
474	vm = __vm_create(shape, nr_vcpus, extra_mem_pages);
475
476	for (i = 0; i < nr_vcpus; ++i)
477		vcpus[i] = vm_vcpu_add(vm, i, guest_code);
478
479	return vm;
480}
481
482struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape,
483					       struct kvm_vcpu **vcpu,
484					       uint64_t extra_mem_pages,
485					       void *guest_code)
486{
487	struct kvm_vcpu *vcpus[1];
488	struct kvm_vm *vm;
489
490	vm = __vm_create_with_vcpus(shape, 1, extra_mem_pages, guest_code, vcpus);
491
492	*vcpu = vcpus[0];
493	return vm;
494}
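
/*
 * Illustrative sketch of the typical test skeleton built on these helpers
 * (guest_code is a test-provided function that finishes with GUEST_DONE()):
 *
 *	struct kvm_vcpu *vcpu;
 *	struct kvm_vm *vm = vm_create_with_one_vcpu(&vcpu, guest_code);
 *	struct ucall uc;
 *
 *	vcpu_run(vcpu);
 *	switch (get_ucall(vcpu, &uc)) {
 *	case UCALL_DONE:
 *		break;
 *	case UCALL_ABORT:
 *		REPORT_GUEST_ASSERT(uc);
 *	default:
 *		TEST_FAIL("Unexpected ucall");
 *	}
 *	kvm_vm_free(vm);
 */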
495
496/*
497 * VM Restart
498 *
499 * Input Args:
500 *   vm - VM that has been released before
501 *
502 * Output Args: None
503 *
 * Reopens the file descriptors associated with the VM and reinstates the
505 * global state, such as the irqchip and the memory regions that are mapped
506 * into the guest.
507 */
508void kvm_vm_restart(struct kvm_vm *vmp)
509{
510	int ctr;
511	struct userspace_mem_region *region;
512
513	vm_open(vmp);
514	if (vmp->has_irqchip)
515		vm_create_irqchip(vmp);
516
517	hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) {
518		int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION2, &region->region);
519
520		TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
521			    "  rc: %i errno: %i\n"
522			    "  slot: %u flags: 0x%x\n"
523			    "  guest_phys_addr: 0x%llx size: 0x%llx",
524			    ret, errno, region->region.slot,
525			    region->region.flags,
526			    region->region.guest_phys_addr,
527			    region->region.memory_size);
528	}
529}
530
531__weak struct kvm_vcpu *vm_arch_vcpu_recreate(struct kvm_vm *vm,
532					      uint32_t vcpu_id)
533{
534	return __vm_vcpu_add(vm, vcpu_id);
535}
536
537struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm)
538{
539	kvm_vm_restart(vm);
540
541	return vm_vcpu_recreate(vm, 0);
542}
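
/*
 * Illustrative sketch (x86-only helpers shown): the state save/restore
 * tests pair vm_recreate_with_one_vcpu() with vcpu_save_state() and
 * vcpu_load_state() to migrate a vCPU onto a fresh set of VM/vCPU file
 * descriptors backed by the same guest memory:
 *
 *	state = vcpu_save_state(vcpu);
 *	kvm_vm_release(vm);
 *	vcpu = vm_recreate_with_one_vcpu(vm);
 *	vcpu_load_state(vcpu, state);
 *	kvm_x86_state_cleanup(state);
 */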
543
544void kvm_pin_this_task_to_pcpu(uint32_t pcpu)
545{
546	cpu_set_t mask;
547	int r;
548
549	CPU_ZERO(&mask);
550	CPU_SET(pcpu, &mask);
551	r = sched_setaffinity(0, sizeof(mask), &mask);
552	TEST_ASSERT(!r, "sched_setaffinity() failed for pCPU '%u'.", pcpu);
553}
554
555static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask)
556{
557	uint32_t pcpu = atoi_non_negative("CPU number", cpu_str);
558
559	TEST_ASSERT(CPU_ISSET(pcpu, allowed_mask),
560		    "Not allowed to run on pCPU '%d', check cgroups?", pcpu);
561	return pcpu;
562}
563
564void kvm_print_vcpu_pinning_help(void)
565{
566	const char *name = program_invocation_name;
567
568	printf(" -c: Pin tasks to physical CPUs.  Takes a list of comma separated\n"
569	       "     values (target pCPU), one for each vCPU, plus an optional\n"
570	       "     entry for the main application task (specified via entry\n"
571	       "     <nr_vcpus + 1>).  If used, entries must be provided for all\n"
572	       "     vCPUs, i.e. pinning vCPUs is all or nothing.\n\n"
573	       "     E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n"
574	       "     vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n"
575	       "         %s -v 3 -c 22,23,24,50\n\n"
576	       "     To leave the application task unpinned, drop the final entry:\n\n"
577	       "         %s -v 3 -c 22,23,24\n\n"
578	       "     (default: no pinning)\n", name, name);
579}
580
581void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[],
582			    int nr_vcpus)
583{
584	cpu_set_t allowed_mask;
585	char *cpu, *cpu_list;
586	char delim[2] = ",";
587	int i, r;
588
589	cpu_list = strdup(pcpus_string);
590	TEST_ASSERT(cpu_list, "strdup() allocation failed.");
591
592	r = sched_getaffinity(0, sizeof(allowed_mask), &allowed_mask);
593	TEST_ASSERT(!r, "sched_getaffinity() failed");
594
595	cpu = strtok(cpu_list, delim);
596
597	/* 1. Get all pcpus for vcpus. */
598	for (i = 0; i < nr_vcpus; i++) {
599		TEST_ASSERT(cpu, "pCPU not provided for vCPU '%d'", i);
600		vcpu_to_pcpu[i] = parse_pcpu(cpu, &allowed_mask);
601		cpu = strtok(NULL, delim);
602	}
603
604	/* 2. Check if the main worker needs to be pinned. */
605	if (cpu) {
606		kvm_pin_this_task_to_pcpu(parse_pcpu(cpu, &allowed_mask));
607		cpu = strtok(NULL, delim);
608	}
609
610	TEST_ASSERT(!cpu, "pCPU list contains trailing garbage characters '%s'", cpu);
611	free(cpu_list);
612}
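
/*
 * Illustrative sketch: tests typically wire this up to a '-c' command line
 * option and pin each vCPU worker before entering its run loop
 * (pin_vcpus and vcpu_to_pcpu[] are test-local variables here):
 *
 *	case 'c':
 *		pin_vcpus = true;
 *		kvm_parse_vcpu_pinning(optarg, vcpu_to_pcpu, nr_vcpus);
 *		break;
 *	...
 *	if (pin_vcpus)
 *		kvm_pin_this_task_to_pcpu(vcpu_to_pcpu[vcpu_idx]);
 */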
613
614/*
615 * Userspace Memory Region Find
616 *
617 * Input Args:
618 *   vm - Virtual Machine
619 *   start - Starting VM physical address
620 *   end - Ending VM physical address, inclusive.
621 *
622 * Output Args: None
623 *
624 * Return:
625 *   Pointer to overlapping region, NULL if no such region.
626 *
627 * Searches for a region with any physical memory that overlaps with
628 * any portion of the guest physical addresses from start to end
629 * inclusive.  If multiple overlapping regions exist, a pointer to any
630 * of the regions is returned.  Null is returned only when no overlapping
631 * region exists.
632 */
633static struct userspace_mem_region *
634userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
635{
636	struct rb_node *node;
637
638	for (node = vm->regions.gpa_tree.rb_node; node; ) {
639		struct userspace_mem_region *region =
640			container_of(node, struct userspace_mem_region, gpa_node);
641		uint64_t existing_start = region->region.guest_phys_addr;
642		uint64_t existing_end = region->region.guest_phys_addr
643			+ region->region.memory_size - 1;
644		if (start <= existing_end && end >= existing_start)
645			return region;
646
647		if (start < existing_start)
648			node = node->rb_left;
649		else
650			node = node->rb_right;
651	}
652
653	return NULL;
654}
655
656__weak void vcpu_arch_free(struct kvm_vcpu *vcpu)
657{
658
659}
660
661/*
662 * VM VCPU Remove
663 *
664 * Input Args:
665 *   vcpu - VCPU to remove
666 *
667 * Output Args: None
668 *
669 * Return: None, TEST_ASSERT failures for all error conditions
670 *
671 * Removes a vCPU from a VM and frees its resources.
672 */
673static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
674{
675	int ret;
676
677	if (vcpu->dirty_gfns) {
678		ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size);
679		TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
680		vcpu->dirty_gfns = NULL;
681	}
682
683	ret = munmap(vcpu->run, vcpu_mmap_sz());
684	TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
685
686	ret = close(vcpu->fd);
687	TEST_ASSERT(!ret,  __KVM_SYSCALL_ERROR("close()", ret));
688
689	list_del(&vcpu->list);
690
691	vcpu_arch_free(vcpu);
692	free(vcpu);
693}
694
695void kvm_vm_release(struct kvm_vm *vmp)
696{
697	struct kvm_vcpu *vcpu, *tmp;
698	int ret;
699
700	list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
701		vm_vcpu_rm(vmp, vcpu);
702
703	ret = close(vmp->fd);
704	TEST_ASSERT(!ret,  __KVM_SYSCALL_ERROR("close()", ret));
705
706	ret = close(vmp->kvm_fd);
707	TEST_ASSERT(!ret,  __KVM_SYSCALL_ERROR("close()", ret));
708}
709
710static void __vm_mem_region_delete(struct kvm_vm *vm,
711				   struct userspace_mem_region *region,
712				   bool unlink)
713{
714	int ret;
715
716	if (unlink) {
717		rb_erase(&region->gpa_node, &vm->regions.gpa_tree);
718		rb_erase(&region->hva_node, &vm->regions.hva_tree);
719		hash_del(&region->slot_node);
720	}
721
722	region->region.memory_size = 0;
723	vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
724
725	sparsebit_free(&region->unused_phy_pages);
726	sparsebit_free(&region->protected_phy_pages);
727	ret = munmap(region->mmap_start, region->mmap_size);
728	TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
729	if (region->fd >= 0) {
730		/* There's an extra map when using shared memory. */
731		ret = munmap(region->mmap_alias, region->mmap_size);
732		TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret));
733		close(region->fd);
734	}
735	if (region->region.guest_memfd >= 0)
736		close(region->region.guest_memfd);
737
738	free(region);
739}
740
741/*
742 * Destroys and frees the VM pointed to by vmp.
743 */
744void kvm_vm_free(struct kvm_vm *vmp)
745{
746	int ctr;
747	struct hlist_node *node;
748	struct userspace_mem_region *region;
749
750	if (vmp == NULL)
751		return;
752
753	/* Free cached stats metadata and close FD */
754	if (vmp->stats_fd) {
755		free(vmp->stats_desc);
756		close(vmp->stats_fd);
757	}
758
759	/* Free userspace_mem_regions. */
760	hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node)
761		__vm_mem_region_delete(vmp, region, false);
762
763	/* Free sparsebit arrays. */
764	sparsebit_free(&vmp->vpages_valid);
765	sparsebit_free(&vmp->vpages_mapped);
766
767	kvm_vm_release(vmp);
768
769	/* Free the structure describing the VM. */
770	free(vmp);
771}
772
773int kvm_memfd_alloc(size_t size, bool hugepages)
774{
775	int memfd_flags = MFD_CLOEXEC;
776	int fd, r;
777
778	if (hugepages)
779		memfd_flags |= MFD_HUGETLB;
780
781	fd = memfd_create("kvm_selftest", memfd_flags);
782	TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd));
783
784	r = ftruncate(fd, size);
785	TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r));
786
787	r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size);
788	TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r));
789
790	return fd;
791}
792
793/*
794 * Memory Compare, host virtual to guest virtual
795 *
796 * Input Args:
797 *   hva - Starting host virtual address
798 *   vm - Virtual Machine
799 *   gva - Starting guest virtual address
800 *   len - number of bytes to compare
801 *
802 * Output Args: None
803 *
804 * Input/Output Args: None
805 *
806 * Return:
807 *   Returns 0 if the bytes starting at hva for a length of len
808 *   are equal the guest virtual bytes starting at gva.  Returns
809 *   a value < 0, if bytes at hva are less than those at gva.
810 *   Otherwise a value > 0 is returned.
811 *
812 * Compares the bytes starting at the host virtual address hva, for
813 * a length of len, to the guest bytes starting at the guest virtual
814 * address given by gva.
815 */
816int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
817{
818	size_t amt;
819
820	/*
821	 * Compare a batch of bytes until either a match is found
822	 * or all the bytes have been compared.
823	 */
824	for (uintptr_t offset = 0; offset < len; offset += amt) {
825		uintptr_t ptr1 = (uintptr_t)hva + offset;
826
827		/*
828		 * Determine host address for guest virtual address
829		 * at offset.
830		 */
831		uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset);
832
833		/*
834		 * Determine amount to compare on this pass.
		 * Don't allow the comparison to cross a page boundary.
836		 */
837		amt = len - offset;
838		if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift))
839			amt = vm->page_size - (ptr1 % vm->page_size);
840		if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift))
841			amt = vm->page_size - (ptr2 % vm->page_size);
842
843		assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift));
844		assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift));
845
846		/*
		 * Perform the comparison.  If there is a difference,
		 * return that result to the caller; otherwise continue
		 * looking for a mismatch.
850		 */
851		int ret = memcmp((void *)ptr1, (void *)ptr2, amt);
852		if (ret != 0)
853			return ret;
854	}
855
856	/*
857	 * No mismatch found.  Let the caller know the two memory
858	 * areas are equal.
859	 */
860	return 0;
861}
862
863static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree,
864					       struct userspace_mem_region *region)
865{
866	struct rb_node **cur, *parent;
867
868	for (cur = &gpa_tree->rb_node, parent = NULL; *cur; ) {
869		struct userspace_mem_region *cregion;
870
871		cregion = container_of(*cur, typeof(*cregion), gpa_node);
872		parent = *cur;
873		if (region->region.guest_phys_addr <
874		    cregion->region.guest_phys_addr)
875			cur = &(*cur)->rb_left;
876		else {
877			TEST_ASSERT(region->region.guest_phys_addr !=
878				    cregion->region.guest_phys_addr,
879				    "Duplicate GPA in region tree");
880
881			cur = &(*cur)->rb_right;
882		}
883	}
884
885	rb_link_node(&region->gpa_node, parent, cur);
886	rb_insert_color(&region->gpa_node, gpa_tree);
887}
888
889static void vm_userspace_mem_region_hva_insert(struct rb_root *hva_tree,
890					       struct userspace_mem_region *region)
891{
892	struct rb_node **cur, *parent;
893
894	for (cur = &hva_tree->rb_node, parent = NULL; *cur; ) {
895		struct userspace_mem_region *cregion;
896
897		cregion = container_of(*cur, typeof(*cregion), hva_node);
898		parent = *cur;
899		if (region->host_mem < cregion->host_mem)
900			cur = &(*cur)->rb_left;
901		else {
902			TEST_ASSERT(region->host_mem !=
903				    cregion->host_mem,
904				    "Duplicate HVA in region tree");
905
906			cur = &(*cur)->rb_right;
907		}
908	}
909
910	rb_link_node(&region->hva_node, parent, cur);
911	rb_insert_color(&region->hva_node, hva_tree);
912}
913
int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
916				uint64_t gpa, uint64_t size, void *hva)
917{
918	struct kvm_userspace_memory_region region = {
919		.slot = slot,
920		.flags = flags,
921		.guest_phys_addr = gpa,
922		.memory_size = size,
923		.userspace_addr = (uintptr_t)hva,
924	};
925
926	return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region);
927}
928
929void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
930			       uint64_t gpa, uint64_t size, void *hva)
931{
932	int ret = __vm_set_user_memory_region(vm, slot, flags, gpa, size, hva);
933
934	TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed, errno = %d (%s)",
935		    errno, strerror(errno));
936}
937
938#define TEST_REQUIRE_SET_USER_MEMORY_REGION2()			\
939	__TEST_REQUIRE(kvm_has_cap(KVM_CAP_USER_MEMORY2),	\
940		       "KVM selftests now require KVM_SET_USER_MEMORY_REGION2 (introduced in v6.8)")
941
942int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
943				 uint64_t gpa, uint64_t size, void *hva,
944				 uint32_t guest_memfd, uint64_t guest_memfd_offset)
945{
946	struct kvm_userspace_memory_region2 region = {
947		.slot = slot,
948		.flags = flags,
949		.guest_phys_addr = gpa,
950		.memory_size = size,
951		.userspace_addr = (uintptr_t)hva,
952		.guest_memfd = guest_memfd,
953		.guest_memfd_offset = guest_memfd_offset,
954	};
955
956	TEST_REQUIRE_SET_USER_MEMORY_REGION2();
957
958	return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, &region);
959}
960
961void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags,
962				uint64_t gpa, uint64_t size, void *hva,
963				uint32_t guest_memfd, uint64_t guest_memfd_offset)
964{
965	int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva,
966					       guest_memfd, guest_memfd_offset);
967
968	TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)",
969		    errno, strerror(errno));
970}
971
/* FIXME: This thing needs to be ripped apart and rewritten. */
974void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
975		uint64_t guest_paddr, uint32_t slot, uint64_t npages,
976		uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset)
977{
978	int ret;
979	struct userspace_mem_region *region;
980	size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
981	size_t mem_size = npages * vm->page_size;
982	size_t alignment;
983
984	TEST_REQUIRE_SET_USER_MEMORY_REGION2();
985
986	TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
987		"Number of guest pages is not compatible with the host. "
988		"Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));
989
990	TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical "
991		"address not on a page boundary.\n"
992		"  guest_paddr: 0x%lx vm->page_size: 0x%x",
993		guest_paddr, vm->page_size);
994	TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1)
995		<= vm->max_gfn, "Physical range beyond maximum "
996		"supported physical address,\n"
997		"  guest_paddr: 0x%lx npages: 0x%lx\n"
998		"  vm->max_gfn: 0x%lx vm->page_size: 0x%x",
999		guest_paddr, npages, vm->max_gfn, vm->page_size);
1000
1001	/*
1002	 * Confirm a mem region with an overlapping address doesn't
1003	 * already exist.
1004	 */
1005	region = (struct userspace_mem_region *) userspace_mem_region_find(
1006		vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1);
1007	if (region != NULL)
1008		TEST_FAIL("overlapping userspace_mem_region already "
1009			"exists\n"
1010			"  requested guest_paddr: 0x%lx npages: 0x%lx "
1011			"page_size: 0x%x\n"
1012			"  existing guest_paddr: 0x%lx size: 0x%lx",
1013			guest_paddr, npages, vm->page_size,
1014			(uint64_t) region->region.guest_phys_addr,
1015			(uint64_t) region->region.memory_size);
1016
1017	/* Confirm no region with the requested slot already exists. */
1018	hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
1019			       slot) {
1020		if (region->region.slot != slot)
1021			continue;
1022
1023		TEST_FAIL("A mem region with the requested slot "
1024			"already exists.\n"
1025			"  requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
1026			"  existing slot: %u paddr: 0x%lx size: 0x%lx",
1027			slot, guest_paddr, npages,
1028			region->region.slot,
1029			(uint64_t) region->region.guest_phys_addr,
1030			(uint64_t) region->region.memory_size);
1031	}
1032
1033	/* Allocate and initialize new mem region structure. */
1034	region = calloc(1, sizeof(*region));
1035	TEST_ASSERT(region != NULL, "Insufficient Memory");
1036	region->mmap_size = mem_size;
1037
1038#ifdef __s390x__
1039	/* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
1040	alignment = 0x100000;
1041#else
1042	alignment = 1;
1043#endif
1044
1045	/*
	 * When using THP, mmap() is not guaranteed to return a hugepage-aligned
	 * address, so pad the mmap.  Padding is not needed for HugeTLB
1048	 * because mmap will always return an address aligned to the HugeTLB
1049	 * page size.
1050	 */
1051	if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
1052		alignment = max(backing_src_pagesz, alignment);
1053
1054	TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz));
1055
1056	/* Add enough memory to align up if necessary */
1057	if (alignment > 1)
1058		region->mmap_size += alignment;
1059
1060	region->fd = -1;
1061	if (backing_src_is_shared(src_type))
1062		region->fd = kvm_memfd_alloc(region->mmap_size,
1063					     src_type == VM_MEM_SRC_SHARED_HUGETLB);
1064
1065	region->mmap_start = mmap(NULL, region->mmap_size,
1066				  PROT_READ | PROT_WRITE,
1067				  vm_mem_backing_src_alias(src_type)->flag,
1068				  region->fd, 0);
1069	TEST_ASSERT(region->mmap_start != MAP_FAILED,
1070		    __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED));
1071
1072	TEST_ASSERT(!is_backing_src_hugetlb(src_type) ||
1073		    region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz),
1074		    "mmap_start %p is not aligned to HugeTLB page size 0x%lx",
1075		    region->mmap_start, backing_src_pagesz);
1076
1077	/* Align host address */
1078	region->host_mem = align_ptr_up(region->mmap_start, alignment);
1079
1080	/* As needed perform madvise */
1081	if ((src_type == VM_MEM_SRC_ANONYMOUS ||
1082	     src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) {
1083		ret = madvise(region->host_mem, mem_size,
1084			      src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
1085		TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s",
1086			    region->host_mem, mem_size,
1087			    vm_mem_backing_src_alias(src_type)->name);
1088	}
1089
1090	region->backing_src_type = src_type;
1091
1092	if (flags & KVM_MEM_GUEST_MEMFD) {
1093		if (guest_memfd < 0) {
1094			uint32_t guest_memfd_flags = 0;
1095			TEST_ASSERT(!guest_memfd_offset,
1096				    "Offset must be zero when creating new guest_memfd");
1097			guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags);
1098		} else {
1099			/*
1100			 * Install a unique fd for each memslot so that the fd
1101			 * can be closed when the region is deleted without
1102			 * needing to track if the fd is owned by the framework
1103			 * or by the caller.
1104			 */
1105			guest_memfd = dup(guest_memfd);
1106			TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd));
1107		}
1108
1109		region->region.guest_memfd = guest_memfd;
1110		region->region.guest_memfd_offset = guest_memfd_offset;
1111	} else {
1112		region->region.guest_memfd = -1;
1113	}
1114
1115	region->unused_phy_pages = sparsebit_alloc();
1116	if (vm_arch_has_protected_memory(vm))
1117		region->protected_phy_pages = sparsebit_alloc();
1118	sparsebit_set_num(region->unused_phy_pages,
1119		guest_paddr >> vm->page_shift, npages);
1120	region->region.slot = slot;
1121	region->region.flags = flags;
1122	region->region.guest_phys_addr = guest_paddr;
1123	region->region.memory_size = npages * vm->page_size;
1124	region->region.userspace_addr = (uintptr_t) region->host_mem;
1125	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1126	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
1127		"  rc: %i errno: %i\n"
1128		"  slot: %u flags: 0x%x\n"
1129		"  guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d",
1130		ret, errno, slot, flags,
1131		guest_paddr, (uint64_t) region->region.memory_size,
1132		region->region.guest_memfd);
1133
1134	/* Add to quick lookup data structures */
1135	vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region);
1136	vm_userspace_mem_region_hva_insert(&vm->regions.hva_tree, region);
1137	hash_add(vm->regions.slot_hash, &region->slot_node, slot);
1138
1139	/* If shared memory, create an alias. */
1140	if (region->fd >= 0) {
1141		region->mmap_alias = mmap(NULL, region->mmap_size,
1142					  PROT_READ | PROT_WRITE,
1143					  vm_mem_backing_src_alias(src_type)->flag,
1144					  region->fd, 0);
1145		TEST_ASSERT(region->mmap_alias != MAP_FAILED,
1146			    __KVM_SYSCALL_ERROR("mmap()",  (int)(unsigned long)MAP_FAILED));
1147
1148		/* Align host alias address */
1149		region->host_alias = align_ptr_up(region->mmap_alias, alignment);
1150	}
1151}
1152
1153void vm_userspace_mem_region_add(struct kvm_vm *vm,
1154				 enum vm_mem_backing_src_type src_type,
1155				 uint64_t guest_paddr, uint32_t slot,
1156				 uint64_t npages, uint32_t flags)
1157{
1158	vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0);
1159}
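
/*
 * Illustrative sketch: adding a data memslot and mapping it into the guest.
 * The slot number, GPA, and page count below are arbitrary example values
 * chosen by the test, not requirements of the library:
 *
 *	#define TEST_SLOT	10
 *	#define TEST_GPA	0x10000000ul
 *	#define TEST_NPAGES	64
 *
 *	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, TEST_GPA,
 *				    TEST_SLOT, TEST_NPAGES, 0);
 *	virt_map(vm, TEST_GPA, TEST_GPA, TEST_NPAGES);
 *	memset(addr_gpa2hva(vm, TEST_GPA), 0xaa, TEST_NPAGES * vm->page_size);
 */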
1160
1161/*
1162 * Memslot to region
1163 *
1164 * Input Args:
1165 *   vm - Virtual Machine
1166 *   memslot - KVM memory slot ID
1167 *
1168 * Output Args: None
1169 *
1170 * Return:
1171 *   Pointer to memory region structure that describe memory region
1172 *   using kvm memory slot ID given by memslot.  TEST_ASSERT failure
1173 *   on error (e.g. currently no memory region using memslot as a KVM
1174 *   memory slot ID).
1175 */
1176struct userspace_mem_region *
1177memslot2region(struct kvm_vm *vm, uint32_t memslot)
1178{
1179	struct userspace_mem_region *region;
1180
1181	hash_for_each_possible(vm->regions.slot_hash, region, slot_node,
1182			       memslot)
1183		if (region->region.slot == memslot)
1184			return region;
1185
1186	fprintf(stderr, "No mem region with the requested slot found,\n"
1187		"  requested slot: %u\n", memslot);
1188	fputs("---- vm dump ----\n", stderr);
1189	vm_dump(stderr, vm, 2);
1190	TEST_FAIL("Mem region not found");
1191	return NULL;
1192}
1193
1194/*
1195 * VM Memory Region Flags Set
1196 *
 * Input Args:
 *   vm - Virtual Machine
 *   slot - Slot of the memory region whose flags are to be changed
 *   flags - Flags to set for the memory region (KVM_MEM_*)
 *
 * Output Args: None
 *
 * Return: None
 *
 * Sets the flags of the memory region specified by slot to the value
 * given by flags.
1207 */
1208void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
1209{
1210	int ret;
1211	struct userspace_mem_region *region;
1212
1213	region = memslot2region(vm, slot);
1214
1215	region->region.flags = flags;
1216
1217	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1218
1219	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n"
1220		"  rc: %i errno: %i slot: %u flags: 0x%x",
1221		ret, errno, slot, flags);
1222}
1223
1224/*
1225 * VM Memory Region Move
1226 *
1227 * Input Args:
1228 *   vm - Virtual Machine
1229 *   slot - Slot of the memory region to move
1230 *   new_gpa - Starting guest physical address
1231 *
1232 * Output Args: None
1233 *
1234 * Return: None
1235 *
1236 * Change the gpa of a memory region.
1237 */
1238void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
1239{
1240	struct userspace_mem_region *region;
1241	int ret;
1242
1243	region = memslot2region(vm, slot);
1244
1245	region->region.guest_phys_addr = new_gpa;
1246
1247	ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
1248
1249	TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed\n"
1250		    "ret: %i errno: %i slot: %u new_gpa: 0x%lx",
1251		    ret, errno, slot, new_gpa);
1252}
1253
1254/*
1255 * VM Memory Region Delete
1256 *
1257 * Input Args:
1258 *   vm - Virtual Machine
1259 *   slot - Slot of the memory region to delete
1260 *
1261 * Output Args: None
1262 *
1263 * Return: None
1264 *
1265 * Delete a memory region.
1266 */
1267void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
1268{
1269	__vm_mem_region_delete(vm, memslot2region(vm, slot), true);
1270}
1271
1272void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
1273			    bool punch_hole)
1274{
1275	const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0);
1276	struct userspace_mem_region *region;
1277	uint64_t end = base + size;
1278	uint64_t gpa, len;
1279	off_t fd_offset;
1280	int ret;
1281
1282	for (gpa = base; gpa < end; gpa += len) {
1283		uint64_t offset;
1284
1285		region = userspace_mem_region_find(vm, gpa, gpa);
1286		TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
1287			    "Private memory region not found for GPA 0x%lx", gpa);
1288
1289		offset = gpa - region->region.guest_phys_addr;
1290		fd_offset = region->region.guest_memfd_offset + offset;
1291		len = min_t(uint64_t, end - gpa, region->region.memory_size - offset);
1292
1293		ret = fallocate(region->region.guest_memfd, mode, fd_offset, len);
1294		TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx",
1295			    punch_hole ? "punch hole" : "allocate", gpa, len,
1296			    region->region.guest_memfd, mode, fd_offset);
1297	}
1298}
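
/*
 * Illustrative sketch: tests that exercise guest_memfd backed (private)
 * memory use this helper to emulate conversions, allocating backing when a
 * range becomes private and punching a hole when it becomes shared again
 * (gpa and size are example variables):
 *
 *	vm_guest_mem_fallocate(vm, gpa, size, false);	// allocate backing
 *	vm_guest_mem_fallocate(vm, gpa, size, true);	// punch a hole
 */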
1299
1300/* Returns the size of a vCPU's kvm_run structure. */
1301static int vcpu_mmap_sz(void)
1302{
1303	int dev_fd, ret;
1304
1305	dev_fd = open_kvm_dev_path_or_exit();
1306
1307	ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
1308	TEST_ASSERT(ret >= sizeof(struct kvm_run),
1309		    KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, ret));
1310
1311	close(dev_fd);
1312
1313	return ret;
1314}
1315
1316static bool vcpu_exists(struct kvm_vm *vm, uint32_t vcpu_id)
1317{
1318	struct kvm_vcpu *vcpu;
1319
1320	list_for_each_entry(vcpu, &vm->vcpus, list) {
1321		if (vcpu->id == vcpu_id)
1322			return true;
1323	}
1324
1325	return false;
1326}
1327
1328/*
1329 * Adds a virtual CPU to the VM specified by vm with the ID given by vcpu_id.
1330 * No additional vCPU setup is done.  Returns the vCPU.
1331 */
1332struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
1333{
1334	struct kvm_vcpu *vcpu;
1335
1336	/* Confirm a vcpu with the specified id doesn't already exist. */
1337	TEST_ASSERT(!vcpu_exists(vm, vcpu_id), "vCPU%d already exists", vcpu_id);
1338
1339	/* Allocate and initialize new vcpu structure. */
1340	vcpu = calloc(1, sizeof(*vcpu));
1341	TEST_ASSERT(vcpu != NULL, "Insufficient Memory");
1342
1343	vcpu->vm = vm;
1344	vcpu->id = vcpu_id;
1345	vcpu->fd = __vm_ioctl(vm, KVM_CREATE_VCPU, (void *)(unsigned long)vcpu_id);
1346	TEST_ASSERT_VM_VCPU_IOCTL(vcpu->fd >= 0, KVM_CREATE_VCPU, vcpu->fd, vm);
1347
1348	TEST_ASSERT(vcpu_mmap_sz() >= sizeof(*vcpu->run), "vcpu mmap size "
1349		"smaller than expected, vcpu_mmap_sz: %i expected_min: %zi",
1350		vcpu_mmap_sz(), sizeof(*vcpu->run));
1351	vcpu->run = (struct kvm_run *) mmap(NULL, vcpu_mmap_sz(),
1352		PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0);
1353	TEST_ASSERT(vcpu->run != MAP_FAILED,
1354		    __KVM_SYSCALL_ERROR("mmap()", (int)(unsigned long)MAP_FAILED));
1355
1356	/* Add to linked-list of VCPUs. */
1357	list_add(&vcpu->list, &vm->vcpus);
1358
1359	return vcpu;
1360}
1361
1362/*
1363 * VM Virtual Address Unused Gap
1364 *
1365 * Input Args:
1366 *   vm - Virtual Machine
1367 *   sz - Size (bytes)
1368 *   vaddr_min - Minimum Virtual Address
1369 *
1370 * Output Args: None
1371 *
1372 * Return:
 *   Lowest virtual address >= vaddr_min, with at least sz unused
 *   bytes.  TEST_ASSERT failure if no area of at least size sz
 *   is available.
1376 *
1377 * Within the VM specified by vm, locates the lowest starting virtual
1378 * address >= vaddr_min, that has at least sz unallocated bytes.  A
1379 * TEST_ASSERT failure occurs for invalid input or no area of at least
1380 * sz unallocated bytes >= vaddr_min is available.
1381 */
1382vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
1383			       vm_vaddr_t vaddr_min)
1384{
1385	uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
1386
1387	/* Determine lowest permitted virtual page index. */
1388	uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
1389	if ((pgidx_start * vm->page_size) < vaddr_min)
1390		goto no_va_found;
1391
1392	/* Loop over section with enough valid virtual page indexes. */
1393	if (!sparsebit_is_set_num(vm->vpages_valid,
1394		pgidx_start, pages))
1395		pgidx_start = sparsebit_next_set_num(vm->vpages_valid,
1396			pgidx_start, pages);
1397	do {
1398		/*
1399		 * Are there enough unused virtual pages available at
		 * the currently proposed starting virtual page index?
		 * If not, adjust the proposed starting index to the
		 * next possibility.
1403		 */
1404		if (sparsebit_is_clear_num(vm->vpages_mapped,
1405			pgidx_start, pages))
1406			goto va_found;
1407		pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped,
1408			pgidx_start, pages);
1409		if (pgidx_start == 0)
1410			goto no_va_found;
1411
1412		/*
1413		 * If needed, adjust proposed starting virtual address,
1414		 * to next range of valid virtual addresses.
1415		 */
1416		if (!sparsebit_is_set_num(vm->vpages_valid,
1417			pgidx_start, pages)) {
1418			pgidx_start = sparsebit_next_set_num(
1419				vm->vpages_valid, pgidx_start, pages);
1420			if (pgidx_start == 0)
1421				goto no_va_found;
1422		}
1423	} while (pgidx_start != 0);
1424
1425no_va_found:
1426	TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages);
1427
1428	/* NOT REACHED */
1429	return -1;
1430
1431va_found:
1432	TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid,
1433		pgidx_start, pages),
1434		"Unexpected, invalid virtual page index range,\n"
1435		"  pgidx_start: 0x%lx\n"
1436		"  pages: 0x%lx",
1437		pgidx_start, pages);
1438	TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped,
1439		pgidx_start, pages),
1440		"Unexpected, pages already mapped,\n"
1441		"  pgidx_start: 0x%lx\n"
1442		"  pages: 0x%lx",
1443		pgidx_start, pages);
1444
1445	return pgidx_start * vm->page_size;
1446}
1447
1448static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz,
1449				     vm_vaddr_t vaddr_min,
1450				     enum kvm_mem_region_type type,
1451				     bool protected)
1452{
1453	uint64_t pages = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
1454
1455	virt_pgd_alloc(vm);
1456	vm_paddr_t paddr = __vm_phy_pages_alloc(vm, pages,
1457						KVM_UTIL_MIN_PFN * vm->page_size,
1458						vm->memslots[type], protected);
1459
1460	/*
1461	 * Find an unused range of virtual page addresses of at least
1462	 * pages in length.
1463	 */
1464	vm_vaddr_t vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
1465
1466	/* Map the virtual pages. */
1467	for (vm_vaddr_t vaddr = vaddr_start; pages > 0;
1468		pages--, vaddr += vm->page_size, paddr += vm->page_size) {
1469
1470		virt_pg_map(vm, vaddr, paddr);
1471
1472		sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
1473	}
1474
1475	return vaddr_start;
1476}
1477
1478vm_vaddr_t __vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
1479			    enum kvm_mem_region_type type)
1480{
1481	return ____vm_vaddr_alloc(vm, sz, vaddr_min, type,
1482				  vm_arch_has_protected_memory(vm));
1483}
1484
1485vm_vaddr_t vm_vaddr_alloc_shared(struct kvm_vm *vm, size_t sz,
1486				 vm_vaddr_t vaddr_min,
1487				 enum kvm_mem_region_type type)
1488{
1489	return ____vm_vaddr_alloc(vm, sz, vaddr_min, type, false);
1490}
1491
1492/*
1493 * VM Virtual Address Allocate
1494 *
1495 * Input Args:
1496 *   vm - Virtual Machine
1497 *   sz - Size in bytes
1498 *   vaddr_min - Minimum starting virtual address
1499 *
1500 * Output Args: None
1501 *
1502 * Return:
1503 *   Starting guest virtual address
1504 *
1505 * Allocates at least sz bytes within the virtual address space of the vm
1506 * given by vm.  The allocated bytes are mapped to a virtual address >=
 * the address given by vaddr_min.  Note that each allocation uses a
 * unique set of pages, with the minimum real allocation being at least
1509 * a page. The allocated physical space comes from the TEST_DATA memory region.
1510 */
1511vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min)
1512{
1513	return __vm_vaddr_alloc(vm, sz, vaddr_min, MEM_REGION_TEST_DATA);
1514}
1515
1516/*
1517 * VM Virtual Address Allocate Pages
1518 *
1519 * Input Args:
1520 *   vm - Virtual Machine
1521 *
1522 * Output Args: None
1523 *
1524 * Return:
1525 *   Starting guest virtual address
1526 *
1527 * Allocates at least N system pages worth of bytes within the virtual address
1528 * space of the vm.
1529 */
1530vm_vaddr_t vm_vaddr_alloc_pages(struct kvm_vm *vm, int nr_pages)
1531{
1532	return vm_vaddr_alloc(vm, nr_pages * getpagesize(), KVM_UTIL_MIN_VADDR);
1533}
1534
1535vm_vaddr_t __vm_vaddr_alloc_page(struct kvm_vm *vm, enum kvm_mem_region_type type)
1536{
1537	return __vm_vaddr_alloc(vm, getpagesize(), KVM_UTIL_MIN_VADDR, type);
1538}
1539
1540/*
1541 * VM Virtual Address Allocate Page
1542 *
1543 * Input Args:
1544 *   vm - Virtual Machine
1545 *
1546 * Output Args: None
1547 *
1548 * Return:
1549 *   Starting guest virtual address
1550 *
1551 * Allocates at least one system page worth of bytes within the virtual address
1552 * space of the vm.
1553 */
1554vm_vaddr_t vm_vaddr_alloc_page(struct kvm_vm *vm)
1555{
1556	return vm_vaddr_alloc_pages(vm, 1);
1557}
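
/*
 * Illustrative sketch: allocate a buffer in the guest, initialize it from
 * the host, and hand the GVA to the guest via vcpu_args_set() (the guest's
 * entry point is assumed to take the GVA as its first argument):
 *
 *	vm_vaddr_t gva = vm_vaddr_alloc_page(vm);
 *
 *	memset(addr_gva2hva(vm, gva), 0, getpagesize());
 *	vcpu_args_set(vcpu, 1, gva);
 */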
1558
1559/*
1560 * Map a range of VM virtual address to the VM's physical address
1561 *
1562 * Input Args:
1563 *   vm - Virtual Machine
 *   vaddr - Virtual address to map
1565 *   paddr - VM Physical Address
1566 *   npages - The number of pages to map
1567 *
1568 * Output Args: None
1569 *
1570 * Return: None
1571 *
1572 * Within the VM given by @vm, creates a virtual translation for
1573 * @npages starting at @vaddr to the page range starting at @paddr.
1574 */
1575void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
1576	      unsigned int npages)
1577{
1578	size_t page_size = vm->page_size;
1579	size_t size = npages * page_size;
1580
1581	TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow");
1582	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
1583
1584	while (npages--) {
1585		virt_pg_map(vm, vaddr, paddr);
1586		sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
1587
1588		vaddr += page_size;
1589		paddr += page_size;
1590	}
1591}
1592
1593/*
1594 * Address VM Physical to Host Virtual
1595 *
1596 * Input Args:
1597 *   vm - Virtual Machine
1598 *   gpa - VM physical address
1599 *
1600 * Output Args: None
1601 *
1602 * Return:
1603 *   Equivalent host virtual address
1604 *
1605 * Locates the memory region containing the VM physical address given
1606 * by gpa, within the VM given by vm.  When found, the host virtual
1607 * address providing the memory to the vm physical address is returned.
1608 * A TEST_ASSERT failure occurs if no region containing gpa exists.
1609 */
1610void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
1611{
1612	struct userspace_mem_region *region;
1613
1614	gpa = vm_untag_gpa(vm, gpa);
1615
1616	region = userspace_mem_region_find(vm, gpa, gpa);
1617	if (!region) {
1618		TEST_FAIL("No vm physical memory at 0x%lx", gpa);
1619		return NULL;
1620	}
1621
1622	return (void *)((uintptr_t)region->host_mem
1623		+ (gpa - region->region.guest_phys_addr));
1624}
1625
1626/*
1627 * Address Host Virtual to VM Physical
1628 *
1629 * Input Args:
1630 *   vm - Virtual Machine
1631 *   hva - Host virtual address
1632 *
1633 * Output Args: None
1634 *
1635 * Return:
1636 *   Equivalent VM physical address
1637 *
1638 * Locates the memory region containing the host virtual address given
1639 * by hva, within the VM given by vm.  When found, the equivalent
1640 * VM physical address is returned. A TEST_ASSERT failure occurs if no
1641 * region containing hva exists.
1642 */
1643vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
1644{
1645	struct rb_node *node;
1646
1647	for (node = vm->regions.hva_tree.rb_node; node; ) {
1648		struct userspace_mem_region *region =
1649			container_of(node, struct userspace_mem_region, hva_node);
1650
1651		if (hva >= region->host_mem) {
1652			if (hva <= (region->host_mem
1653				+ region->region.memory_size - 1))
1654				return (vm_paddr_t)((uintptr_t)
1655					region->region.guest_phys_addr
1656					+ (hva - (uintptr_t)region->host_mem));
1657
1658			node = node->rb_right;
1659		} else
1660			node = node->rb_left;
1661	}
1662
1663	TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
1664	return -1;
1665}
1666
1667/*
1668 * Address VM physical to Host Virtual *alias*.
1669 *
1670 * Input Args:
1671 *   vm - Virtual Machine
1672 *   gpa - VM physical address
1673 *
1674 * Output Args: None
1675 *
1676 * Return:
1677 *   Equivalent address within the host virtual *alias* area, or NULL
1678 *   (without failing the test) if the guest memory is not shared (so
1679 *   no alias exists).
1680 *
1681 * Create a writable, shared virtual=>physical alias for the specific GPA.
1682 * The primary use case is to allow the host selftest to manipulate guest
1683 * memory without mapping said memory in the guest's address space. And, for
1684 * userfaultfd-based demand paging, to do so without triggering userfaults.
1685 */
1686void *addr_gpa2alias(struct kvm_vm *vm, vm_paddr_t gpa)
1687{
1688	struct userspace_mem_region *region;
1689	uintptr_t offset;
1690
1691	region = userspace_mem_region_find(vm, gpa, gpa);
1692	if (!region)
1693		return NULL;
1694
1695	if (!region->host_alias)
1696		return NULL;
1697
1698	offset = gpa - region->region.guest_phys_addr;
1699	return (void *) ((uintptr_t) region->host_alias + offset);
1700}
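
/*
 * Illustrative sketch: demand paging style tests write guest memory through
 * the alias so that host-side population doesn't touch (or trigger
 * userfaults on) the primary mapping:
 *
 *	hva = addr_gpa2alias(vm, gpa);
 *	TEST_ASSERT(hva, "GPA 0x%lx is not backed by shared memory", gpa);
 *	memset(hva, 0xfe, size);
 */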
1701
1702/* Create an interrupt controller chip for the specified VM. */
1703void vm_create_irqchip(struct kvm_vm *vm)
1704{
1705	vm_ioctl(vm, KVM_CREATE_IRQCHIP, NULL);
1706
1707	vm->has_irqchip = true;
1708}
1709
1710int _vcpu_run(struct kvm_vcpu *vcpu)
1711{
1712	int rc;
1713
1714	do {
1715		rc = __vcpu_run(vcpu);
1716	} while (rc == -1 && errno == EINTR);
1717
1718	assert_on_unhandled_exception(vcpu);
1719
1720	return rc;
1721}
1722
1723/*
1724 * Invoke KVM_RUN on a vCPU until KVM returns something other than -EINTR.
 * Assert if KVM returns an error (other than -EINTR).
1726 */
1727void vcpu_run(struct kvm_vcpu *vcpu)
1728{
1729	int ret = _vcpu_run(vcpu);
1730
1731	TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_RUN, ret));
1732}
1733
1734void vcpu_run_complete_io(struct kvm_vcpu *vcpu)
1735{
1736	int ret;
1737
1738	vcpu->run->immediate_exit = 1;
1739	ret = __vcpu_run(vcpu);
1740	vcpu->run->immediate_exit = 0;
1741
1742	TEST_ASSERT(ret == -1 && errno == EINTR,
1743		    "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i",
1744		    ret, errno);
1745}
1746
1747/*
1748 * Get the list of guest registers which are supported for
 * KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls.  Returns a kvm_reg_list pointer;
 * it is the caller's responsibility to free the list.
1751 */
1752struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vcpu *vcpu)
1753{
1754	struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list;
1755	int ret;
1756
1757	ret = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, &reg_list_n);
1758	TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0");
1759
	reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64));
	TEST_ASSERT(reg_list, "Insufficient Memory");
	reg_list->n = reg_list_n.n;
1762	vcpu_ioctl(vcpu, KVM_GET_REG_LIST, reg_list);
1763	return reg_list;
1764}
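
/*
 * Illustrative sketch: walking the returned list, which the caller must
 * free when done:
 *
 *	struct kvm_reg_list *list = vcpu_get_reg_list(vcpu);
 *
 *	for (i = 0; i < list->n; i++)
 *		pr_info("reg[%d] = 0x%llx\n", i, list->reg[i]);
 *	free(list);
 */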
1765
1766void *vcpu_map_dirty_ring(struct kvm_vcpu *vcpu)
1767{
1768	uint32_t page_size = getpagesize();
1769	uint32_t size = vcpu->vm->dirty_ring_size;
1770
1771	TEST_ASSERT(size > 0, "Should enable dirty ring first");
1772
1773	if (!vcpu->dirty_gfns) {
1774		void *addr;
1775
1776		addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, vcpu->fd,
1777			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1778		TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped private");
1779
1780		addr = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_PRIVATE, vcpu->fd,
1781			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1782		TEST_ASSERT(addr == MAP_FAILED, "Dirty ring mapped exec");
1783
1784		addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd,
1785			    page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
1786		TEST_ASSERT(addr != MAP_FAILED, "Dirty ring map failed");
1787
1788		vcpu->dirty_gfns = addr;
1789		vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn);
1790	}
1791
1792	return vcpu->dirty_gfns;
1793}
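
/*
 * Illustrative sketch (simplified; see dirty_log_test.c for the complete
 * collection and reset protocol): scan the mapped ring for entries flagged
 * KVM_DIRTY_GFN_F_DIRTY, then return harvested entries to KVM via the
 * KVM_RESET_DIRTY_RINGS VM ioctl:
 *
 *	struct kvm_dirty_gfn *gfns = vcpu_map_dirty_ring(vcpu);
 *	struct kvm_dirty_gfn *gfn = &gfns[index & (vcpu->dirty_gfns_count - 1)];
 *
 *	if (gfn->flags & KVM_DIRTY_GFN_F_DIRTY)
 *		pr_info("dirty: slot %u offset 0x%llx\n", gfn->slot, gfn->offset);
 */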

/*
 * Device Ioctl
 */

int __kvm_has_device_attr(int dev_fd, uint32_t group, uint64_t attr)
{
	struct kvm_device_attr attribute = {
		.group = group,
		.attr = attr,
		.flags = 0,
	};

	return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute);
}

int __kvm_test_create_device(struct kvm_vm *vm, uint64_t type)
{
	struct kvm_create_device create_dev = {
		.type = type,
		.flags = KVM_CREATE_DEVICE_TEST,
	};

	return __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
}

int __kvm_create_device(struct kvm_vm *vm, uint64_t type)
{
	struct kvm_create_device create_dev = {
		.type = type,
		.fd = -1,
		.flags = 0,
	};
	int err;

	err = __vm_ioctl(vm, KVM_CREATE_DEVICE, &create_dev);
	TEST_ASSERT(err <= 0, "KVM_CREATE_DEVICE shouldn't return a positive value");
	return err ? : create_dev.fd;
}

int __kvm_device_attr_get(int dev_fd, uint32_t group, uint64_t attr, void *val)
{
	struct kvm_device_attr kvmattr = {
		.group = group,
		.attr = attr,
		.flags = 0,
		.addr = (uintptr_t)val,
	};

	return __kvm_ioctl(dev_fd, KVM_GET_DEVICE_ATTR, &kvmattr);
}

int __kvm_device_attr_set(int dev_fd, uint32_t group, uint64_t attr, void *val)
{
	struct kvm_device_attr kvmattr = {
		.group = group,
		.attr = attr,
		.flags = 0,
		.addr = (uintptr_t)val,
	};

	return __kvm_ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &kvmattr);
}
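
/*
 * A sketch of tying the device helpers together; the device type is just an
 * example and "group"/"attr" are placeholders for the architecture-specific
 * attribute constants a real test would use:
 *
 *	int dev_fd = __kvm_create_device(vm, KVM_DEV_TYPE_VFIO);
 *	uint64_t val;
 *	int r;
 *
 *	TEST_ASSERT(dev_fd >= 0, "KVM_CREATE_DEVICE failed: %d", dev_fd);
 *	if (!__kvm_has_device_attr(dev_fd, group, attr)) {
 *		r = __kvm_device_attr_get(dev_fd, group, attr, &val);
 *		TEST_ASSERT(!r, "KVM_GET_DEVICE_ATTR failed: %d", r);
 *	}
 */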

/*
 * IRQ related functions.
 */

int _kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
{
	struct kvm_irq_level irq_level = {
		.irq    = irq,
		.level  = level,
	};

	return __vm_ioctl(vm, KVM_IRQ_LINE, &irq_level);
}

void kvm_irq_line(struct kvm_vm *vm, uint32_t irq, int level)
{
	int ret = _kvm_irq_line(vm, irq, level);

	TEST_ASSERT(ret >= 0, KVM_IOCTL_ERROR(KVM_IRQ_LINE, ret));
}

struct kvm_irq_routing *kvm_gsi_routing_create(void)
{
	struct kvm_irq_routing *routing;
	size_t size;

	size = sizeof(struct kvm_irq_routing);
	/* Allocate space for the max number of entries: this wastes 196 KBs. */
	size += KVM_MAX_IRQ_ROUTES * sizeof(struct kvm_irq_routing_entry);
	routing = calloc(1, size);
	assert(routing);

	return routing;
}

void kvm_gsi_routing_irqchip_add(struct kvm_irq_routing *routing,
		uint32_t gsi, uint32_t pin)
{
	int i;

	assert(routing);
	assert(routing->nr < KVM_MAX_IRQ_ROUTES);

	i = routing->nr;
	routing->entries[i].gsi = gsi;
	routing->entries[i].type = KVM_IRQ_ROUTING_IRQCHIP;
	routing->entries[i].flags = 0;
	routing->entries[i].u.irqchip.irqchip = 0;
	routing->entries[i].u.irqchip.pin = pin;
	routing->nr++;
}

int _kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
{
	int ret;

	assert(routing);
	ret = __vm_ioctl(vm, KVM_SET_GSI_ROUTING, routing);
	free(routing);

	return ret;
}

void kvm_gsi_routing_write(struct kvm_vm *vm, struct kvm_irq_routing *routing)
{
	int ret;

	ret = _kvm_gsi_routing_write(vm, routing);
	TEST_ASSERT(!ret, KVM_IOCTL_ERROR(KVM_SET_GSI_ROUTING, ret));
}
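
/*
 * A sketch of the intended GSI routing flow; "gsi" and "pin" are
 * placeholders.  Note that the write helpers consume (free) the routing
 * table, so it must not be reused afterwards:
 *
 *	struct kvm_irq_routing *routing = kvm_gsi_routing_create();
 *
 *	kvm_gsi_routing_irqchip_add(routing, gsi, pin);
 *	kvm_gsi_routing_write(vm, routing);
 */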

/*
 * VM Dump
 *
 * Input Args:
 *   vm - Virtual Machine
 *   indent - Left margin indent amount
 *
 * Output Args:
 *   stream - Output FILE stream
 *
 * Return: None
 *
 * Dumps the current state of the VM given by vm, to the FILE stream
 * given by stream.
 */
void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
{
	int ctr;
	struct userspace_mem_region *region;
	struct kvm_vcpu *vcpu;

	fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
	fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
	fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
	fprintf(stream, "%*sMem Regions:\n", indent, "");
	hash_for_each(vm->regions.slot_hash, ctr, region, slot_node) {
		fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
			"host_virt: %p\n", indent + 2, "",
			(uint64_t) region->region.guest_phys_addr,
			(uint64_t) region->region.memory_size,
			region->host_mem);
		fprintf(stream, "%*sunused_phy_pages: ", indent + 2, "");
		sparsebit_dump(stream, region->unused_phy_pages, 0);
		if (region->protected_phy_pages) {
			fprintf(stream, "%*sprotected_phy_pages: ", indent + 2, "");
			sparsebit_dump(stream, region->protected_phy_pages, 0);
		}
	}
	fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
	sparsebit_dump(stream, vm->vpages_mapped, indent + 2);
	fprintf(stream, "%*spgd_created: %u\n", indent, "",
		vm->pgd_created);
	if (vm->pgd_created) {
		fprintf(stream, "%*sVirtual Translation Tables:\n",
			indent + 2, "");
		virt_dump(stream, vm, indent + 4);
	}
	fprintf(stream, "%*sVCPUs:\n", indent, "");

	list_for_each_entry(vcpu, &vm->vcpus, list)
		vcpu_dump(stream, vcpu, indent + 2);
}

#define KVM_EXIT_STRING(x) {KVM_EXIT_##x, #x}

/* Known KVM exit reasons */
static struct exit_reason {
	unsigned int reason;
	const char *name;
} exit_reasons_known[] = {
	KVM_EXIT_STRING(UNKNOWN),
	KVM_EXIT_STRING(EXCEPTION),
	KVM_EXIT_STRING(IO),
	KVM_EXIT_STRING(HYPERCALL),
	KVM_EXIT_STRING(DEBUG),
	KVM_EXIT_STRING(HLT),
	KVM_EXIT_STRING(MMIO),
	KVM_EXIT_STRING(IRQ_WINDOW_OPEN),
	KVM_EXIT_STRING(SHUTDOWN),
	KVM_EXIT_STRING(FAIL_ENTRY),
	KVM_EXIT_STRING(INTR),
	KVM_EXIT_STRING(SET_TPR),
	KVM_EXIT_STRING(TPR_ACCESS),
	KVM_EXIT_STRING(S390_SIEIC),
	KVM_EXIT_STRING(S390_RESET),
	KVM_EXIT_STRING(DCR),
	KVM_EXIT_STRING(NMI),
	KVM_EXIT_STRING(INTERNAL_ERROR),
	KVM_EXIT_STRING(OSI),
	KVM_EXIT_STRING(PAPR_HCALL),
	KVM_EXIT_STRING(S390_UCONTROL),
	KVM_EXIT_STRING(WATCHDOG),
	KVM_EXIT_STRING(S390_TSCH),
	KVM_EXIT_STRING(EPR),
	KVM_EXIT_STRING(SYSTEM_EVENT),
	KVM_EXIT_STRING(S390_STSI),
	KVM_EXIT_STRING(IOAPIC_EOI),
	KVM_EXIT_STRING(HYPERV),
	KVM_EXIT_STRING(ARM_NISV),
	KVM_EXIT_STRING(X86_RDMSR),
	KVM_EXIT_STRING(X86_WRMSR),
	KVM_EXIT_STRING(DIRTY_RING_FULL),
	KVM_EXIT_STRING(AP_RESET_HOLD),
	KVM_EXIT_STRING(X86_BUS_LOCK),
	KVM_EXIT_STRING(XEN),
	KVM_EXIT_STRING(RISCV_SBI),
	KVM_EXIT_STRING(RISCV_CSR),
	KVM_EXIT_STRING(NOTIFY),
#ifdef KVM_EXIT_MEMORY_NOT_PRESENT
	KVM_EXIT_STRING(MEMORY_NOT_PRESENT),
#endif
};

/*
 * Exit Reason String
 *
 * Input Args:
 *   exit_reason - Exit reason
 *
 * Output Args: None
 *
 * Return:
 *   Constant string pointer describing the exit reason.
 *
 * Locates and returns a constant string that describes the KVM exit
 * reason given by exit_reason.  If no such string is found, a constant
 * string of "Unknown" is returned.
 */
const char *exit_reason_str(unsigned int exit_reason)
{
	unsigned int n1;

	for (n1 = 0; n1 < ARRAY_SIZE(exit_reasons_known); n1++) {
		if (exit_reason == exit_reasons_known[n1].reason)
			return exit_reasons_known[n1].name;
	}

	return "Unknown";
}
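
/*
 * A sketch of the typical use of exit_reason_str(), i.e. making assertion
 * failures on unexpected exits easier to read:
 *
 *	TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_IO,
 *		    "Unexpected exit reason: %u (%s)",
 *		    vcpu->run->exit_reason,
 *		    exit_reason_str(vcpu->run->exit_reason));
 */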

/*
 * Physical Contiguous Page Allocator
 *
 * Input Args:
 *   vm - Virtual Machine
 *   num - number of pages
 *   paddr_min - Physical address minimum
 *   memslot - Memory region to allocate page from
 *   protected - True if the pages will be used as protected/private memory
 *
 * Output Args: None
 *
 * Return:
 *   Starting physical address
 *
 * Within the VM specified by vm, locates a range of available physical
 * pages at or above paddr_min. If found, the pages are marked as in use
 * and their base address is returned. If not enough contiguous pages are
 * available at or above paddr_min, the VM state is dumped and the test
 * aborts.
 */
vm_paddr_t __vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
				vm_paddr_t paddr_min, uint32_t memslot,
				bool protected)
{
	struct userspace_mem_region *region;
	sparsebit_idx_t pg, base;

	TEST_ASSERT(num > 0, "Must allocate at least one page");

	TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
		"not divisible by page size.\n"
		"  paddr_min: 0x%lx page_size: 0x%x",
		paddr_min, vm->page_size);

	region = memslot2region(vm, memslot);
	TEST_ASSERT(!protected || region->protected_phy_pages,
		    "Region doesn't support protected memory");

	base = pg = paddr_min >> vm->page_shift;
	do {
		for (; pg < base + num; ++pg) {
			if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
				base = pg = sparsebit_next_set(region->unused_phy_pages, pg);
				break;
			}
		}
	} while (pg && pg != base + num);

	if (pg == 0) {
		fprintf(stderr, "No guest physical page available, "
			"paddr_min: 0x%lx page_size: 0x%x memslot: %u\n",
			paddr_min, vm->page_size, memslot);
		fputs("---- vm dump ----\n", stderr);
		vm_dump(stderr, vm, 2);
		abort();
	}

	for (pg = base; pg < base + num; ++pg) {
		sparsebit_clear(region->unused_phy_pages, pg);
		if (protected)
			sparsebit_set(region->protected_phy_pages, pg);
	}

	return base * vm->page_size;
}

vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
			     uint32_t memslot)
{
	return vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
}

vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm)
{
	return vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
				 vm->memslots[MEM_REGION_PT]);
}
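
/*
 * A sketch of allocating and touching guest physical pages directly; memslot
 * 0 is assumed to be the default data slot set up by the library, and the
 * page count is arbitrary:
 *
 *	vm_paddr_t gpa = vm_phy_pages_alloc(vm, 4,
 *					    KVM_UTIL_MIN_PFN * vm->page_size, 0);
 *
 *	memset(addr_gpa2hva(vm, gpa), 0, 4 * vm->page_size);
 */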

/*
 * Address Guest Virtual to Host Virtual
 *
 * Input Args:
 *   vm - Virtual Machine
 *   gva - VM virtual address
 *
 * Output Args: None
 *
 * Return:
 *   Equivalent host virtual address
 */
void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
{
	return addr_gpa2hva(vm, addr_gva2gpa(vm, gva));
}

unsigned long __weak vm_compute_max_gfn(struct kvm_vm *vm)
{
	return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
}

static unsigned int vm_calc_num_pages(unsigned int num_pages,
				      unsigned int page_shift,
				      unsigned int new_page_shift,
				      bool ceil)
{
	unsigned int n;

	if (page_shift >= new_page_shift)
		return num_pages * (1 << (page_shift - new_page_shift));

	/*
	 * Compute the divisor only when new_page_shift is strictly greater
	 * than page_shift; computing it unconditionally would wrap the
	 * unsigned subtraction and yield an out-of-range shift count.
	 */
	n = 1 << (new_page_shift - page_shift);
	return num_pages / n + !!(ceil && num_pages % n);
}

static inline int getpageshift(void)
{
	return __builtin_ffs(getpagesize()) - 1;
}

unsigned int
vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
{
	return vm_calc_num_pages(num_guest_pages,
				 vm_guest_mode_params[mode].page_shift,
				 getpageshift(), true);
}

unsigned int
vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages)
{
	return vm_calc_num_pages(num_host_pages, getpageshift(),
				 vm_guest_mode_params[mode].page_shift, false);
}

unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size)
{
	unsigned int n;
	n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size);
	return vm_adjust_num_guest_pages(mode, n);
}
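
/*
 * A worked example of the conversions above, assuming 4KiB host pages and a
 * guest mode using 64KiB pages (page_shift of 16): vm_num_host_pages()
 * returns 16 host pages for a single guest page (1 << (16 - 12)), while
 * vm_num_guest_pages() rounds 17 host pages down to a single guest page
 * (17 / 16, with no rounding up).
 */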

/*
 * Read binary stats descriptors
 *
 * Input Args:
 *   stats_fd - the file descriptor for the binary stats file from which to read
 *   header - the binary stats metadata header corresponding to the given FD
 *
 * Output Args: None
 *
 * Return:
 *   A pointer to a newly allocated series of stat descriptors.
 *   Caller is responsible for freeing the returned kvm_stats_desc.
 *
 * Read the stats descriptors from the binary stats interface.
 */
struct kvm_stats_desc *read_stats_descriptors(int stats_fd,
					      struct kvm_stats_header *header)
{
	struct kvm_stats_desc *stats_desc;
	ssize_t desc_size, total_size, ret;

	desc_size = get_stats_descriptor_size(header);
	total_size = header->num_desc * desc_size;

	stats_desc = calloc(header->num_desc, desc_size);
	TEST_ASSERT(stats_desc, "Allocate memory for stats descriptors");

	ret = pread(stats_fd, stats_desc, total_size, header->desc_offset);
	TEST_ASSERT(ret == total_size, "Read KVM stats descriptors");

	return stats_desc;
}
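
/*
 * A sketch of the full binary stats flow; in practice the fd, header and
 * descriptors are cached per VM, as __vm_get_stat() below does:
 *
 *	struct kvm_stats_header header;
 *	struct kvm_stats_desc *desc;
 *	int stats_fd = vm_get_stats_fd(vm);
 *
 *	read_stats_header(stats_fd, &header);
 *	desc = read_stats_descriptors(stats_fd, &header);
 *	...
 *	free(desc);
 *	close(stats_fd);
 */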

/*
 * Read stat data for a particular stat
 *
 * Input Args:
 *   stats_fd - the file descriptor for the binary stats file from which to read
 *   header - the binary stats metadata header corresponding to the given FD
 *   desc - the binary stat metadata for the particular stat to be read
 *   max_elements - the maximum number of 8-byte values to read into data
 *
 * Output Args:
 *   data - the buffer into which stat data should be read
 *
 * Read the data values of a specified stat from the binary stats interface.
 */
void read_stat_data(int stats_fd, struct kvm_stats_header *header,
		    struct kvm_stats_desc *desc, uint64_t *data,
		    size_t max_elements)
{
	size_t nr_elements = min_t(ssize_t, desc->size, max_elements);
	size_t size = nr_elements * sizeof(*data);
	ssize_t ret;

	TEST_ASSERT(desc->size, "No elements in stat '%s'", desc->name);
	TEST_ASSERT(max_elements, "Zero elements requested for stat '%s'", desc->name);

	ret = pread(stats_fd, data, size,
		    header->data_offset + desc->offset);

	TEST_ASSERT(ret >= 0, "pread() failed on stat '%s', errno: %i (%s)",
		    desc->name, errno, strerror(errno));
	TEST_ASSERT(ret == size,
		    "pread() on stat '%s' read %ld bytes, wanted %lu bytes",
		    desc->name, ret, size);
}

/*
 * Read the data of the named stat
 *
 * Input Args:
 *   vm - the VM for which the stat should be read
 *   stat_name - the name of the stat to read
 *   max_elements - the maximum number of 8-byte values to read into data
 *
 * Output Args:
 *   data - the buffer into which stat data should be read
 *
 * Read the data values of a specified stat from the binary stats interface.
 */
void __vm_get_stat(struct kvm_vm *vm, const char *stat_name, uint64_t *data,
		   size_t max_elements)
{
	struct kvm_stats_desc *desc;
	size_t size_desc;
	int i;

	if (!vm->stats_fd) {
		vm->stats_fd = vm_get_stats_fd(vm);
		read_stats_header(vm->stats_fd, &vm->stats_header);
		vm->stats_desc = read_stats_descriptors(vm->stats_fd,
							&vm->stats_header);
	}

	size_desc = get_stats_descriptor_size(&vm->stats_header);

	for (i = 0; i < vm->stats_header.num_desc; ++i) {
		desc = (void *)vm->stats_desc + (i * size_desc);

		if (strcmp(desc->name, stat_name))
			continue;

		read_stat_data(vm->stats_fd, &vm->stats_header, desc,
			       data, max_elements);

		break;
	}
}
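
/*
 * A sketch of reading a single stat by name; "pages_4k" is just an example
 * of an x86 VM stat, and a single element is requested:
 *
 *	uint64_t pages = 0;
 *
 *	__vm_get_stat(vm, "pages_4k", &pages, 1);
 */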

__weak void kvm_arch_vm_post_create(struct kvm_vm *vm)
{
}

__weak void kvm_selftest_arch_init(void)
{
}

void __attribute((constructor)) kvm_selftest_init(void)
{
	/* Tell stdout not to buffer its content. */
	setbuf(stdout, NULL);

	guest_random_seed = random();

	kvm_selftest_arch_init();
}

bool vm_is_gpa_protected(struct kvm_vm *vm, vm_paddr_t paddr)
{
	sparsebit_idx_t pg = 0;
	struct userspace_mem_region *region;

	if (!vm_arch_has_protected_memory(vm))
		return false;

	region = userspace_mem_region_find(vm, paddr, paddr);
	TEST_ASSERT(region, "No vm physical memory at 0x%lx", paddr);

	pg = paddr >> vm->page_shift;
	return sparsebit_is_set(region->protected_phy_pages, pg);
}
