/*
 * Copyright (c) 2014 Roger Pau Monné <roger.pau@citrix.com>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bus.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/selinfo.h>
#include <sys/poll.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/rman.h>
#include <sys/tree.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/bitset.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>

#include <machine/md_var.h>

#include <xen/xen-os.h>
#include <xen/hypervisor.h>
#include <xen/privcmd.h>
#include <xen/error.h>

MALLOC_DEFINE(M_PRIVCMD, "privcmd_dev", "Xen privcmd user-space device");

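/*
 * Maximum number of buffers accepted in a single IOCTL_PRIVCMD_DM_OP
 * request; bounds the transient kernel allocations done on its behalf.
 */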
#define MAX_DMOP_BUFFERS 16

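/*
 * Per-mapping state, attached as the handle of the OBJT_MGTDEVICE object
 * handed out by mmap(2).  'phys_base_addr' is the start of the reserved
 * pseudo-physical range backing the mapping, 'mapped' is set once the
 * frames have actually been populated by an ioctl, and 'err' tracks
 * individual pages whose mapping failed.
 */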
struct privcmd_map {
	vm_object_t mem;
	vm_size_t size;
	struct resource *pseudo_phys_res;
	int pseudo_phys_res_id;
	vm_paddr_t phys_base_addr;
	boolean_t mapped;
	BITSET_DEFINE_VAR() *err;
};

static d_ioctl_t     privcmd_ioctl;
static d_open_t      privcmd_open;
static d_mmap_single_t	privcmd_mmap_single;

static struct cdevsw privcmd_devsw = {
	.d_version = D_VERSION,
	.d_ioctl = privcmd_ioctl,
	.d_mmap_single = privcmd_mmap_single,
	.d_open = privcmd_open,
	.d_name = "privcmd",
};

static int privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred, u_short *color);
static void privcmd_pg_dtor(void *handle);
static int privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset,
    int prot, vm_page_t *mres);

static struct cdev_pager_ops privcmd_pg_ops = {
	.cdev_pg_fault = privcmd_pg_fault,
	.cdev_pg_ctor = privcmd_pg_ctor,
	.cdev_pg_dtor = privcmd_pg_dtor,
};

struct per_user_data {
	domid_t dom;
};

static device_t privcmd_dev = NULL;

/*------------------------- Privcmd Pager functions --------------------------*/
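/*
 * No per-mapping setup is needed at pager creation time; the map is fully
 * initialized in privcmd_mmap_single() before the pager is allocated.
 */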
static int
privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred, u_short *color)
{

	return (0);
}

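/*
 * Tear down a mapping: release the vm pages, drop the p2m entries that
 * back them and return the pseudo-physical range to the allocator.
 */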
static void
privcmd_pg_dtor(void *handle)
{
	struct xen_remove_from_physmap rm = { .domid = DOMID_SELF };
	struct privcmd_map *map = handle;
	int error;
	vm_size_t i;
	vm_page_t m;

	/*
	 * Remove the mappings from the used pages. This will remove the
	 * underlying p2m bindings in Xen second stage translation.
	 */
	if (map->mapped == true) {
		VM_OBJECT_WLOCK(map->mem);
retry:
		for (i = 0; i < map->size; i++) {
			m = vm_page_lookup(map->mem, i);
			if (m == NULL)
				continue;
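			/*
			 * VM_ALLOC_WAITFAIL sleeps and drops the object lock
			 * on contention, so restart the scan from the top if
			 * the page could not be busied.
			 */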
			if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0)
				goto retry;
			cdev_pager_free_page(map->mem, m);
		}
		VM_OBJECT_WUNLOCK(map->mem);

		for (i = 0; i < map->size; i++) {
			rm.gpfn = atop(map->phys_base_addr) + i;
			HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &rm);
		}
		free(map->err, M_PRIVCMD);
	}

	error = xenmem_free(privcmd_dev, map->pseudo_phys_res_id,
	    map->pseudo_phys_res);
	KASSERT(error == 0, ("Unable to release memory resource: %d", error));

	free(map, M_PRIVCMD);
}

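/*
 * Fault handler: hand out the fictitious page that backs the faulting
 * offset within the reserved pseudo-physical range.
 */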
static int
privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset,
    int prot, vm_page_t *mres)
{
	struct privcmd_map *map = object->handle;
	vm_pindex_t pidx;
	vm_page_t page;

	if (map->mapped != true)
		return (VM_PAGER_FAIL);

	pidx = OFF_TO_IDX(offset);
	if (pidx >= map->size || BIT_ISSET(map->size, pidx, map->err))
		return (VM_PAGER_FAIL);

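	/* Translate the offset into the backing pseudo-physical page. */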
	page = PHYS_TO_VM_PAGE(map->phys_base_addr + offset);
	if (page == NULL)
		return (VM_PAGER_FAIL);

	KASSERT((page->flags & PG_FICTITIOUS) != 0,
	    ("not fictitious %p", page));
	KASSERT(vm_page_wired(page), ("page %p not wired", page));
	KASSERT(!vm_page_busied(page), ("page %p is busy", page));

	vm_page_busy_acquire(page, 0);
	vm_page_valid(page);

	if (*mres != NULL)
		vm_page_replace(page, object, pidx, *mres);
	else
		vm_page_insert(page, object, pidx);
	*mres = page;
	return (VM_PAGER_OK);
}

/*----------------------- Privcmd char device methods ------------------------*/
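/*
 * mmap(2) on the device only reserves a pseudo-physical address range and
 * the object that will back it; the actual frames are mapped later via
 * IOCTL_PRIVCMD_MMAPBATCH or IOCTL_PRIVCMD_MMAP_RESOURCE.
 */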
static int
privcmd_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size,
    vm_object_t *object, int nprot)
{
	struct privcmd_map *map;

	map = malloc(sizeof(*map), M_PRIVCMD, M_WAITOK | M_ZERO);

	map->size = OFF_TO_IDX(size);
	map->pseudo_phys_res_id = 0;

	map->pseudo_phys_res = xenmem_alloc(privcmd_dev,
	    &map->pseudo_phys_res_id, size);
	if (map->pseudo_phys_res == NULL) {
		free(map, M_PRIVCMD);
		return (ENOMEM);
	}

	map->phys_base_addr = rman_get_start(map->pseudo_phys_res);
	map->mem = cdev_pager_allocate(map, OBJT_MGTDEVICE, &privcmd_pg_ops,
	    size, nprot, *offset, NULL);
	if (map->mem == NULL) {
		xenmem_free(privcmd_dev, map->pseudo_phys_res_id,
		    map->pseudo_phys_res);
		free(map, M_PRIVCMD);
		return (ENOMEM);
	}

	*object = map->mem;

	return (0);
}

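/*
 * Find the privcmd map that backs a user-space range, checking that the
 * range is page-aligned and exactly covers a single map entry created by
 * privcmd_mmap_single().  Also allocates the per-page error bitset.
 */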
static struct privcmd_map *
setup_virtual_area(struct thread *td, unsigned long addr, unsigned long num)
{
	vm_map_t map;
	vm_map_entry_t entry;
	vm_object_t mem;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;
	struct privcmd_map *umap;
	int error;

	if ((num == 0) || ((addr & PAGE_MASK) != 0))
		return (NULL);

	map = &td->td_proc->p_vmspace->vm_map;
	error = vm_map_lookup(&map, addr, VM_PROT_NONE, &entry, &mem, &pindex,
	    &prot, &wired);
	if (error != KERN_SUCCESS)
		return (NULL);
	if ((entry->start != addr) ||
	    (entry->end != addr + (num * PAGE_SIZE))) {
		/* Drop the map lock taken by the successful lookup. */
		vm_map_lookup_done(map, entry);
		return (NULL);
	}

	vm_map_lookup_done(map, entry);
	if ((mem->type != OBJT_MGTDEVICE) ||
	    (mem->un_pager.devp.ops != &privcmd_pg_ops))
		return (NULL);

	umap = mem->handle;
	/* Allocate a bitset to store broken page mappings. */
	umap->err = BITSET_ALLOC(num, M_PRIVCMD, M_WAITOK | M_ZERO);

	return (umap);
}

static int
privcmd_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg,
	      int mode, struct thread *td)
{
	int error;
	unsigned int i;
	void *data;
	const struct per_user_data *u;

	error = devfs_get_cdevpriv(&data);
	if (error != 0)
		return (EINVAL);
	/*
	 * Constify user-data to prevent unintended changes to the restriction
	 * limits.
	 */
	u = data;

	switch (cmd) {
	case IOCTL_PRIVCMD_HYPERCALL: {
		struct ioctl_privcmd_hypercall *hcall;

		hcall = (struct ioctl_privcmd_hypercall *)arg;

		/* Forbid hypercalls if restricted. */
		if (u->dom != DOMID_INVALID) {
			error = EPERM;
			break;
		}

#ifdef __amd64__
		/*
		 * The hypervisor page table walker will refuse to access
		 * user-space pages if SMAP is enabled, so temporarily disable
		 * it while performing the hypercall.
		 */
		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
			stac();
#endif
		error = privcmd_hypercall(hcall->op, hcall->arg[0],
		    hcall->arg[1], hcall->arg[2], hcall->arg[3], hcall->arg[4]);
#ifdef __amd64__
		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
			clac();
#endif
		if (error >= 0) {
			hcall->retval = error;
			error = 0;
		} else {
			error = xen_translate_error(error);
			hcall->retval = 0;
		}
		break;
	}
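	/*
	 * Populate a previously mmap(2)ed region with frames from a
	 * foreign domain, reporting per-frame errors back to user-space.
	 */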
	case IOCTL_PRIVCMD_MMAPBATCH: {
		struct ioctl_privcmd_mmapbatch *mmap;
		struct xen_add_to_physmap_range add;
		xen_ulong_t *idxs;
		xen_pfn_t *gpfns;
		int *errs;
		unsigned int index;
		struct privcmd_map *umap;
		uint16_t num;

		mmap = (struct ioctl_privcmd_mmapbatch *)arg;

		if (u->dom != DOMID_INVALID && u->dom != mmap->dom) {
			error = EPERM;
			break;
		}

		umap = setup_virtual_area(td, mmap->addr, mmap->num);
		if (umap == NULL) {
			error = EINVAL;
			break;
		}

		add.domid = DOMID_SELF;
		add.space = XENMAPSPACE_gmfn_foreign;
		add.foreign_domid = mmap->dom;

		/*
		 * The 'size' field in the xen_add_to_physmap_range only
		 * allows for UINT16_MAX mappings in a single hypercall.
		 */
		num = MIN(mmap->num, UINT16_MAX);

		idxs = malloc(sizeof(*idxs) * num, M_PRIVCMD, M_WAITOK);
		gpfns = malloc(sizeof(*gpfns) * num, M_PRIVCMD, M_WAITOK);
		errs = malloc(sizeof(*errs) * num, M_PRIVCMD, M_WAITOK);

		set_xen_guest_handle(add.idxs, idxs);
		set_xen_guest_handle(add.gpfns, gpfns);
		set_xen_guest_handle(add.errs, errs);

		for (index = 0; index < mmap->num; index += num) {
			num = MIN(mmap->num - index, UINT16_MAX);
			add.size = num;

			error = copyin(&mmap->arr[index], idxs,
			    sizeof(idxs[0]) * num);
			if (error != 0)
				goto mmap_out;

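			/*
			 * Destination frames are consecutive in the
			 * pseudo-physical range reserved at mmap time.
			 */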
			for (i = 0; i < num; i++)
				gpfns[i] = atop(umap->phys_base_addr +
				    (i + index) * PAGE_SIZE);

			bzero(errs, sizeof(*errs) * num);

			error = HYPERVISOR_memory_op(
			    XENMEM_add_to_physmap_range, &add);
			if (error != 0) {
				error = xen_translate_error(error);
				goto mmap_out;
			}

			for (i = 0; i < num; i++) {
				if (errs[i] != 0) {
					errs[i] = xen_translate_error(errs[i]);

					/* Mark the page as invalid. */
					BIT_SET(mmap->num, index + i,
					    umap->err);
				}
			}

			error = copyout(errs, &mmap->err[index],
			    sizeof(errs[0]) * num);
			if (error != 0)
				goto mmap_out;
		}

		umap->mapped = true;

mmap_out:
		free(idxs, M_PRIVCMD);
		free(gpfns, M_PRIVCMD);
		free(errs, M_PRIVCMD);
		if (!umap->mapped)
			free(umap->err, M_PRIVCMD);

		break;
	}
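	/*
	 * Map a Xen resource (e.g. grant table or ioreq server frames)
	 * belonging to a foreign domain into a previously mmap(2)ed
	 * region.
	 */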
	case IOCTL_PRIVCMD_MMAP_RESOURCE: {
		struct ioctl_privcmd_mmapresource *mmap;
		struct xen_mem_acquire_resource adq;
		xen_pfn_t *gpfns;
		struct privcmd_map *umap;

		mmap = (struct ioctl_privcmd_mmapresource *)arg;

		if (u->dom != DOMID_INVALID && u->dom != mmap->dom) {
			error = EPERM;
			break;
		}

		bzero(&adq, sizeof(adq));

		adq.domid = mmap->dom;
		adq.type = mmap->type;
		adq.id = mmap->id;

		/* Shortcut for getting the resource size. */
		if (mmap->addr == 0 && mmap->num == 0) {
			error = HYPERVISOR_memory_op(XENMEM_acquire_resource,
			    &adq);
			if (error != 0) {
				error = xen_translate_error(error);
				break;
			}
			error = copyout(&adq.nr_frames, &mmap->num,
			    sizeof(mmap->num));
			break;
		}

		umap = setup_virtual_area(td, mmap->addr, mmap->num);
		if (umap == NULL) {
			error = EINVAL;
			break;
		}

		adq.nr_frames = mmap->num;
		adq.frame = mmap->idx;

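		/* The resource is placed contiguously in the reserved range. */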
		gpfns = malloc(sizeof(*gpfns) * mmap->num, M_PRIVCMD, M_WAITOK);
		for (i = 0; i < mmap->num; i++)
			gpfns[i] = atop(umap->phys_base_addr) + i;
		set_xen_guest_handle(adq.frame_list, gpfns);

		error = HYPERVISOR_memory_op(XENMEM_acquire_resource, &adq);
		if (error != 0)
			error = xen_translate_error(error);
		else
			umap->mapped = true;

		free(gpfns, M_PRIVCMD);
		if (!umap->mapped)
			free(umap->err, M_PRIVCMD);

		break;
	}
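	/*
	 * Issue a device model operation against a (possibly restricted)
	 * domain, as used by emulators like QEMU.
	 */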
	case IOCTL_PRIVCMD_DM_OP: {
		const struct ioctl_privcmd_dmop *dmop;
		struct privcmd_dmop_buf *bufs;
		struct xen_dm_op_buf *hbufs;

		dmop = (struct ioctl_privcmd_dmop *)arg;

		if (u->dom != DOMID_INVALID && u->dom != dmop->dom) {
			error = EPERM;
			break;
		}

		if (dmop->num == 0)
			break;

		if (dmop->num > MAX_DMOP_BUFFERS) {
			error = E2BIG;
			break;
		}

		bufs = malloc(sizeof(*bufs) * dmop->num, M_PRIVCMD, M_WAITOK);

		error = copyin(dmop->ubufs, bufs, sizeof(*bufs) * dmop->num);
		if (error != 0) {
			free(bufs, M_PRIVCMD);
			break;
		}

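		/* Wrap the user buffers in hypervisor-visible handles. */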
		hbufs = malloc(sizeof(*hbufs) * dmop->num, M_PRIVCMD, M_WAITOK);
		for (i = 0; i < dmop->num; i++) {
			set_xen_guest_handle(hbufs[i].h, bufs[i].uptr);
			hbufs[i].size = bufs[i].size;
		}

#ifdef __amd64__
		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
			stac();
#endif
		error = HYPERVISOR_dm_op(dmop->dom, dmop->num, hbufs);
#ifdef __amd64__
		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
			clac();
#endif
		if (error != 0)
			error = xen_translate_error(error);

		free(bufs, M_PRIVCMD);
		free(hbufs, M_PRIVCMD);

		break;
	}
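	/*
	 * Restrict further operations on this file descriptor to a single
	 * domain.  The restriction is sticky: it can be repeated with the
	 * same domid but never lifted or changed.
	 */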
	case IOCTL_PRIVCMD_RESTRICT: {
		struct per_user_data *u;
		domid_t dom;

		dom = *(domid_t *)arg;

		error = devfs_get_cdevpriv((void **)&u);
		if (error != 0)
			break;

		if (u->dom != DOMID_INVALID && u->dom != dom) {
			error = EINVAL;
			break;
		}
		u->dom = dom;

		break;
	}
	default:
		error = ENOSYS;
		break;
	}

	return (error);
}
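
/*
 * Illustrative user-space sketch (not part of this driver): restrict a
 * descriptor to one domain and issue a dm_op through it.  'nbufs' and
 * 'bufs' are placeholders; 'nbufs' must not exceed MAX_DMOP_BUFFERS and
 * 'bufs' points to an array of struct privcmd_dmop_buf.  Error handling
 * is omitted.
 *
 *	int fd = open("/dev/xen/privcmd", O_RDWR);
 *	domid_t dom = 1;
 *	ioctl(fd, IOCTL_PRIVCMD_RESTRICT, &dom);
 *	struct ioctl_privcmd_dmop dmop = {
 *		.dom = dom, .num = nbufs, .ubufs = bufs,
 *	};
 *	ioctl(fd, IOCTL_PRIVCMD_DM_OP, &dmop);
 */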

static void
user_release(void *arg)
{

	free(arg, M_PRIVCMD);
}

static int
privcmd_open(struct cdev *dev, int flag, int otyp, struct thread *td)
{
	struct per_user_data *u;
	int error;

	u = malloc(sizeof(*u), M_PRIVCMD, M_WAITOK);
	u->dom = DOMID_INVALID;

	/* Assign the allocated per_user_data to this open instance. */
	error = devfs_set_cdevpriv(u, user_release);
	if (error != 0)
		free(u, M_PRIVCMD);

	return (error);
}

/*------------------ Private Device Attachment Functions  --------------------*/
static void
privcmd_identify(driver_t *driver, device_t parent)
{

	KASSERT(xen_domain(),
	    ("Trying to attach privcmd device on non Xen domain"));

	if (BUS_ADD_CHILD(parent, 0, "privcmd", 0) == NULL)
		panic("unable to attach privcmd user-space device");
}

static int
privcmd_probe(device_t dev)
{

	privcmd_dev = dev;
	device_set_desc(dev, "Xen privileged interface user-space device");
	return (BUS_PROBE_NOWILDCARD);
}

static int
privcmd_attach(device_t dev)
{

	make_dev_credf(MAKEDEV_ETERNAL, &privcmd_devsw, 0, NULL, UID_ROOT,
	    GID_WHEEL, 0600, "xen/privcmd");
	return (0);
}

/*-------------------- Private Device Attachment Data  -----------------------*/
static device_method_t privcmd_methods[] = {
	DEVMETHOD(device_identify,	privcmd_identify),
	DEVMETHOD(device_probe,		privcmd_probe),
	DEVMETHOD(device_attach,	privcmd_attach),

	DEVMETHOD_END
};

static driver_t privcmd_driver = {
	"privcmd",
	privcmd_methods,
	0,
};

devclass_t privcmd_devclass;

DRIVER_MODULE(privcmd, xenpv, privcmd_driver, privcmd_devclass, 0, 0);
MODULE_DEPEND(privcmd, xenpv, 1, 1, 1);