1/*
 * Copyright (c) 2014 Roger Pau Monné <roger.pau@citrix.com>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/uio.h>
30#include <sys/bus.h>
31#include <sys/malloc.h>
32#include <sys/kernel.h>
33#include <sys/lock.h>
34#include <sys/mutex.h>
35#include <sys/rwlock.h>
36#include <sys/selinfo.h>
37#include <sys/poll.h>
38#include <sys/conf.h>
39#include <sys/fcntl.h>
40#include <sys/ioccom.h>
41#include <sys/rman.h>
42#include <sys/tree.h>
43#include <sys/module.h>
44#include <sys/proc.h>
45#include <sys/bitset.h>
46
47#include <vm/vm.h>
48#include <vm/vm_param.h>
49#include <vm/vm_extern.h>
50#include <vm/vm_kern.h>
51#include <vm/vm_page.h>
52#include <vm/vm_map.h>
53#include <vm/vm_object.h>
54#include <vm/vm_pager.h>
55
56#include <machine/md_var.h>
57
58#include <xen/xen-os.h>
59#include <xen/hypervisor.h>
60#include <xen/privcmd.h>
61#include <xen/error.h>
62
/* malloc(9) type used for the allocations made in this file. */
MALLOC_DEFINE(M_PRIVCMD, "privcmd_dev", "Xen privcmd user-space device");

/* Largest buffer count accepted by IOCTL_PRIVCMD_DM_OP (E2BIG above it). */
#define MAX_DMOP_BUFFERS 16
66
/*
 * State attached to every privcmd mmap()ed region.  It is allocated in
 * privcmd_mmap_single(), used as the handle of the device pager object and
 * released in privcmd_pg_dtor().
 */
struct privcmd_map {
	/* VM object returned by cdev_pager_allocate() for this region. */
	vm_object_t mem;
	/* Length of the region in pages (OFF_TO_IDX() of the mmap size). */
	vm_size_t size;
	/* Resource obtained from xenmem_alloc() that backs the region. */
	struct resource *pseudo_phys_res;
	/* Resource id handed to xenmem_alloc() and xenmem_free(). */
	int pseudo_phys_res_id;
	/* Start address of pseudo_phys_res, as reported by rman_get_start(). */
	vm_paddr_t phys_base_addr;
	/* Set to true once a mapping ioctl has populated the region. */
	boolean_t mapped;
	/*
	 * Bitset with one bit per page, allocated by setup_virtual_area().
	 * A set bit marks a page whose mapping failed; faults on such pages
	 * are refused by privcmd_pg_fault().
	 */
	BITSET_DEFINE_VAR() *err;
};
76
/* Character device entry points implemented further down in this file. */
static d_ioctl_t     privcmd_ioctl;
static d_open_t      privcmd_open;
static d_mmap_single_t	privcmd_mmap_single;

/* Switch table for the character device created in privcmd_attach(). */
static struct cdevsw privcmd_devsw = {
	.d_version = D_VERSION,
	.d_ioctl = privcmd_ioctl,
	.d_mmap_single = privcmd_mmap_single,
	.d_open = privcmd_open,
	.d_name = "privcmd",
};
88
/* Device pager callbacks implemented further down in this file. */
static int privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred, u_short *color);
static void privcmd_pg_dtor(void *handle);
static int privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset,
    int prot, vm_page_t *mres);

/*
 * Pager operations passed to cdev_pager_allocate() in privcmd_mmap_single().
 * setup_virtual_area() also compares an object's ops pointer against this
 * table to recognize regions created by this driver.
 */
static struct cdev_pager_ops privcmd_pg_ops = {
	.cdev_pg_fault = privcmd_pg_fault,
	.cdev_pg_ctor =	privcmd_pg_ctor,
	.cdev_pg_dtor =	privcmd_pg_dtor,
};
100
/*
 * Per-open state stored as cdevpriv data by privcmd_open().  'dom' starts as
 * DOMID_INVALID (unrestricted); IOCTL_PRIVCMD_RESTRICT sets it to the only
 * domain the descriptor may subsequently operate on.
 */
struct per_user_data {
	domid_t dom;
};

/* Device recorded by privcmd_probe(); used for xenmem_alloc()/xenmem_free(). */
static device_t privcmd_dev = NULL;
106
107/*------------------------- Privcmd Pager functions --------------------------*/
108static int
109privcmd_pg_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
110    vm_ooffset_t foff, struct ucred *cred, u_short *color)
111{
112
113	return (0);
114}
115
116static void
117privcmd_pg_dtor(void *handle)
118{
119	struct xen_remove_from_physmap rm = { .domid = DOMID_SELF };
120	struct privcmd_map *map = handle;
121	int error __diagused;
122	vm_size_t i;
123	vm_page_t m;
124
125	/*
126	 * Remove the mappings from the used pages. This will remove the
127	 * underlying p2m bindings in Xen second stage translation.
128	 */
129	if (map->mapped == true) {
130		VM_OBJECT_WLOCK(map->mem);
131retry:
132		for (i = 0; i < map->size; i++) {
133			m = vm_page_lookup(map->mem, i);
134			if (m == NULL)
135				continue;
136			if (vm_page_busy_acquire(m, VM_ALLOC_WAITFAIL) == 0)
137				goto retry;
138			cdev_pager_free_page(map->mem, m);
139		}
140		VM_OBJECT_WUNLOCK(map->mem);
141
142		for (i = 0; i < map->size; i++) {
143			rm.gpfn = atop(map->phys_base_addr) + i;
144			HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &rm);
145		}
146		free(map->err, M_PRIVCMD);
147	}
148
149	error = xenmem_free(privcmd_dev, map->pseudo_phys_res_id,
150	    map->pseudo_phys_res);
151	KASSERT(error == 0, ("Unable to release memory resource: %d", error));
152
153	free(map, M_PRIVCMD);
154}
155
156static int
157privcmd_pg_fault(vm_object_t object, vm_ooffset_t offset,
158    int prot, vm_page_t *mres)
159{
160	struct privcmd_map *map = object->handle;
161	vm_pindex_t pidx;
162	vm_page_t page;
163
164	if (map->mapped != true)
165		return (VM_PAGER_FAIL);
166
167	pidx = OFF_TO_IDX(offset);
168	if (pidx >= map->size || BIT_ISSET(map->size, pidx, map->err))
169		return (VM_PAGER_FAIL);
170
171	page = PHYS_TO_VM_PAGE(map->phys_base_addr + offset);
172	if (page == NULL)
173		return (VM_PAGER_FAIL);
174
175	KASSERT((page->flags & PG_FICTITIOUS) != 0,
176	    ("not fictitious %p", page));
177	KASSERT(vm_page_wired(page), ("page %p not wired", page));
178	KASSERT(!vm_page_busied(page), ("page %p is busy", page));
179
180	vm_page_busy_acquire(page, 0);
181	vm_page_valid(page);
182
183	if (*mres != NULL)
184		vm_page_replace(page, object, pidx, *mres);
185	else
186		vm_page_insert(page, object, pidx);
187	*mres = page;
188	return (VM_PAGER_OK);
189}
190
191/*----------------------- Privcmd char device methods ------------------------*/
192static int
193privcmd_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t size,
194    vm_object_t *object, int nprot)
195{
196	struct privcmd_map *map;
197
198	map = malloc(sizeof(*map), M_PRIVCMD, M_WAITOK | M_ZERO);
199
200	map->size = OFF_TO_IDX(size);
201	map->pseudo_phys_res_id = 0;
202
203	map->pseudo_phys_res = xenmem_alloc(privcmd_dev,
204	    &map->pseudo_phys_res_id, size);
205	if (map->pseudo_phys_res == NULL) {
206		free(map, M_PRIVCMD);
207		return (ENOMEM);
208	}
209
210	map->phys_base_addr = rman_get_start(map->pseudo_phys_res);
211	map->mem = cdev_pager_allocate(map, OBJT_MGTDEVICE, &privcmd_pg_ops,
212	    size, nprot, *offset, NULL);
213	if (map->mem == NULL) {
214		xenmem_free(privcmd_dev, map->pseudo_phys_res_id,
215		    map->pseudo_phys_res);
216		free(map, M_PRIVCMD);
217		return (ENOMEM);
218	}
219
220	*object = map->mem;
221
222	return (0);
223}
224
225static struct privcmd_map *
226setup_virtual_area(struct thread *td, unsigned long addr, unsigned long num)
227{
228	vm_map_t map;
229	vm_map_entry_t entry;
230	vm_object_t mem;
231	vm_pindex_t pindex;
232	vm_prot_t prot;
233	boolean_t wired;
234	struct privcmd_map *umap;
235	int error;
236
237	if ((num == 0) || ((addr & PAGE_MASK) != 0))
238		return NULL;
239
240	map = &td->td_proc->p_vmspace->vm_map;
241	error = vm_map_lookup(&map, addr, VM_PROT_NONE, &entry, &mem, &pindex,
242	    &prot, &wired);
243	if (error != KERN_SUCCESS || (entry->start != addr) ||
244	    (entry->end != addr + (num * PAGE_SIZE)))
245		return NULL;
246
247	vm_map_lookup_done(map, entry);
248	if ((mem->type != OBJT_MGTDEVICE) ||
249	    (mem->un_pager.devp.ops != &privcmd_pg_ops))
250		return NULL;
251
252	umap = mem->handle;
253	/* Allocate a bitset to store broken page mappings. */
254	umap->err = BITSET_ALLOC(num, M_PRIVCMD, M_WAITOK | M_ZERO);
255
256	return umap;
257}
258
259static int
260privcmd_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg,
261	      int mode, struct thread *td)
262{
263	int error;
264	unsigned int i;
265	void *data;
266	const struct per_user_data *u;
267
268	error = devfs_get_cdevpriv(&data);
269	if (error != 0)
270		return (EINVAL);
271	/*
272	 * Constify user-data to prevent unintended changes to the restriction
273	 * limits.
274	 */
275	u = data;
276
277	switch (cmd) {
278	case IOCTL_PRIVCMD_HYPERCALL: {
279		struct ioctl_privcmd_hypercall *hcall;
280
281		hcall = (struct ioctl_privcmd_hypercall *)arg;
282
283		/* Forbid hypercalls if restricted. */
284		if (u->dom != DOMID_INVALID) {
285			error = EPERM;
286			break;
287		}
288
289#ifdef __amd64__
290		/*
291		 * The hypervisor page table walker will refuse to access
292		 * user-space pages if SMAP is enabled, so temporary disable it
293		 * while performing the hypercall.
294		 */
295		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
296			stac();
297#endif
298		error = privcmd_hypercall(hcall->op, hcall->arg[0],
299		    hcall->arg[1], hcall->arg[2], hcall->arg[3], hcall->arg[4]);
300#ifdef __amd64__
301		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
302			clac();
303#endif
304		if (error >= 0) {
305			hcall->retval = error;
306			error = 0;
307		} else {
308			error = xen_translate_error(error);
309			hcall->retval = 0;
310		}
311		break;
312	}
313	case IOCTL_PRIVCMD_MMAPBATCH: {
314		struct ioctl_privcmd_mmapbatch *mmap;
315		struct xen_add_to_physmap_batch add;
316		xen_ulong_t *idxs;
317		xen_pfn_t *gpfns;
318		int *errs;
319		unsigned int index;
320		struct privcmd_map *umap;
321		uint16_t num;
322
323		mmap = (struct ioctl_privcmd_mmapbatch *)arg;
324
325		if (u->dom != DOMID_INVALID && u->dom != mmap->dom) {
326			error = EPERM;
327			break;
328		}
329
330		umap = setup_virtual_area(td, mmap->addr, mmap->num);
331		if (umap == NULL) {
332			error = EINVAL;
333			break;
334		}
335
336		add.domid = DOMID_SELF;
337		add.space = XENMAPSPACE_gmfn_foreign;
338		add.u.foreign_domid = mmap->dom;
339
340		/*
341		 * The 'size' field in the xen_add_to_physmap_range only
342		 * allows for UINT16_MAX mappings in a single hypercall.
343		 */
344		num = MIN(mmap->num, UINT16_MAX);
345
346		idxs = malloc(sizeof(*idxs) * num, M_PRIVCMD, M_WAITOK);
347		gpfns = malloc(sizeof(*gpfns) * num, M_PRIVCMD, M_WAITOK);
348		errs = malloc(sizeof(*errs) * num, M_PRIVCMD, M_WAITOK);
349
350		set_xen_guest_handle(add.idxs, idxs);
351		set_xen_guest_handle(add.gpfns, gpfns);
352		set_xen_guest_handle(add.errs, errs);
353
354		for (index = 0; index < mmap->num; index += num) {
355			num = MIN(mmap->num - index, UINT16_MAX);
356			add.size = num;
357
358			error = copyin(&mmap->arr[index], idxs,
359			    sizeof(idxs[0]) * num);
360			if (error != 0)
361				goto mmap_out;
362
363			for (i = 0; i < num; i++)
364				gpfns[i] = atop(umap->phys_base_addr +
365				    (i + index) * PAGE_SIZE);
366
367			bzero(errs, sizeof(*errs) * num);
368
369			error = HYPERVISOR_memory_op(
370			    XENMEM_add_to_physmap_batch, &add);
371			if (error != 0) {
372				error = xen_translate_error(error);
373				goto mmap_out;
374			}
375
376			for (i = 0; i < num; i++) {
377				if (errs[i] != 0) {
378					errs[i] = xen_translate_error(errs[i]);
379
380					/* Mark the page as invalid. */
381					BIT_SET(mmap->num, index + i,
382					    umap->err);
383				}
384			}
385
386			error = copyout(errs, &mmap->err[index],
387			    sizeof(errs[0]) * num);
388			if (error != 0)
389				goto mmap_out;
390		}
391
392		umap->mapped = true;
393
394mmap_out:
395		free(idxs, M_PRIVCMD);
396		free(gpfns, M_PRIVCMD);
397		free(errs, M_PRIVCMD);
398		if (!umap->mapped)
399			free(umap->err, M_PRIVCMD);
400
401		break;
402	}
403	case IOCTL_PRIVCMD_MMAP_RESOURCE: {
404		struct ioctl_privcmd_mmapresource *mmap;
405		struct xen_mem_acquire_resource adq;
406		xen_pfn_t *gpfns;
407		struct privcmd_map *umap;
408
409		mmap = (struct ioctl_privcmd_mmapresource *)arg;
410
411		if (u->dom != DOMID_INVALID && u->dom != mmap->dom) {
412			error = EPERM;
413			break;
414		}
415
416		bzero(&adq, sizeof(adq));
417
418		adq.domid = mmap->dom;
419		adq.type = mmap->type;
420		adq.id = mmap->id;
421
422		/* Shortcut for getting the resource size. */
423		if (mmap->addr == 0 && mmap->num == 0) {
424			error = HYPERVISOR_memory_op(XENMEM_acquire_resource,
425			    &adq);
426			if (error != 0)
427				error = xen_translate_error(error);
428			else
429				mmap->num = adq.nr_frames;
430			break;
431		}
432
433		umap = setup_virtual_area(td, mmap->addr, mmap->num);
434		if (umap == NULL) {
435			error = EINVAL;
436			break;
437		}
438
439		adq.nr_frames = mmap->num;
440		adq.frame = mmap->idx;
441
442		gpfns = malloc(sizeof(*gpfns) * mmap->num, M_PRIVCMD, M_WAITOK);
443		for (i = 0; i < mmap->num; i++)
444			gpfns[i] = atop(umap->phys_base_addr) + i;
445		set_xen_guest_handle(adq.frame_list, gpfns);
446
447		error = HYPERVISOR_memory_op(XENMEM_acquire_resource, &adq);
448		if (error != 0)
449			error = xen_translate_error(error);
450		else
451			umap->mapped = true;
452
453		free(gpfns, M_PRIVCMD);
454		if (!umap->mapped)
455			free(umap->err, M_PRIVCMD);
456
457		break;
458	}
459	case IOCTL_PRIVCMD_DM_OP: {
460		const struct ioctl_privcmd_dmop *dmop;
461		struct privcmd_dmop_buf *bufs;
462		struct xen_dm_op_buf *hbufs;
463
464		dmop = (struct ioctl_privcmd_dmop *)arg;
465
466		if (u->dom != DOMID_INVALID && u->dom != dmop->dom) {
467			error = EPERM;
468			break;
469		}
470
471		if (dmop->num == 0)
472			break;
473
474		if (dmop->num > MAX_DMOP_BUFFERS) {
475			error = E2BIG;
476			break;
477		}
478
479		bufs = malloc(sizeof(*bufs) * dmop->num, M_PRIVCMD, M_WAITOK);
480
481		error = copyin(dmop->ubufs, bufs, sizeof(*bufs) * dmop->num);
482		if (error != 0) {
483			free(bufs, M_PRIVCMD);
484			break;
485		}
486
487		hbufs = malloc(sizeof(*hbufs) * dmop->num, M_PRIVCMD, M_WAITOK);
488		for (i = 0; i < dmop->num; i++) {
489			set_xen_guest_handle(hbufs[i].h, bufs[i].uptr);
490			hbufs[i].size = bufs[i].size;
491		}
492
493#ifdef __amd64__
494		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
495			stac();
496#endif
497		error = HYPERVISOR_dm_op(dmop->dom, dmop->num, hbufs);
498#ifdef __amd64__
499		if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
500			clac();
501#endif
502		if (error != 0)
503			error = xen_translate_error(error);
504
505		free(bufs, M_PRIVCMD);
506		free(hbufs, M_PRIVCMD);
507
508
509		break;
510	}
511	case IOCTL_PRIVCMD_RESTRICT: {
512		struct per_user_data *u;
513		domid_t dom;
514
515		dom = *(domid_t *)arg;
516
517		error = devfs_get_cdevpriv((void **)&u);
518		if (error != 0)
519			break;
520
521		if (u->dom != DOMID_INVALID && u->dom != dom) {
522			error = -EINVAL;
523			break;
524		}
525		u->dom = dom;
526
527		break;
528	}
529	default:
530		error = ENOSYS;
531		break;
532	}
533
534	return (error);
535}
536
537static void
538user_release(void *arg)
539{
540
541	free(arg, M_PRIVCMD);
542}
543
544static int
545privcmd_open(struct cdev *dev, int flag, int otyp, struct thread *td)
546{
547	struct per_user_data *u;
548	int error;
549
550	u = malloc(sizeof(*u), M_PRIVCMD, M_WAITOK);
551	u->dom = DOMID_INVALID;
552
553	/* Assign the allocated per_user_data to this open instance. */
554	error = devfs_set_cdevpriv(u, user_release);
555	if (error != 0) {
556		free(u, M_PRIVCMD);
557	}
558
559	return (error);
560}
561
562/*------------------ Private Device Attachment Functions  --------------------*/
563static void
564privcmd_identify(driver_t *driver, device_t parent)
565{
566
567	KASSERT(xen_domain(),
568	    ("Trying to attach privcmd device on non Xen domain"));
569
570	if (BUS_ADD_CHILD(parent, 0, "privcmd", 0) == NULL)
571		panic("unable to attach privcmd user-space device");
572}
573
574static int
575privcmd_probe(device_t dev)
576{
577
578	privcmd_dev = dev;
579	device_set_desc(dev, "Xen privileged interface user-space device");
580	return (BUS_PROBE_NOWILDCARD);
581}
582
583static int
584privcmd_attach(device_t dev)
585{
586
587	make_dev_credf(MAKEDEV_ETERNAL, &privcmd_devsw, 0, NULL, UID_ROOT,
588	    GID_WHEEL, 0600, "xen/privcmd");
589	return (0);
590}
591
592/*-------------------- Private Device Attachment Data  -----------------------*/
/* Newbus methods implemented by the privcmd driver. */
static device_method_t privcmd_methods[] = {
	DEVMETHOD(device_identify,	privcmd_identify),
	DEVMETHOD(device_probe,		privcmd_probe),
	DEVMETHOD(device_attach,	privcmd_attach),

	DEVMETHOD_END
};

/* Driver description: name, method table and softc size (none needed). */
static driver_t privcmd_driver = {
	"privcmd",
	privcmd_methods,
	0,
};

/* Register the driver on the xenpv bus and declare the dependency on it. */
DRIVER_MODULE(privcmd, xenpv, privcmd_driver, 0, 0);
MODULE_DEPEND(privcmd, xenpv, 1, 1, 1);
609