intel_utils.c revision 279470
1/*-
2 * Copyright (c) 2013 The FreeBSD Foundation
3 * All rights reserved.
4 *
5 * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
6 * under sponsorship from the FreeBSD Foundation.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/cdefs.h>
31__FBSDID("$FreeBSD: stable/10/sys/x86/iommu/intel_utils.c 279470 2015-03-01 04:22:06Z rstone $");
32
33#include <sys/param.h>
34#include <sys/bus.h>
35#include <sys/kernel.h>
36#include <sys/lock.h>
37#include <sys/malloc.h>
38#include <sys/memdesc.h>
39#include <sys/mutex.h>
40#include <sys/proc.h>
41#include <sys/queue.h>
42#include <sys/rman.h>
43#include <sys/rwlock.h>
44#include <sys/sched.h>
45#include <sys/sf_buf.h>
46#include <sys/sysctl.h>
47#include <sys/systm.h>
48#include <sys/taskqueue.h>
49#include <sys/tree.h>
50#include <dev/pci/pcivar.h>
51#include <vm/vm.h>
52#include <vm/vm_extern.h>
53#include <vm/vm_kern.h>
54#include <vm/vm_object.h>
55#include <vm/vm_page.h>
56#include <vm/vm_map.h>
57#include <vm/vm_pageout.h>
58#include <machine/bus.h>
59#include <machine/cpu.h>
60#include <x86/include/busdma_impl.h>
61#include <x86/iommu/intel_reg.h>
62#include <x86/iommu/busdma_dmar.h>
63#include <x86/iommu/intel_dmar.h>
64
65u_int
66dmar_nd2mask(u_int nd)
67{
68	static const u_int masks[] = {
69		0x000f,	/* nd == 0 */
70		0x002f,	/* nd == 1 */
71		0x00ff,	/* nd == 2 */
72		0x02ff,	/* nd == 3 */
73		0x0fff,	/* nd == 4 */
74		0x2fff,	/* nd == 5 */
75		0xffff,	/* nd == 6 */
76		0x0000,	/* nd == 7 reserved */
77	};
78
79	KASSERT(nd <= 6, ("number of domains %d", nd));
80	return (masks[nd]);
81}
82
83static const struct sagaw_bits_tag {
84	int agaw;
85	int cap;
86	int awlvl;
87	int pglvl;
88} sagaw_bits[] = {
89	{.agaw = 30, .cap = DMAR_CAP_SAGAW_2LVL, .awlvl = DMAR_CTX2_AW_2LVL,
90	    .pglvl = 2},
91	{.agaw = 39, .cap = DMAR_CAP_SAGAW_3LVL, .awlvl = DMAR_CTX2_AW_3LVL,
92	    .pglvl = 3},
93	{.agaw = 48, .cap = DMAR_CAP_SAGAW_4LVL, .awlvl = DMAR_CTX2_AW_4LVL,
94	    .pglvl = 4},
95	{.agaw = 57, .cap = DMAR_CAP_SAGAW_5LVL, .awlvl = DMAR_CTX2_AW_5LVL,
96	    .pglvl = 5},
97	{.agaw = 64, .cap = DMAR_CAP_SAGAW_6LVL, .awlvl = DMAR_CTX2_AW_6LVL,
98	    .pglvl = 6}
99};
100#define SIZEOF_SAGAW_BITS (sizeof(sagaw_bits) / sizeof(sagaw_bits[0]))
101
102bool
103dmar_pglvl_supported(struct dmar_unit *unit, int pglvl)
104{
105	int i;
106
107	for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
108		if (sagaw_bits[i].pglvl != pglvl)
109			continue;
110		if ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
111			return (true);
112	}
113	return (false);
114}
115
116int
117ctx_set_agaw(struct dmar_ctx *ctx, int mgaw)
118{
119	int sagaw, i;
120
121	ctx->mgaw = mgaw;
122	sagaw = DMAR_CAP_SAGAW(ctx->dmar->hw_cap);
123	for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
124		if (sagaw_bits[i].agaw >= mgaw) {
125			ctx->agaw = sagaw_bits[i].agaw;
126			ctx->pglvl = sagaw_bits[i].pglvl;
127			ctx->awlvl = sagaw_bits[i].awlvl;
128			return (0);
129		}
130	}
131	device_printf(ctx->dmar->dev,
132	    "context request mgaw %d for pci%d:%d:%d:%d, "
133	    "no agaw found, sagaw %x\n", mgaw, ctx->dmar->segment,
134	    pci_get_bus(ctx->ctx_tag.owner),
135	    pci_get_slot(ctx->ctx_tag.owner),
136	    pci_get_function(ctx->ctx_tag.owner), sagaw);
137	return (EINVAL);
138}
139
140/*
141 * Find a best fit mgaw for the given maxaddr:
142 *   - if allow_less is false, must find sagaw which maps all requested
143 *     addresses (used by identity mappings);
144 *   - if allow_less is true, and no supported sagaw can map all requested
145 *     address space, accept the biggest sagaw, whatever is it.
146 */
147int
148dmar_maxaddr2mgaw(struct dmar_unit *unit, dmar_gaddr_t maxaddr, bool allow_less)
149{
150	int i;
151
152	for (i = 0; i < SIZEOF_SAGAW_BITS; i++) {
153		if ((1ULL << sagaw_bits[i].agaw) >= maxaddr &&
154		    (DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap) != 0)
155			break;
156	}
157	if (allow_less && i == SIZEOF_SAGAW_BITS) {
158		do {
159			i--;
160		} while ((DMAR_CAP_SAGAW(unit->hw_cap) & sagaw_bits[i].cap)
161		    == 0);
162	}
163	if (i < SIZEOF_SAGAW_BITS)
164		return (sagaw_bits[i].agaw);
165	KASSERT(0, ("no mgaw for maxaddr %jx allow_less %d",
166	    (uintmax_t) maxaddr, allow_less));
167	return (-1);
168}
169
170/*
171 * Calculate the total amount of page table pages needed to map the
172 * whole bus address space on the context with the selected agaw.
173 */
174vm_pindex_t
175pglvl_max_pages(int pglvl)
176{
177	vm_pindex_t res;
178	int i;
179
180	for (res = 0, i = pglvl; i > 0; i--) {
181		res *= DMAR_NPTEPG;
182		res++;
183	}
184	return (res);
185}
186
187/*
188 * Return true if the page table level lvl supports the superpage for
189 * the context ctx.
190 */
191int
192ctx_is_sp_lvl(struct dmar_ctx *ctx, int lvl)
193{
194	int alvl, cap_sps;
195	static const int sagaw_sp[] = {
196		DMAR_CAP_SPS_2M,
197		DMAR_CAP_SPS_1G,
198		DMAR_CAP_SPS_512G,
199		DMAR_CAP_SPS_1T
200	};
201
202	alvl = ctx->pglvl - lvl - 1;
203	cap_sps = DMAR_CAP_SPS(ctx->dmar->hw_cap);
204	return (alvl < sizeof(sagaw_sp) / sizeof(sagaw_sp[0]) &&
205	    (sagaw_sp[alvl] & cap_sps) != 0);
206}
207
208dmar_gaddr_t
209pglvl_page_size(int total_pglvl, int lvl)
210{
211	int rlvl;
212	static const dmar_gaddr_t pg_sz[] = {
213		(dmar_gaddr_t)DMAR_PAGE_SIZE,
214		(dmar_gaddr_t)DMAR_PAGE_SIZE << DMAR_NPTEPGSHIFT,
215		(dmar_gaddr_t)DMAR_PAGE_SIZE << (2 * DMAR_NPTEPGSHIFT),
216		(dmar_gaddr_t)DMAR_PAGE_SIZE << (3 * DMAR_NPTEPGSHIFT),
217		(dmar_gaddr_t)DMAR_PAGE_SIZE << (4 * DMAR_NPTEPGSHIFT),
218		(dmar_gaddr_t)DMAR_PAGE_SIZE << (5 * DMAR_NPTEPGSHIFT)
219	};
220
221	KASSERT(lvl >= 0 && lvl < total_pglvl,
222	    ("total %d lvl %d", total_pglvl, lvl));
223	rlvl = total_pglvl - lvl - 1;
224	KASSERT(rlvl < sizeof(pg_sz) / sizeof(pg_sz[0]),
225	    ("sizeof pg_sz lvl %d", lvl));
226	return (pg_sz[rlvl]);
227}
228
229dmar_gaddr_t
230ctx_page_size(struct dmar_ctx *ctx, int lvl)
231{
232
233	return (pglvl_page_size(ctx->pglvl, lvl));
234}
235
236int
237calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size,
238    dmar_gaddr_t *isizep)
239{
240	dmar_gaddr_t isize;
241	int am;
242
243	for (am = DMAR_CAP_MAMV(unit->hw_cap);; am--) {
244		isize = 1ULL << (am + DMAR_PAGE_SHIFT);
245		if ((base & (isize - 1)) == 0 && size >= isize)
246			break;
247		if (am == 0)
248			break;
249	}
250	*isizep = isize;
251	return (am);
252}
253
/* Upper physical address bound passed to the page table page allocator. */
dmar_haddr_t dmar_high;
/* NOTE(review): not referenced in this file; presumably the host address width -- confirm. */
int haw;
/* Number of page table pages currently allocated; exported via sysctl. */
int dmar_tbl_pagecnt;
257
258vm_page_t
259dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags)
260{
261	vm_page_t m;
262	int zeroed;
263
264	zeroed = (flags & DMAR_PGF_ZERO) != 0 ? VM_ALLOC_ZERO : 0;
265	for (;;) {
266		if ((flags & DMAR_PGF_OBJL) == 0)
267			VM_OBJECT_WLOCK(obj);
268		m = vm_page_lookup(obj, idx);
269		if ((flags & DMAR_PGF_NOALLOC) != 0 || m != NULL) {
270			if ((flags & DMAR_PGF_OBJL) == 0)
271				VM_OBJECT_WUNLOCK(obj);
272			break;
273		}
274		m = vm_page_alloc_contig(obj, idx, VM_ALLOC_NOBUSY |
275		    VM_ALLOC_SYSTEM | VM_ALLOC_NODUMP | zeroed, 1, 0,
276		    dmar_high, PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
277		if ((flags & DMAR_PGF_OBJL) == 0)
278			VM_OBJECT_WUNLOCK(obj);
279		if (m != NULL) {
280			if (zeroed && (m->flags & PG_ZERO) == 0)
281				pmap_zero_page(m);
282			atomic_add_int(&dmar_tbl_pagecnt, 1);
283			break;
284		}
285		if ((flags & DMAR_PGF_WAITOK) == 0)
286			break;
287		if ((flags & DMAR_PGF_OBJL) != 0)
288			VM_OBJECT_WUNLOCK(obj);
289		VM_WAIT;
290		if ((flags & DMAR_PGF_OBJL) != 0)
291			VM_OBJECT_WLOCK(obj);
292	}
293	return (m);
294}
295
296void
297dmar_pgfree(vm_object_t obj, vm_pindex_t idx, int flags)
298{
299	vm_page_t m;
300
301	if ((flags & DMAR_PGF_OBJL) == 0)
302		VM_OBJECT_WLOCK(obj);
303	m = vm_page_lookup(obj, idx);
304	if (m != NULL) {
305		vm_page_free(m);
306		atomic_subtract_int(&dmar_tbl_pagecnt, 1);
307	}
308	if ((flags & DMAR_PGF_OBJL) == 0)
309		VM_OBJECT_WUNLOCK(obj);
310}
311
312void *
313dmar_map_pgtbl(vm_object_t obj, vm_pindex_t idx, int flags,
314    struct sf_buf **sf)
315{
316	vm_page_t m;
317	bool allocated;
318
319	if ((flags & DMAR_PGF_OBJL) == 0)
320		VM_OBJECT_WLOCK(obj);
321	m = vm_page_lookup(obj, idx);
322	if (m == NULL && (flags & DMAR_PGF_ALLOC) != 0) {
323		m = dmar_pgalloc(obj, idx, flags | DMAR_PGF_OBJL);
324		allocated = true;
325	} else
326		allocated = false;
327	if (m == NULL) {
328		if ((flags & DMAR_PGF_OBJL) == 0)
329			VM_OBJECT_WUNLOCK(obj);
330		return (NULL);
331	}
332	/* Sleepable allocations cannot fail. */
333	if ((flags & DMAR_PGF_WAITOK) != 0)
334		VM_OBJECT_WUNLOCK(obj);
335	sched_pin();
336	*sf = sf_buf_alloc(m, SFB_CPUPRIVATE | ((flags & DMAR_PGF_WAITOK)
337	    == 0 ? SFB_NOWAIT : 0));
338	if (*sf == NULL) {
339		sched_unpin();
340		if (allocated) {
341			VM_OBJECT_ASSERT_WLOCKED(obj);
342			dmar_pgfree(obj, m->pindex, flags | DMAR_PGF_OBJL);
343		}
344		if ((flags & DMAR_PGF_OBJL) == 0)
345			VM_OBJECT_WUNLOCK(obj);
346		return (NULL);
347	}
348	if ((flags & (DMAR_PGF_WAITOK | DMAR_PGF_OBJL)) ==
349	    (DMAR_PGF_WAITOK | DMAR_PGF_OBJL))
350		VM_OBJECT_WLOCK(obj);
351	else if ((flags & (DMAR_PGF_WAITOK | DMAR_PGF_OBJL)) == 0)
352		VM_OBJECT_WUNLOCK(obj);
353	return ((void *)sf_buf_kva(*sf));
354}
355
/*
 * Release the sf_buf mapping sf and unpin the current thread.  This is
 * the counterpart of a successful dmar_map_pgtbl(), which returns with
 * the thread pinned.
 */
void
dmar_unmap_pgtbl(struct sf_buf *sf)
{

	sf_buf_free(sf);
	sched_unpin();
}
363
364static void
365dmar_flush_transl_to_ram(struct dmar_unit *unit, void *dst, size_t sz)
366{
367
368	if (DMAR_IS_COHERENT(unit))
369		return;
370	/*
371	 * If DMAR does not snoop paging structures accesses, flush
372	 * CPU cache to memory.
373	 */
374	pmap_invalidate_cache_range((uintptr_t)dst, (uintptr_t)dst + sz,
375	    TRUE);
376}
377
378void
379dmar_flush_pte_to_ram(struct dmar_unit *unit, dmar_pte_t *dst)
380{
381
382	dmar_flush_transl_to_ram(unit, dst, sizeof(*dst));
383}
384
385void
386dmar_flush_ctx_to_ram(struct dmar_unit *unit, dmar_ctx_entry_t *dst)
387{
388
389	dmar_flush_transl_to_ram(unit, dst, sizeof(*dst));
390}
391
392void
393dmar_flush_root_to_ram(struct dmar_unit *unit, dmar_root_entry_t *dst)
394{
395
396	dmar_flush_transl_to_ram(unit, dst, sizeof(*dst));
397}
398
399/*
400 * Load the root entry pointer into the hardware, busily waiting for
401 * the completion.
402 */
403int
404dmar_load_root_entry_ptr(struct dmar_unit *unit)
405{
406	vm_page_t root_entry;
407
408	/*
409	 * Access to the GCMD register must be serialized while the
410	 * command is submitted.
411	 */
412	DMAR_ASSERT_LOCKED(unit);
413
414	VM_OBJECT_RLOCK(unit->ctx_obj);
415	root_entry = vm_page_lookup(unit->ctx_obj, 0);
416	VM_OBJECT_RUNLOCK(unit->ctx_obj);
417	dmar_write8(unit, DMAR_RTADDR_REG, VM_PAGE_TO_PHYS(root_entry));
418	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_SRTP);
419	/* XXXKIB should have a timeout */
420	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_RTPS) == 0)
421		cpu_spinwait();
422	return (0);
423}
424
425/*
426 * Globally invalidate the context entries cache, busily waiting for
427 * the completion.
428 */
429int
430dmar_inv_ctx_glob(struct dmar_unit *unit)
431{
432
433	/*
434	 * Access to the CCMD register must be serialized while the
435	 * command is submitted.
436	 */
437	DMAR_ASSERT_LOCKED(unit);
438	KASSERT(!unit->qi_enabled, ("QI enabled"));
439
440	/*
441	 * The DMAR_CCMD_ICC bit in the upper dword should be written
442	 * after the low dword write is completed.  Amd64
443	 * dmar_write8() does not have this issue, i386 dmar_write8()
444	 * writes the upper dword last.
445	 */
446	dmar_write8(unit, DMAR_CCMD_REG, DMAR_CCMD_ICC | DMAR_CCMD_CIRG_GLOB);
447	/* XXXKIB should have a timeout */
448	while ((dmar_read4(unit, DMAR_CCMD_REG + 4) & DMAR_CCMD_ICC32) != 0)
449		cpu_spinwait();
450	return (0);
451}
452
453/*
454 * Globally invalidate the IOTLB, busily waiting for the completion.
455 */
456int
457dmar_inv_iotlb_glob(struct dmar_unit *unit)
458{
459	int reg;
460
461	DMAR_ASSERT_LOCKED(unit);
462	KASSERT(!unit->qi_enabled, ("QI enabled"));
463
464	reg = 16 * DMAR_ECAP_IRO(unit->hw_ecap);
465	/* See a comment about DMAR_CCMD_ICC in dmar_inv_ctx_glob. */
466	dmar_write8(unit, reg + DMAR_IOTLB_REG_OFF, DMAR_IOTLB_IVT |
467	    DMAR_IOTLB_IIRG_GLB | DMAR_IOTLB_DR | DMAR_IOTLB_DW);
468	/* XXXKIB should have a timeout */
469	while ((dmar_read4(unit, reg + DMAR_IOTLB_REG_OFF + 4) &
470	    DMAR_IOTLB_IVT32) != 0)
471		cpu_spinwait();
472	return (0);
473}
474
475/*
476 * Flush the chipset write buffers.  See 11.1 "Write Buffer Flushing"
477 * in the architecture specification.
478 */
479int
480dmar_flush_write_bufs(struct dmar_unit *unit)
481{
482
483	DMAR_ASSERT_LOCKED(unit);
484
485	/*
486	 * DMAR_GCMD_WBF is only valid when CAP_RWBF is reported.
487	 */
488	KASSERT((unit->hw_cap & DMAR_CAP_RWBF) != 0,
489	    ("dmar%d: no RWBF", unit->unit));
490
491	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd | DMAR_GCMD_WBF);
492	/* XXXKIB should have a timeout */
493	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_WBFS) == 0)
494		cpu_spinwait();
495	return (0);
496}
497
498int
499dmar_enable_translation(struct dmar_unit *unit)
500{
501
502	DMAR_ASSERT_LOCKED(unit);
503	unit->hw_gcmd |= DMAR_GCMD_TE;
504	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
505	/* XXXKIB should have a timeout */
506	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) == 0)
507		cpu_spinwait();
508	return (0);
509}
510
511int
512dmar_disable_translation(struct dmar_unit *unit)
513{
514
515	DMAR_ASSERT_LOCKED(unit);
516	unit->hw_gcmd &= ~DMAR_GCMD_TE;
517	dmar_write4(unit, DMAR_GCMD_REG, unit->hw_gcmd);
518	/* XXXKIB should have a timeout */
519	while ((dmar_read4(unit, DMAR_GSTS_REG) & DMAR_GSTS_TES) != 0)
520		cpu_spinwait();
521	return (0);
522}
523
/*
 * Declare the three flag masks of the barrier identified by the local
 * variable barrier_id.  Each barrier owns three consecutive bits of
 * the flags word, starting at bit 3 * barrier_id: "done", "in
 * progress" and "wakeup requested", in that order.  The expansion has
 * no trailing semicolon; the user supplies it.
 */
#define BARRIER_F						\
	const u_int f_done = 1U << (barrier_id * 3);		\
	const u_int f_inproc = f_done << 1;			\
	const u_int f_wakeup = f_done << 2
530
531bool
532dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id)
533{
534	BARRIER_F;
535
536	DMAR_LOCK(dmar);
537	if ((dmar->barrier_flags & f_done) != 0) {
538		DMAR_UNLOCK(dmar);
539		return (false);
540	}
541
542	if ((dmar->barrier_flags & f_inproc) != 0) {
543		while ((dmar->barrier_flags & f_inproc) != 0) {
544			dmar->barrier_flags |= f_wakeup;
545			msleep(&dmar->barrier_flags, &dmar->lock, 0,
546			    "dmarb", 0);
547		}
548		KASSERT((dmar->barrier_flags & f_done) != 0,
549		    ("dmar%d barrier %d missing done", dmar->unit, barrier_id));
550		DMAR_UNLOCK(dmar);
551		return (false);
552	}
553
554	dmar->barrier_flags |= f_inproc;
555	DMAR_UNLOCK(dmar);
556	return (true);
557}
558
559void
560dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id)
561{
562	BARRIER_F;
563
564	DMAR_ASSERT_LOCKED(dmar);
565	KASSERT((dmar->barrier_flags & (f_done | f_inproc)) == f_inproc,
566	    ("dmar%d barrier %d missed entry", dmar->unit, barrier_id));
567	dmar->barrier_flags |= f_done;
568	if ((dmar->barrier_flags & f_wakeup) != 0)
569		wakeup(&dmar->barrier_flags);
570	dmar->barrier_flags &= ~(f_inproc | f_wakeup);
571	DMAR_UNLOCK(dmar);
572}
573
/* Backing variable of the hw.dmar.match_verbose sysctl below. */
int dmar_match_verbose;

/* Root of the hw.dmar sysctl tree; declared without a description. */
static SYSCTL_NODE(_hw, OID_AUTO, dmar, CTLFLAG_RD, NULL,
    "");
/* hw.dmar.tbl_pagecnt: read-only view of dmar_tbl_pagecnt. */
SYSCTL_INT(_hw_dmar, OID_AUTO, tbl_pagecnt, CTLFLAG_RD | CTLFLAG_TUN,
    &dmar_tbl_pagecnt, 0,
    "Count of pages used for DMAR pagetables");
/* hw.dmar.match_verbose: read-write knob backed by dmar_match_verbose. */
SYSCTL_INT(_hw_dmar, OID_AUTO, match_verbose, CTLFLAG_RW | CTLFLAG_TUN,
    &dmar_match_verbose, 0,
    "Verbose matching of the PCI devices to DMAR paths");
#ifdef INVARIANTS
/* hw.dmar.check_free: read-write knob, only present in INVARIANTS kernels. */
int dmar_check_free;
SYSCTL_INT(_hw_dmar, OID_AUTO, check_free, CTLFLAG_RW | CTLFLAG_TUN,
    &dmar_check_free, 0,
    "Check the GPA RBtree for free_down and free_after validity");
#endif
590
591