1// SPDX-License-Identifier: GPL-2.0-only
2/*
 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 *          Ashok Raj <ashok.raj@intel.com>,
7 *          Shaohua Li <shaohua.li@intel.com>,
8 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 *          Fenghua Yu <fenghua.yu@intel.com>
10 *          Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt)     "DMAR: " fmt
14#define dev_fmt(fmt)    pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-pages.h"
31#include "pasid.h"
32#include "cap_audit.h"
33#include "perfmon.h"
34
35#define ROOT_SIZE		VTD_PAGE_SIZE
36#define CONTEXT_SIZE		VTD_PAGE_SIZE
37
38#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43#define IOAPIC_RANGE_START	(0xfee00000)
44#define IOAPIC_RANGE_END	(0xfeefffff)
45#define IOVA_START_ADDR		(0x1000)
46
47#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51
52/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
55				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
57
58static void __init check_tylersburg_isoch(void);
59static int rwbf_quirk;
60
/*
 * Set to 1 to panic the kernel if VT-d cannot be successfully enabled
 * (used when the kernel is launched with TXT).
 */
65static int force_on = 0;
66static int intel_iommu_tboot_noforce;
67static int no_platform_optin;
68
69#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70
71/*
72 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73 * if marked present.
74 */
75static phys_addr_t root_entry_lctp(struct root_entry *re)
76{
77	if (!(re->lo & 1))
78		return 0;
79
80	return re->lo & VTD_PAGE_MASK;
81}
82
83/*
84 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85 * if marked present.
86 */
87static phys_addr_t root_entry_uctp(struct root_entry *re)
88{
89	if (!(re->hi & 1))
90		return 0;
91
92	return re->hi & VTD_PAGE_MASK;
93}
94
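/*
 * rbtree comparison helpers: device_domain_info nodes are kept in the
 * per-IOMMU device rbtree, ordered by PCI request ID (bus:devfn).
 */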
95static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96{
97	struct device_domain_info *info =
98		rb_entry(node, struct device_domain_info, node);
99	const u16 *rid_lhs = key;
100
101	if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102		return -1;
103
104	if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105		return 1;
106
107	return 0;
108}
109
110static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111{
112	struct device_domain_info *info =
113		rb_entry(lhs, struct device_domain_info, node);
114	u16 key = PCI_DEVID(info->bus, info->devfn);
115
116	return device_rid_cmp_key(&key, rhs);
117}
118
119/*
120 * Looks up an IOMMU-probed device using its source ID.
121 *
122 * Returns the pointer to the device if there is a match. Otherwise,
123 * returns NULL.
124 *
 * Note that this helper doesn't guarantee that the device won't be
 * released by the iommu subsystem after being returned. If that is a
 * possibility, the caller must use its own synchronization mechanism
 * to prevent the device from being released while it is still in use.
129 */
130struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131{
132	struct device_domain_info *info = NULL;
133	struct rb_node *node;
134	unsigned long flags;
135
136	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137	node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138	if (node)
139		info = rb_entry(node, struct device_domain_info, node);
140	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141
142	return info ? info->dev : NULL;
143}
144
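/*
 * Insert an IOMMU-probed device into the per-IOMMU RID rbtree so that
 * device_rbtree_find() can later resolve a source ID back to the device.
 */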
145static int device_rbtree_insert(struct intel_iommu *iommu,
146				struct device_domain_info *info)
147{
148	struct rb_node *curr;
149	unsigned long flags;
150
151	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152	curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154	if (WARN_ON(curr))
155		return -EEXIST;
156
157	return 0;
158}
159
160static void device_rbtree_remove(struct device_domain_info *info)
161{
162	struct intel_iommu *iommu = info->iommu;
163	unsigned long flags;
164
165	spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166	rb_erase(&info->node, &iommu->device_rbtree);
167	spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168}
169
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping of all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
176static struct dmar_domain *si_domain;
177static int hw_pass_through = 1;
178
179struct dmar_rmrr_unit {
180	struct list_head list;		/* list of rmrr units	*/
181	struct acpi_dmar_header *hdr;	/* ACPI header		*/
182	u64	base_address;		/* reserved base address*/
183	u64	end_address;		/* reserved end address */
184	struct dmar_dev_scope *devices;	/* target devices */
185	int	devices_cnt;		/* target device count */
186};
187
188struct dmar_atsr_unit {
189	struct list_head list;		/* list of ATSR units */
190	struct acpi_dmar_header *hdr;	/* ACPI header */
191	struct dmar_dev_scope *devices;	/* target devices */
192	int devices_cnt;		/* target device count */
193	u8 include_all:1;		/* include all ports */
194};
195
196struct dmar_satc_unit {
197	struct list_head list;		/* list of SATC units */
198	struct acpi_dmar_header *hdr;	/* ACPI header */
199	struct dmar_dev_scope *devices;	/* target devices */
200	struct intel_iommu *iommu;	/* the corresponding iommu */
201	int devices_cnt;		/* target device count */
202	u8 atc_required:1;		/* ATS is required */
203};
204
205static LIST_HEAD(dmar_atsr_units);
206static LIST_HEAD(dmar_rmrr_units);
207static LIST_HEAD(dmar_satc_units);
208
209#define for_each_rmrr_units(rmrr) \
210	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
211
212static void intel_iommu_domain_free(struct iommu_domain *domain);
213
214int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
215int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
216
217int intel_iommu_enabled = 0;
218EXPORT_SYMBOL_GPL(intel_iommu_enabled);
219
220static int intel_iommu_superpage = 1;
221static int iommu_identity_mapping;
222static int iommu_skip_te_disable;
223static int disable_igfx_iommu;
224
225#define IDENTMAP_AZALIA		4
226
227const struct iommu_ops intel_iommu_ops;
228static const struct iommu_dirty_ops intel_dirty_ops;
229
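/*
 * Helpers to track whether DMA translation was already enabled on this
 * IOMMU when the driver took over (e.g. by firmware or a previous kdump
 * kernel), as reported by the TES bit in the global status register.
 */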
230static bool translation_pre_enabled(struct intel_iommu *iommu)
231{
232	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
233}
234
235static void clear_translation_pre_enabled(struct intel_iommu *iommu)
236{
237	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
238}
239
240static void init_translation_status(struct intel_iommu *iommu)
241{
242	u32 gsts;
243
244	gsts = readl(iommu->reg + DMAR_GSTS_REG);
245	if (gsts & DMA_GSTS_TES)
246		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
247}
248
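/*
 * Parse the comma-separated options of the intel_iommu= kernel command
 * line parameter, e.g. "intel_iommu=on,sm_on".
 */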
249static int __init intel_iommu_setup(char *str)
250{
251	if (!str)
252		return -EINVAL;
253
254	while (*str) {
255		if (!strncmp(str, "on", 2)) {
256			dmar_disabled = 0;
257			pr_info("IOMMU enabled\n");
258		} else if (!strncmp(str, "off", 3)) {
259			dmar_disabled = 1;
260			no_platform_optin = 1;
261			pr_info("IOMMU disabled\n");
262		} else if (!strncmp(str, "igfx_off", 8)) {
263			disable_igfx_iommu = 1;
264			pr_info("Disable GFX device mapping\n");
265		} else if (!strncmp(str, "forcedac", 8)) {
266			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
267			iommu_dma_forcedac = true;
268		} else if (!strncmp(str, "strict", 6)) {
269			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
270			iommu_set_dma_strict();
271		} else if (!strncmp(str, "sp_off", 6)) {
272			pr_info("Disable supported super page\n");
273			intel_iommu_superpage = 0;
274		} else if (!strncmp(str, "sm_on", 5)) {
275			pr_info("Enable scalable mode if hardware supports\n");
276			intel_iommu_sm = 1;
277		} else if (!strncmp(str, "sm_off", 6)) {
278			pr_info("Scalable mode is disallowed\n");
279			intel_iommu_sm = 0;
280		} else if (!strncmp(str, "tboot_noforce", 13)) {
281			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
282			intel_iommu_tboot_noforce = 1;
283		} else {
284			pr_notice("Unknown option - '%s'\n", str);
285		}
286
287		str += strcspn(str, ",");
288		while (*str == ',')
289			str++;
290	}
291
292	return 1;
293}
294__setup("intel_iommu=", intel_iommu_setup);
295
296static int domain_type_is_si(struct dmar_domain *domain)
297{
298	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
299}
300
301static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
302{
303	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
304
305	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
306}
307
308/*
309 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
310 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
311 * the returned SAGAW.
312 */
313static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
314{
315	unsigned long fl_sagaw, sl_sagaw;
316
317	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
318	sl_sagaw = cap_sagaw(iommu->cap);
319
320	/* Second level only. */
321	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
322		return sl_sagaw;
323
324	/* First level only. */
325	if (!ecap_slts(iommu->ecap))
326		return fl_sagaw;
327
328	return fl_sagaw & sl_sagaw;
329}
330
331static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
332{
333	unsigned long sagaw;
334	int agaw;
335
336	sagaw = __iommu_calculate_sagaw(iommu);
337	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
338		if (test_bit(agaw, &sagaw))
339			break;
340	}
341
342	return agaw;
343}
344
345/*
346 * Calculate max SAGAW for each iommu.
347 */
348int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
349{
350	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
351}
352
/*
 * Calculate the agaw for each iommu.
 * "SAGAW" may differ across iommus: start from a default agaw, and fall
 * back to a smaller supported agaw for iommus that don't support the
 * default one.
 */
358int iommu_calculate_agaw(struct intel_iommu *iommu)
359{
360	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
361}
362
363static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
364{
365	return sm_supported(iommu) ?
366			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
367}
368
369static void domain_update_iommu_coherency(struct dmar_domain *domain)
370{
371	struct iommu_domain_info *info;
372	struct dmar_drhd_unit *drhd;
373	struct intel_iommu *iommu;
374	bool found = false;
375	unsigned long i;
376
377	domain->iommu_coherency = true;
378	xa_for_each(&domain->iommu_array, i, info) {
379		found = true;
380		if (!iommu_paging_structure_coherency(info->iommu)) {
381			domain->iommu_coherency = false;
382			break;
383		}
384	}
385	if (found)
386		return;
387
388	/* No hardware attached; use lowest common denominator */
389	rcu_read_lock();
390	for_each_active_iommu(iommu, drhd) {
391		if (!iommu_paging_structure_coherency(iommu)) {
392			domain->iommu_coherency = false;
393			break;
394		}
395	}
396	rcu_read_unlock();
397}
398
399static int domain_update_iommu_superpage(struct dmar_domain *domain,
400					 struct intel_iommu *skip)
401{
402	struct dmar_drhd_unit *drhd;
403	struct intel_iommu *iommu;
404	int mask = 0x3;
405
406	if (!intel_iommu_superpage)
407		return 0;
408
409	/* set iommu_superpage to the smallest common denominator */
410	rcu_read_lock();
411	for_each_active_iommu(iommu, drhd) {
412		if (iommu != skip) {
413			if (domain && domain->use_first_level) {
414				if (!cap_fl1gp_support(iommu->cap))
415					mask = 0x1;
416			} else {
417				mask &= cap_super_page_val(iommu->cap);
418			}
419
420			if (!mask)
421				break;
422		}
423	}
424	rcu_read_unlock();
425
426	return fls(mask);
427}
428
429static int domain_update_device_node(struct dmar_domain *domain)
430{
431	struct device_domain_info *info;
432	int nid = NUMA_NO_NODE;
433	unsigned long flags;
434
435	spin_lock_irqsave(&domain->lock, flags);
436	list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could be multiple device NUMA nodes, as devices within
		 * the same domain may sit behind different IOMMUs. There is no
		 * perfect answer in such a situation, so we use a first-come,
		 * first-served policy.
		 */
443		nid = dev_to_node(info->dev);
444		if (nid != NUMA_NO_NODE)
445			break;
446	}
447	spin_unlock_irqrestore(&domain->lock, flags);
448
449	return nid;
450}
451
452/* Return the super pagesize bitmap if supported. */
453static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
454{
455	unsigned long bitmap = 0;
456
457	/*
458	 * 1-level super page supports page size of 2MiB, 2-level super page
459	 * supports page size of both 2MiB and 1GiB.
460	 */
461	if (domain->iommu_superpage == 1)
462		bitmap |= SZ_2M;
463	else if (domain->iommu_superpage == 2)
464		bitmap |= SZ_2M | SZ_1G;
465
466	return bitmap;
467}
468
469/* Some capabilities may be different across iommus */
470void domain_update_iommu_cap(struct dmar_domain *domain)
471{
472	domain_update_iommu_coherency(domain);
473	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
474
475	/*
476	 * If RHSA is missing, we should default to the device numa domain
477	 * as fall back.
478	 */
479	if (domain->nid == NUMA_NO_NODE)
480		domain->nid = domain_update_device_node(domain);
481
482	/*
483	 * First-level translation restricts the input-address to a
484	 * canonical address (i.e., address bits 63:N have the same
485	 * value as address bit [N-1], where N is 48-bits with 4-level
486	 * paging and 57-bits with 5-level paging). Hence, skip bit
487	 * [N-1].
488	 */
489	if (domain->use_first_level)
490		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
491	else
492		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
493
494	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
495	domain_update_iotlb(domain);
496}
497
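/*
 * Return the context entry for (@bus, @devfn), allocating the context
 * table page on demand when @alloc is set. In scalable mode each root
 * entry provides two context-table halves (lo/hi) of 128 entries each,
 * and context entries are 256 bits wide, hence the devfn adjustment.
 */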
498struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
499					 u8 devfn, int alloc)
500{
501	struct root_entry *root = &iommu->root_entry[bus];
502	struct context_entry *context;
503	u64 *entry;
504
	/*
	 * Unless the caller requested to allocate a new entry, returning
	 * a copied context entry makes no sense.
	 */
509	if (!alloc && context_copied(iommu, bus, devfn))
510		return NULL;
511
512	entry = &root->lo;
513	if (sm_supported(iommu)) {
514		if (devfn >= 0x80) {
515			devfn -= 0x80;
516			entry = &root->hi;
517		}
518		devfn *= 2;
519	}
520	if (*entry & 1)
521		context = phys_to_virt(*entry & VTD_PAGE_MASK);
522	else {
523		unsigned long phy_addr;
524		if (!alloc)
525			return NULL;
526
527		context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
528		if (!context)
529			return NULL;
530
531		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
532		phy_addr = virt_to_phys((void *)context);
533		*entry = phy_addr | 1;
534		__iommu_flush_cache(iommu, entry, sizeof(*entry));
535	}
536	return &context[devfn];
537}
538
539/**
540 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
541 *				 sub-hierarchy of a candidate PCI-PCI bridge
542 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
543 * @bridge: the candidate PCI-PCI bridge
544 *
545 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
546 */
547static bool
548is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
549{
550	struct pci_dev *pdev, *pbridge;
551
552	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
553		return false;
554
555	pdev = to_pci_dev(dev);
556	pbridge = to_pci_dev(bridge);
557
558	if (pbridge->subordinate &&
559	    pbridge->subordinate->number <= pdev->bus->number &&
560	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
561		return true;
562
563	return false;
564}
565
566static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
567{
568	struct dmar_drhd_unit *drhd;
569	u32 vtbar;
570	int rc;
571
572	/* We know that this device on this chipset has its own IOMMU.
573	 * If we find it under a different IOMMU, then the BIOS is lying
574	 * to us. Hope that the IOMMU for this device is actually
575	 * disabled, and it needs no translation...
576	 */
577	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
578	if (rc) {
579		/* "can't" happen */
580		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
581		return false;
582	}
583	vtbar &= 0xffff0000;
584
	/* we know that this iommu should be at offset 0xa000 from vtbar */
586	drhd = dmar_find_matched_drhd_unit(pdev);
587	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
588		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
589		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
590		return true;
591	}
592
593	return false;
594}
595
596static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
597{
598	if (!iommu || iommu->drhd->ignored)
599		return true;
600
601	if (dev_is_pci(dev)) {
602		struct pci_dev *pdev = to_pci_dev(dev);
603
604		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
605		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
606		    quirk_ioat_snb_local_iommu(pdev))
607			return true;
608	}
609
610	return false;
611}
612
613static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
614{
615	struct dmar_drhd_unit *drhd = NULL;
616	struct pci_dev *pdev = NULL;
617	struct intel_iommu *iommu;
618	struct device *tmp;
619	u16 segment = 0;
620	int i;
621
622	if (!dev)
623		return NULL;
624
625	if (dev_is_pci(dev)) {
626		struct pci_dev *pf_pdev;
627
628		pdev = pci_real_dma_dev(to_pci_dev(dev));
629
630		/* VFs aren't listed in scope tables; we need to look up
631		 * the PF instead to find the IOMMU. */
632		pf_pdev = pci_physfn(pdev);
633		dev = &pf_pdev->dev;
634		segment = pci_domain_nr(pdev->bus);
635	} else if (has_acpi_companion(dev))
636		dev = &ACPI_COMPANION(dev)->dev;
637
638	rcu_read_lock();
639	for_each_iommu(iommu, drhd) {
640		if (pdev && segment != drhd->segment)
641			continue;
642
643		for_each_active_dev_scope(drhd->devices,
644					  drhd->devices_cnt, i, tmp) {
645			if (tmp == dev) {
646				/* For a VF use its original BDF# not that of the PF
647				 * which we used for the IOMMU lookup. Strictly speaking
648				 * we could do this for all PCI devices; we only need to
649				 * get the BDF# from the scope table for ACPI matches. */
650				if (pdev && pdev->is_virtfn)
651					goto got_pdev;
652
653				if (bus && devfn) {
654					*bus = drhd->devices[i].bus;
655					*devfn = drhd->devices[i].devfn;
656				}
657				goto out;
658			}
659
660			if (is_downstream_to_pci_bridge(dev, tmp))
661				goto got_pdev;
662		}
663
664		if (pdev && drhd->include_all) {
665got_pdev:
666			if (bus && devfn) {
667				*bus = pdev->bus->number;
668				*devfn = pdev->devfn;
669			}
670			goto out;
671		}
672	}
673	iommu = NULL;
674out:
675	if (iommu_is_dummy(iommu, dev))
676		iommu = NULL;
677
678	rcu_read_unlock();
679
680	return iommu;
681}
682
683static void domain_flush_cache(struct dmar_domain *domain,
684			       void *addr, int size)
685{
686	if (!domain->iommu_coherency)
687		clflush_cache_range(addr, size);
688}
689
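/*
 * Free every context table referenced from the root table (both halves
 * in scalable mode), then free the root table itself.
 */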
690static void free_context_table(struct intel_iommu *iommu)
691{
692	struct context_entry *context;
693	int i;
694
695	if (!iommu->root_entry)
696		return;
697
698	for (i = 0; i < ROOT_ENTRY_NR; i++) {
699		context = iommu_context_addr(iommu, i, 0, 0);
700		if (context)
701			iommu_free_page(context);
702
703		if (!sm_supported(iommu))
704			continue;
705
706		context = iommu_context_addr(iommu, i, 0x80, 0);
707		if (context)
708			iommu_free_page(context);
709	}
710
711	iommu_free_page(iommu->root_entry);
712	iommu->root_entry = NULL;
713}
714
715#ifdef CONFIG_DMAR_DEBUG
716static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
717			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
718{
719	struct dma_pte *pte;
720	int offset;
721
722	while (1) {
723		offset = pfn_level_offset(pfn, level);
724		pte = &parent[offset];
725		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
726			pr_info("PTE not present at level %d\n", level);
727			break;
728		}
729
730		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
731
732		if (level == 1)
733			break;
734
735		parent = phys_to_virt(dma_pte_addr(pte));
736		level--;
737	}
738}
739
740void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
741			  unsigned long long addr, u32 pasid)
742{
743	struct pasid_dir_entry *dir, *pde;
744	struct pasid_entry *entries, *pte;
745	struct context_entry *ctx_entry;
746	struct root_entry *rt_entry;
747	int i, dir_index, index, level;
748	u8 devfn = source_id & 0xff;
749	u8 bus = source_id >> 8;
750	struct dma_pte *pgtable;
751
752	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
753
754	/* root entry dump */
755	rt_entry = &iommu->root_entry[bus];
756	if (!rt_entry) {
757		pr_info("root table entry is not present\n");
758		return;
759	}
760
761	if (sm_supported(iommu))
762		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
763			rt_entry->hi, rt_entry->lo);
764	else
		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
766
767	/* context entry dump */
768	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
769	if (!ctx_entry) {
770		pr_info("context table entry is not present\n");
771		return;
772	}
773
774	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
775		ctx_entry->hi, ctx_entry->lo);
776
777	/* legacy mode does not require PASID entries */
778	if (!sm_supported(iommu)) {
779		level = agaw_to_level(ctx_entry->hi & 7);
780		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
781		goto pgtable_walk;
782	}
783
784	/* get the pointer to pasid directory entry */
785	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
786	if (!dir) {
787		pr_info("pasid directory entry is not present\n");
788		return;
789	}
790	/* For request-without-pasid, get the pasid from context entry */
791	if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
792		pasid = IOMMU_NO_PASID;
793
794	dir_index = pasid >> PASID_PDE_SHIFT;
795	pde = &dir[dir_index];
796	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
797
798	/* get the pointer to the pasid table entry */
799	entries = get_pasid_table_from_pde(pde);
800	if (!entries) {
801		pr_info("pasid table entry is not present\n");
802		return;
803	}
804	index = pasid & PASID_PTE_MASK;
805	pte = &entries[index];
806	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
807		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
808
809	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
810		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
811		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
812	} else {
813		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
814		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
815	}
816
817pgtable_walk:
818	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
819}
820#endif
821
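/*
 * Walk the page table of @domain and return the PTE that maps @pfn at
 * *target_level, allocating missing intermediate page-table pages along
 * the way. If *target_level is 0, stop at the first superpage or
 * non-present entry and report that level back through *target_level.
 */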
822static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
823				      unsigned long pfn, int *target_level,
824				      gfp_t gfp)
825{
826	struct dma_pte *parent, *pte;
827	int level = agaw_to_level(domain->agaw);
828	int offset;
829
830	if (!domain_pfn_supported(domain, pfn))
831		/* Address beyond IOMMU's addressing capabilities. */
832		return NULL;
833
834	parent = domain->pgd;
835
836	while (1) {
837		void *tmp_page;
838
839		offset = pfn_level_offset(pfn, level);
840		pte = &parent[offset];
841		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
842			break;
843		if (level == *target_level)
844			break;
845
846		if (!dma_pte_present(pte)) {
847			uint64_t pteval, tmp;
848
849			tmp_page = iommu_alloc_page_node(domain->nid, gfp);
850
851			if (!tmp_page)
852				return NULL;
853
854			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
855			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
856			if (domain->use_first_level)
857				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
858
859			tmp = 0ULL;
860			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
861				/* Someone else set it while we were thinking; use theirs. */
862				iommu_free_page(tmp_page);
863			else
864				domain_flush_cache(domain, pte, sizeof(*pte));
865		}
866		if (level == 1)
867			break;
868
869		parent = phys_to_virt(dma_pte_addr(pte));
870		level--;
871	}
872
873	if (!*target_level)
874		*target_level = level;
875
876	return pte;
877}
878
/* Return the pte for @pfn at the given level, or the superpage pte covering it. */
880static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
881					 unsigned long pfn,
882					 int level, int *large_page)
883{
884	struct dma_pte *parent, *pte;
885	int total = agaw_to_level(domain->agaw);
886	int offset;
887
888	parent = domain->pgd;
889	while (level <= total) {
890		offset = pfn_level_offset(pfn, total);
891		pte = &parent[offset];
892		if (level == total)
893			return pte;
894
895		if (!dma_pte_present(pte)) {
896			*large_page = total;
897			break;
898		}
899
900		if (dma_pte_superpage(pte)) {
901			*large_page = total;
902			return pte;
903		}
904
905		parent = phys_to_virt(dma_pte_addr(pte));
906		total--;
907	}
908	return NULL;
909}
910
/* Clear the last-level (leaf) ptes; a tlb flush must follow. */
912static void dma_pte_clear_range(struct dmar_domain *domain,
913				unsigned long start_pfn,
914				unsigned long last_pfn)
915{
916	unsigned int large_page;
917	struct dma_pte *first_pte, *pte;
918
919	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
920	    WARN_ON(start_pfn > last_pfn))
921		return;
922
923	/* we don't need lock here; nobody else touches the iova range */
924	do {
925		large_page = 1;
926		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
927		if (!pte) {
928			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
929			continue;
930		}
931		do {
932			dma_clear_pte(pte);
933			start_pfn += lvl_to_nr_pages(large_page);
934			pte++;
935		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
936
937		domain_flush_cache(domain, first_pte,
938				   (void *)pte - (void *)first_pte);
939
940	} while (start_pfn && start_pfn <= last_pfn);
941}
942
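/*
 * Recursively free page-table pages below @retain_level that are fully
 * covered by the [@start_pfn, @last_pfn] range, clearing the parent PTEs
 * that pointed to them.
 */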
943static void dma_pte_free_level(struct dmar_domain *domain, int level,
944			       int retain_level, struct dma_pte *pte,
945			       unsigned long pfn, unsigned long start_pfn,
946			       unsigned long last_pfn)
947{
948	pfn = max(start_pfn, pfn);
949	pte = &pte[pfn_level_offset(pfn, level)];
950
951	do {
952		unsigned long level_pfn;
953		struct dma_pte *level_pte;
954
955		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
956			goto next;
957
958		level_pfn = pfn & level_mask(level);
959		level_pte = phys_to_virt(dma_pte_addr(pte));
960
961		if (level > 2) {
962			dma_pte_free_level(domain, level - 1, retain_level,
963					   level_pte, level_pfn, start_pfn,
964					   last_pfn);
965		}
966
967		/*
968		 * Free the page table if we're below the level we want to
969		 * retain and the range covers the entire table.
970		 */
971		if (level < retain_level && !(start_pfn > level_pfn ||
972		      last_pfn < level_pfn + level_size(level) - 1)) {
973			dma_clear_pte(pte);
974			domain_flush_cache(domain, pte, sizeof(*pte));
975			iommu_free_page(level_pte);
976		}
977next:
978		pfn += level_size(level);
979	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
980}
981
982/*
983 * clear last level (leaf) ptes and free page table pages below the
984 * level we wish to keep intact.
985 */
986static void dma_pte_free_pagetable(struct dmar_domain *domain,
987				   unsigned long start_pfn,
988				   unsigned long last_pfn,
989				   int retain_level)
990{
991	dma_pte_clear_range(domain, start_pfn, last_pfn);
992
993	/* We don't need lock here; nobody else touches the iova range */
994	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
995			   domain->pgd, 0, start_pfn, last_pfn);
996
997	/* free pgd */
998	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
999		iommu_free_page(domain->pgd);
1000		domain->pgd = NULL;
1001	}
1002}
1003
1004/* When a page at a given level is being unlinked from its parent, we don't
1005   need to *modify* it at all. All we need to do is make a list of all the
1006   pages which can be freed just as soon as we've flushed the IOTLB and we
1007   know the hardware page-walk will no longer touch them.
1008   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1009   be freed. */
1010static void dma_pte_list_pagetables(struct dmar_domain *domain,
1011				    int level, struct dma_pte *pte,
1012				    struct list_head *freelist)
1013{
1014	struct page *pg;
1015
1016	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1017	list_add_tail(&pg->lru, freelist);
1018
1019	if (level == 1)
1020		return;
1021
1022	pte = page_address(pg);
1023	do {
1024		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1025			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1026		pte++;
1027	} while (!first_pte_in_page(pte));
1028}
1029
1030static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1031				struct dma_pte *pte, unsigned long pfn,
1032				unsigned long start_pfn, unsigned long last_pfn,
1033				struct list_head *freelist)
1034{
1035	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1036
1037	pfn = max(start_pfn, pfn);
1038	pte = &pte[pfn_level_offset(pfn, level)];
1039
1040	do {
1041		unsigned long level_pfn = pfn & level_mask(level);
1042
1043		if (!dma_pte_present(pte))
1044			goto next;
1045
1046		/* If range covers entire pagetable, free it */
1047		if (start_pfn <= level_pfn &&
1048		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1051			if (level > 1 && !dma_pte_superpage(pte))
1052				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1053
1054			dma_clear_pte(pte);
1055			if (!first_pte)
1056				first_pte = pte;
1057			last_pte = pte;
1058		} else if (level > 1) {
1059			/* Recurse down into a level that isn't *entirely* obsolete */
1060			dma_pte_clear_level(domain, level - 1,
1061					    phys_to_virt(dma_pte_addr(pte)),
1062					    level_pfn, start_pfn, last_pfn,
1063					    freelist);
1064		}
1065next:
1066		pfn = level_pfn + level_size(level);
1067	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068
1069	if (first_pte)
1070		domain_flush_cache(domain, first_pte,
1071				   (void *)++last_pte - (void *)first_pte);
1072}
1073
1074/* We can't just free the pages because the IOMMU may still be walking
1075   the page tables, and may have cached the intermediate levels. The
1076   pages can only be freed after the IOTLB flush has been done. */
1077static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1078			 unsigned long last_pfn, struct list_head *freelist)
1079{
1080	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1081	    WARN_ON(start_pfn > last_pfn))
1082		return;
1083
1084	/* we don't need lock here; nobody else touches the iova range */
1085	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1086			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1087
1088	/* free pgd */
1089	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090		struct page *pgd_page = virt_to_page(domain->pgd);
1091		list_add_tail(&pgd_page->lru, freelist);
1092		domain->pgd = NULL;
1093	}
1094}
1095
1096/* iommu handling */
1097static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1098{
1099	struct root_entry *root;
1100
1101	root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
1102	if (!root) {
1103		pr_err("Allocating root entry for %s failed\n",
1104			iommu->name);
1105		return -ENOMEM;
1106	}
1107
1108	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1109	iommu->root_entry = root;
1110
1111	return 0;
1112}
1113
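/*
 * Program the root (or scalable-mode root) table address into RTADDR_REG
 * and issue the Set Root Table Pointer command. Unless the hardware
 * invalidates its caches as part of SRTP (ESRTPS), explicitly flush the
 * context cache, PASID cache and IOTLB afterwards.
 */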
1114static void iommu_set_root_entry(struct intel_iommu *iommu)
1115{
1116	u64 addr;
1117	u32 sts;
1118	unsigned long flag;
1119
1120	addr = virt_to_phys(iommu->root_entry);
1121	if (sm_supported(iommu))
1122		addr |= DMA_RTADDR_SMT;
1123
1124	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1125	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1126
1127	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1128
	/* Make sure hardware completes it */
1130	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1131		      readl, (sts & DMA_GSTS_RTPS), sts);
1132
1133	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1134
1135	/*
1136	 * Hardware invalidates all DMA remapping hardware translation
1137	 * caches as part of SRTP flow.
1138	 */
1139	if (cap_esrtps(iommu->cap))
1140		return;
1141
1142	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1143	if (sm_supported(iommu))
1144		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1145	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1146}
1147
1148void iommu_flush_write_buffer(struct intel_iommu *iommu)
1149{
1150	u32 val;
1151	unsigned long flag;
1152
1153	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1154		return;
1155
1156	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1157	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1158
	/* Make sure hardware completes it */
1160	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1161		      readl, (!(val & DMA_GSTS_WBFS)), val);
1162
1163	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1164}
1165
/* Issue a context-cache invalidation request and wait for hardware to complete it. */
1167static void __iommu_flush_context(struct intel_iommu *iommu,
1168				  u16 did, u16 source_id, u8 function_mask,
1169				  u64 type)
1170{
1171	u64 val = 0;
1172	unsigned long flag;
1173
1174	switch (type) {
1175	case DMA_CCMD_GLOBAL_INVL:
1176		val = DMA_CCMD_GLOBAL_INVL;
1177		break;
1178	case DMA_CCMD_DOMAIN_INVL:
1179		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1180		break;
1181	case DMA_CCMD_DEVICE_INVL:
1182		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1183			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1184		break;
1185	default:
1186		pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1187			iommu->name, type);
1188		return;
1189	}
1190	val |= DMA_CCMD_ICC;
1191
1192	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1193	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1194
	/* Make sure hardware completes it */
1196	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1197		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1198
1199	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1200}
1201
/* Issue a register-based IOTLB invalidation request and wait for hardware to complete it. */
1203static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1204				u64 addr, unsigned int size_order, u64 type)
1205{
1206	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1207	u64 val = 0, val_iva = 0;
1208	unsigned long flag;
1209
1210	switch (type) {
1211	case DMA_TLB_GLOBAL_FLUSH:
		/* a global flush doesn't need to set IVA_REG */
1213		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1214		break;
1215	case DMA_TLB_DSI_FLUSH:
1216		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1217		break;
1218	case DMA_TLB_PSI_FLUSH:
1219		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1220		/* IH bit is passed in as part of address */
1221		val_iva = size_order | addr;
1222		break;
1223	default:
1224		pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1225			iommu->name, type);
1226		return;
1227	}
1228
1229	if (cap_write_drain(iommu->cap))
1230		val |= DMA_TLB_WRITE_DRAIN;
1231
1232	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233	/* Note: Only uses first TLB reg currently */
1234	if (val_iva)
1235		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1236	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1237
	/* Make sure hardware completes it */
1239	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1240		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1241
1242	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1243
1244	/* check IOTLB invalidation granularity */
1245	if (DMA_TLB_IAIG(val) == 0)
1246		pr_err("Flush IOTLB failed\n");
1247	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1248		pr_debug("TLB flush request %Lx, actual %Lx\n",
1249			(unsigned long long)DMA_TLB_IIRG(type),
1250			(unsigned long long)DMA_TLB_IAIG(val));
1251}
1252
1253static struct device_domain_info *
1254domain_lookup_dev_info(struct dmar_domain *domain,
1255		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1256{
1257	struct device_domain_info *info;
1258	unsigned long flags;
1259
1260	spin_lock_irqsave(&domain->lock, flags);
1261	list_for_each_entry(info, &domain->devices, link) {
1262		if (info->iommu == iommu && info->bus == bus &&
1263		    info->devfn == devfn) {
1264			spin_unlock_irqrestore(&domain->lock, flags);
1265			return info;
1266		}
1267	}
1268	spin_unlock_irqrestore(&domain->lock, flags);
1269
1270	return NULL;
1271}
1272
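/*
 * Recompute domain->has_iotlb_device: true if any device attached to the
 * domain (directly or via a PASID) has ATS enabled.
 */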
1273void domain_update_iotlb(struct dmar_domain *domain)
1274{
1275	struct dev_pasid_info *dev_pasid;
1276	struct device_domain_info *info;
1277	bool has_iotlb_device = false;
1278	unsigned long flags;
1279
1280	spin_lock_irqsave(&domain->lock, flags);
1281	list_for_each_entry(info, &domain->devices, link) {
1282		if (info->ats_enabled) {
1283			has_iotlb_device = true;
1284			break;
1285		}
1286	}
1287
1288	list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1289		info = dev_iommu_priv_get(dev_pasid->dev);
1290		if (info->ats_enabled) {
1291			has_iotlb_device = true;
1292			break;
1293		}
1294	}
1295	domain->has_iotlb_device = has_iotlb_device;
1296	spin_unlock_irqrestore(&domain->lock, flags);
1297}
1298
1299/*
1300 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1301 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1302 * check because it applies only to the built-in QAT devices and it doesn't
1303 * grant additional privileges.
1304 */
1305#define BUGGY_QAT_DEVID_MASK 0x4940
1306static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1307{
1308	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1309		return false;
1310
1311	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1312		return false;
1313
1314	return true;
1315}
1316
1317static void iommu_enable_pci_caps(struct device_domain_info *info)
1318{
1319	struct pci_dev *pdev;
1320
1321	if (!dev_is_pci(info->dev))
1322		return;
1323
1324	pdev = to_pci_dev(info->dev);
1325
1326	/* The PCIe spec, in its wisdom, declares that the behaviour of
1327	   the device if you enable PASID support after ATS support is
1328	   undefined. So always enable PASID support on devices which
1329	   have it, even if we can't yet know if we're ever going to
1330	   use it. */
1331	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1332		info->pasid_enabled = 1;
1333
1334	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1335	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1336		info->ats_enabled = 1;
1337		domain_update_iotlb(info->domain);
1338	}
1339}
1340
1341static void iommu_disable_pci_caps(struct device_domain_info *info)
1342{
1343	struct pci_dev *pdev;
1344
1345	if (!dev_is_pci(info->dev))
1346		return;
1347
1348	pdev = to_pci_dev(info->dev);
1349
1350	if (info->ats_enabled) {
1351		pci_disable_ats(pdev);
1352		info->ats_enabled = 0;
1353		domain_update_iotlb(info->domain);
1354	}
1355
1356	if (info->pasid_enabled) {
1357		pci_disable_pasid(pdev);
1358		info->pasid_enabled = 0;
1359	}
1360}
1361
1362static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1363				    u64 addr, unsigned int mask)
1364{
1365	u16 sid, qdep;
1366
1367	if (!info || !info->ats_enabled)
1368		return;
1369
1370	sid = info->bus << 8 | info->devfn;
1371	qdep = info->ats_qdep;
1372	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1373			   qdep, addr, mask);
1374	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1375}
1376
1377static void intel_flush_iotlb_all(struct iommu_domain *domain)
1378{
1379	cache_tag_flush_all(to_dmar_domain(domain));
1380}
1381
1382static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1383{
1384	u32 pmen;
1385	unsigned long flags;
1386
1387	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1388		return;
1389
1390	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1391	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1392	pmen &= ~DMA_PMEN_EPM;
1393	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1394
1395	/* wait for the protected region status bit to clear */
1396	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1397		readl, !(pmen & DMA_PMEN_PRS), pmen);
1398
1399	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1400}
1401
1402static void iommu_enable_translation(struct intel_iommu *iommu)
1403{
1404	u32 sts;
1405	unsigned long flags;
1406
1407	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1408	iommu->gcmd |= DMA_GCMD_TE;
1409	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1410
	/* Make sure hardware completes it */
1412	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1413		      readl, (sts & DMA_GSTS_TES), sts);
1414
1415	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1416}
1417
1418static void iommu_disable_translation(struct intel_iommu *iommu)
1419{
1420	u32 sts;
1421	unsigned long flag;
1422
1423	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1424	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1425		return;
1426
1427	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1428	iommu->gcmd &= ~DMA_GCMD_TE;
1429	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1430
	/* Make sure hardware completes it */
1432	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1433		      readl, (!(sts & DMA_GSTS_TES)), sts);
1434
1435	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1436}
1437
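/*
 * Allocate and initialize the per-IOMMU domain-ID bitmap, reserving the
 * IDs that must not be handed out to ordinary domains.
 */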
1438static int iommu_init_domains(struct intel_iommu *iommu)
1439{
1440	u32 ndomains;
1441
1442	ndomains = cap_ndoms(iommu->cap);
1443	pr_debug("%s: Number of Domains supported <%d>\n",
1444		 iommu->name, ndomains);
1445
1446	spin_lock_init(&iommu->lock);
1447
1448	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1449	if (!iommu->domain_ids)
1450		return -ENOMEM;
1451
1452	/*
1453	 * If Caching mode is set, then invalid translations are tagged
1454	 * with domain-id 0, hence we need to pre-allocate it. We also
1455	 * use domain-id 0 as a marker for non-allocated domain-id, so
1456	 * make sure it is not used for a real domain.
1457	 */
1458	set_bit(0, iommu->domain_ids);
1459
1460	/*
1461	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1462	 * entry for first-level or pass-through translation modes should
1463	 * be programmed with a domain id different from those used for
1464	 * second-level or nested translation. We reserve a domain id for
1465	 * this purpose.
1466	 */
1467	if (sm_supported(iommu))
1468		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1469
1470	return 0;
1471}
1472
1473static void disable_dmar_iommu(struct intel_iommu *iommu)
1474{
1475	if (!iommu->domain_ids)
1476		return;
1477
1478	/*
1479	 * All iommu domains must have been detached from the devices,
1480	 * hence there should be no domain IDs in use.
1481	 */
1482	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1483		    > NUM_RESERVED_DID))
1484		return;
1485
1486	if (iommu->gcmd & DMA_GCMD_TE)
1487		iommu_disable_translation(iommu);
1488}
1489
1490static void free_dmar_iommu(struct intel_iommu *iommu)
1491{
1492	if (iommu->domain_ids) {
1493		bitmap_free(iommu->domain_ids);
1494		iommu->domain_ids = NULL;
1495	}
1496
1497	if (iommu->copied_tables) {
1498		bitmap_free(iommu->copied_tables);
1499		iommu->copied_tables = NULL;
1500	}
1501
1502	/* free context mapping */
1503	free_context_table(iommu);
1504
1505#ifdef CONFIG_INTEL_IOMMU_SVM
1506	if (pasid_supported(iommu)) {
1507		if (ecap_prs(iommu->ecap))
1508			intel_svm_finish_prq(iommu);
1509	}
1510#endif
1511}
1512
1513/*
1514 * Check and return whether first level is used by default for
1515 * DMA translation.
1516 */
1517static bool first_level_by_default(unsigned int type)
1518{
1519	/* Only SL is available in legacy mode */
1520	if (!scalable_mode_support())
1521		return false;
1522
	/* Only one level (either FL or SL) is available, just use it */
1524	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1525		return intel_cap_flts_sanity();
1526
1527	/* Both levels are available, decide it based on domain type */
1528	return type != IOMMU_DOMAIN_UNMANAGED;
1529}
1530
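/* Allocate and initialize the software state for a new dmar_domain. */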
1531static struct dmar_domain *alloc_domain(unsigned int type)
1532{
1533	struct dmar_domain *domain;
1534
1535	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1536	if (!domain)
1537		return NULL;
1538
1539	domain->nid = NUMA_NO_NODE;
1540	if (first_level_by_default(type))
1541		domain->use_first_level = true;
1542	domain->has_iotlb_device = false;
1543	INIT_LIST_HEAD(&domain->devices);
1544	INIT_LIST_HEAD(&domain->dev_pasids);
1545	INIT_LIST_HEAD(&domain->cache_tags);
1546	spin_lock_init(&domain->lock);
1547	spin_lock_init(&domain->cache_lock);
1548	xa_init(&domain->iommu_array);
1549
1550	return domain;
1551}
1552
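/*
 * Take a reference to @domain on @iommu. On the first attachment, a
 * domain ID is allocated from the IOMMU's bitmap and the per-IOMMU info
 * is stored in domain->iommu_array; paired with domain_detach_iommu().
 */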
1553int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1554{
1555	struct iommu_domain_info *info, *curr;
1556	unsigned long ndomains;
1557	int num, ret = -ENOSPC;
1558
1559	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1560		return 0;
1561
1562	info = kzalloc(sizeof(*info), GFP_KERNEL);
1563	if (!info)
1564		return -ENOMEM;
1565
1566	spin_lock(&iommu->lock);
1567	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1568	if (curr) {
1569		curr->refcnt++;
1570		spin_unlock(&iommu->lock);
1571		kfree(info);
1572		return 0;
1573	}
1574
1575	ndomains = cap_ndoms(iommu->cap);
1576	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1577	if (num >= ndomains) {
1578		pr_err("%s: No free domain ids\n", iommu->name);
1579		goto err_unlock;
1580	}
1581
1582	set_bit(num, iommu->domain_ids);
1583	info->refcnt	= 1;
1584	info->did	= num;
1585	info->iommu	= iommu;
1586	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1587			  NULL, info, GFP_ATOMIC);
1588	if (curr) {
1589		ret = xa_err(curr) ? : -EBUSY;
1590		goto err_clear;
1591	}
1592	domain_update_iommu_cap(domain);
1593
1594	spin_unlock(&iommu->lock);
1595	return 0;
1596
1597err_clear:
1598	clear_bit(info->did, iommu->domain_ids);
1599err_unlock:
1600	spin_unlock(&iommu->lock);
1601	kfree(info);
1602	return ret;
1603}
1604
1605void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1606{
1607	struct iommu_domain_info *info;
1608
1609	if (domain->domain.type == IOMMU_DOMAIN_SVA)
1610		return;
1611
1612	spin_lock(&iommu->lock);
1613	info = xa_load(&domain->iommu_array, iommu->seq_id);
1614	if (--info->refcnt == 0) {
1615		clear_bit(info->did, iommu->domain_ids);
1616		xa_erase(&domain->iommu_array, iommu->seq_id);
1617		domain->nid = NUMA_NO_NODE;
1618		domain_update_iommu_cap(domain);
1619		kfree(info);
1620	}
1621	spin_unlock(&iommu->lock);
1622}
1623
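/*
 * Round a guest address width up to the next adjusted guest address width
 * supported by VT-d page tables (12 plus a multiple of 9 bits, capped at 64).
 */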
1624static int guestwidth_to_adjustwidth(int gaw)
1625{
1626	int agaw;
1627	int r = (gaw - 12) % 9;
1628
1629	if (r == 0)
1630		agaw = gaw;
1631	else
1632		agaw = gaw + 9 - r;
1633	if (agaw > 64)
1634		agaw = 64;
1635	return agaw;
1636}
1637
1638static void domain_exit(struct dmar_domain *domain)
1639{
1640	if (domain->pgd) {
1641		LIST_HEAD(freelist);
1642
1643		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1644		iommu_put_pages_list(&freelist);
1645	}
1646
1647	if (WARN_ON(!list_empty(&domain->devices)))
1648		return;
1649
1650	kfree(domain);
1651}
1652
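/*
 * Install a context entry on @iommu that maps (@bus, @devfn) to @domain,
 * handling kdump-copied entries and the extra invalidations required by
 * caching-mode hardware.
 */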
1653static int domain_context_mapping_one(struct dmar_domain *domain,
1654				      struct intel_iommu *iommu,
1655				      u8 bus, u8 devfn)
1656{
1657	struct device_domain_info *info =
1658			domain_lookup_dev_info(domain, iommu, bus, devfn);
1659	u16 did = domain_id_iommu(domain, iommu);
1660	int translation = CONTEXT_TT_MULTI_LEVEL;
1661	struct dma_pte *pgd = domain->pgd;
1662	struct context_entry *context;
1663	int agaw, ret;
1664
1665	if (hw_pass_through && domain_type_is_si(domain))
1666		translation = CONTEXT_TT_PASS_THROUGH;
1667
1668	pr_debug("Set context mapping for %02x:%02x.%d\n",
1669		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1670
1671	spin_lock(&iommu->lock);
1672	ret = -ENOMEM;
1673	context = iommu_context_addr(iommu, bus, devfn, 1);
1674	if (!context)
1675		goto out_unlock;
1676
1677	ret = 0;
1678	if (context_present(context) && !context_copied(iommu, bus, devfn))
1679		goto out_unlock;
1680
1681	/*
1682	 * For kdump cases, old valid entries may be cached due to the
1683	 * in-flight DMA and copied pgtable, but there is no unmapping
1684	 * behaviour for them, thus we need an explicit cache flush for
1685	 * the newly-mapped device. For kdump, at this point, the device
1686	 * is supposed to finish reset at its driver probe stage, so no
1687	 * in-flight DMA will exist, and we don't need to worry anymore
1688	 * hereafter.
1689	 */
1690	if (context_copied(iommu, bus, devfn)) {
1691		u16 did_old = context_domain_id(context);
1692
1693		if (did_old < cap_ndoms(iommu->cap)) {
1694			iommu->flush.flush_context(iommu, did_old,
1695						   (((u16)bus) << 8) | devfn,
1696						   DMA_CCMD_MASK_NOBIT,
1697						   DMA_CCMD_DEVICE_INVL);
1698			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1699						 DMA_TLB_DSI_FLUSH);
1700		}
1701
1702		clear_context_copied(iommu, bus, devfn);
1703	}
1704
1705	context_clear_entry(context);
1706	context_set_domain_id(context, did);
1707
1708	if (translation != CONTEXT_TT_PASS_THROUGH) {
		/*
		 * Skip the top levels of the page tables for an iommu that
		 * has a smaller agaw than the default. Unnecessary for PT mode.
		 */
1713		for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1714			ret = -ENOMEM;
1715			pgd = phys_to_virt(dma_pte_addr(pgd));
1716			if (!dma_pte_present(pgd))
1717				goto out_unlock;
1718		}
1719
1720		if (info && info->ats_supported)
1721			translation = CONTEXT_TT_DEV_IOTLB;
1722		else
1723			translation = CONTEXT_TT_MULTI_LEVEL;
1724
1725		context_set_address_root(context, virt_to_phys(pgd));
1726		context_set_address_width(context, agaw);
1727	} else {
1728		/*
1729		 * In pass through mode, AW must be programmed to
1730		 * indicate the largest AGAW value supported by
1731		 * hardware. And ASR is ignored by hardware.
1732		 */
1733		context_set_address_width(context, iommu->msagaw);
1734	}
1735
1736	context_set_translation_type(context, translation);
1737	context_set_fault_enable(context);
1738	context_set_present(context);
1739	if (!ecap_coherent(iommu->ecap))
1740		clflush_cache_range(context, sizeof(*context));
1741
1742	/*
1743	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
1746	 * domain #0, which we have to flush:
1747	 */
1748	if (cap_caching_mode(iommu->cap)) {
1749		iommu->flush.flush_context(iommu, 0,
1750					   (((u16)bus) << 8) | devfn,
1751					   DMA_CCMD_MASK_NOBIT,
1752					   DMA_CCMD_DEVICE_INVL);
1753		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1754	} else {
1755		iommu_flush_write_buffer(iommu);
1756	}
1757
1758	ret = 0;
1759
1760out_unlock:
1761	spin_unlock(&iommu->lock);
1762
1763	return ret;
1764}
1765
1766static int domain_context_mapping_cb(struct pci_dev *pdev,
1767				     u16 alias, void *opaque)
1768{
1769	struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1770	struct intel_iommu *iommu = info->iommu;
1771	struct dmar_domain *domain = opaque;
1772
1773	return domain_context_mapping_one(domain, iommu,
1774					  PCI_BUS_NUM(alias), alias & 0xff);
1775}
1776
1777static int
1778domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1779{
1780	struct device_domain_info *info = dev_iommu_priv_get(dev);
1781	struct intel_iommu *iommu = info->iommu;
1782	u8 bus = info->bus, devfn = info->devfn;
1783
1784	if (!dev_is_pci(dev))
1785		return domain_context_mapping_one(domain, iommu, bus, devfn);
1786
1787	return pci_for_each_dma_alias(to_pci_dev(dev),
1788				      domain_context_mapping_cb, domain);
1789}
1790
1791/* Return largest possible superpage level for a given mapping */
1792static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1793				   unsigned long phy_pfn, unsigned long pages)
1794{
1795	int support, level = 1;
1796	unsigned long pfnmerge;
1797
1798	support = domain->iommu_superpage;
1799
1800	/* To use a large page, the virtual *and* physical addresses
1801	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1802	   of them will mean we have to use smaller pages. So just
1803	   merge them and check both at once. */
1804	pfnmerge = iov_pfn | phy_pfn;
1805
1806	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1807		pages >>= VTD_STRIDE_SHIFT;
1808		if (!pages)
1809			break;
1810		pfnmerge >>= VTD_STRIDE_SHIFT;
1811		level++;
1812		support--;
1813	}
1814	return level;
1815}
1816
1817/*
1818 * Ensure that old small page tables are removed to make room for superpage(s).
1819 * We're going to add new large pages, so make sure we don't remove their parent
1820 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1821 */
1822static void switch_to_super_page(struct dmar_domain *domain,
1823				 unsigned long start_pfn,
1824				 unsigned long end_pfn, int level)
1825{
1826	unsigned long lvl_pages = lvl_to_nr_pages(level);
1827	struct dma_pte *pte = NULL;
1828
1829	while (start_pfn <= end_pfn) {
1830		if (!pte)
1831			pte = pfn_to_dma_pte(domain, start_pfn, &level,
1832					     GFP_ATOMIC);
1833
1834		if (dma_pte_present(pte)) {
1835			dma_pte_free_pagetable(domain, start_pfn,
1836					       start_pfn + lvl_pages - 1,
1837					       level + 1);
1838
1839			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1840					      end_pfn << VTD_PAGE_SHIFT, 0);
1841		}
1842
1843		pte++;
1844		start_pfn += lvl_pages;
1845		if (first_pte_in_page(pte))
1846			pte = NULL;
1847	}
1848}
1849
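/*
 * Map the IOVA range [@iov_pfn, @iov_pfn + @nr_pages) to @phys_pfn with
 * the permissions in @prot, using 2MiB/1GiB superpages whenever alignment
 * and hardware support allow.
 */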
1850static int
1851__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1852		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1853		 gfp_t gfp)
1854{
1855	struct dma_pte *first_pte = NULL, *pte = NULL;
1856	unsigned int largepage_lvl = 0;
1857	unsigned long lvl_pages = 0;
1858	phys_addr_t pteval;
1859	u64 attr;
1860
1861	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1862		return -EINVAL;
1863
1864	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1865		return -EINVAL;
1866
1867	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1868		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1869		return -EINVAL;
1870	}
1871
1872	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1873	attr |= DMA_FL_PTE_PRESENT;
1874	if (domain->use_first_level) {
1875		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1876		if (prot & DMA_PTE_WRITE)
1877			attr |= DMA_FL_PTE_DIRTY;
1878	}
1879
1880	domain->has_mappings = true;
1881
1882	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1883
1884	while (nr_pages > 0) {
1885		uint64_t tmp;
1886
1887		if (!pte) {
1888			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1889					phys_pfn, nr_pages);
1890
1891			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1892					     gfp);
1893			if (!pte)
1894				return -ENOMEM;
1895			first_pte = pte;
1896
1897			lvl_pages = lvl_to_nr_pages(largepage_lvl);
1898
			/* It is a large page */
1900			if (largepage_lvl > 1) {
1901				unsigned long end_pfn;
1902				unsigned long pages_to_remove;
1903
1904				pteval |= DMA_PTE_LARGE_PAGE;
1905				pages_to_remove = min_t(unsigned long, nr_pages,
1906							nr_pte_to_next_page(pte) * lvl_pages);
1907				end_pfn = iov_pfn + pages_to_remove - 1;
1908				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1909			} else {
1910				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1911			}
1912
1913		}
		/*
		 * We don't need a lock here; nobody else touches this
		 * IOVA range.
		 */
1917		tmp = 0ULL;
1918		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1919			static int dumps = 5;
1920			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1921				iov_pfn, tmp, (unsigned long long)pteval);
1922			if (dumps) {
1923				dumps--;
1924				debug_dma_dump_mappings(NULL);
1925			}
1926			WARN_ON(1);
1927		}
1928
1929		nr_pages -= lvl_pages;
1930		iov_pfn += lvl_pages;
1931		phys_pfn += lvl_pages;
1932		pteval += lvl_pages * VTD_PAGE_SIZE;
1933
1934		/* If the next PTE would be the first in a new page, then we
1935		 * need to flush the cache on the entries we've just written.
1936		 * And then we'll need to recalculate 'pte', so clear it and
1937		 * let it get set again in the if (!pte) block above.
1938		 *
1939		 * If we're done (!nr_pages) we need to flush the cache too.
1940		 *
1941		 * Also if we've been setting superpages, we may need to
1942		 * recalculate 'pte' and switch back to smaller pages for the
1943		 * end of the mapping, if the trailing size is not enough to
1944		 * use another superpage (i.e. nr_pages < lvl_pages).
1945		 */
1946		pte++;
1947		if (!nr_pages || first_pte_in_page(pte) ||
1948		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1949			domain_flush_cache(domain, first_pte,
1950					   (void *)pte - (void *)first_pte);
1951			pte = NULL;
1952		}
1953	}
1954
1955	return 0;
1956}
1957
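/*
 * Clear the context entry for @bus/@devfn on @info's IOMMU and
 * invalidate the context cache, IOTLB and device TLB for the old
 * domain ID.
 */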
1958static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1959{
1960	struct intel_iommu *iommu = info->iommu;
1961	struct context_entry *context;
1962	u16 did_old;
1963
1964	spin_lock(&iommu->lock);
1965	context = iommu_context_addr(iommu, bus, devfn, 0);
1966	if (!context) {
1967		spin_unlock(&iommu->lock);
1968		return;
1969	}
1970
1971	did_old = context_domain_id(context);
1972
1973	context_clear_entry(context);
1974	__iommu_flush_cache(iommu, context, sizeof(*context));
1975	spin_unlock(&iommu->lock);
1976	iommu->flush.flush_context(iommu,
1977				   did_old,
1978				   (((u16)bus) << 8) | devfn,
1979				   DMA_CCMD_MASK_NOBIT,
1980				   DMA_CCMD_DEVICE_INVL);
1981
1982	iommu->flush.flush_iotlb(iommu,
1983				 did_old,
1984				 0,
1985				 0,
1986				 DMA_TLB_DSI_FLUSH);
1987
1988	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
1989}
1990
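/*
 * Set up a first-level translation PASID entry for @dev that points at
 * @domain's page table. Only 4-level and 5-level tables are accepted.
 */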
1991static int domain_setup_first_level(struct intel_iommu *iommu,
1992				    struct dmar_domain *domain,
1993				    struct device *dev,
1994				    u32 pasid)
1995{
1996	struct dma_pte *pgd = domain->pgd;
1997	int agaw, level;
1998	int flags = 0;
1999
2000	/*
	 * Skip the top levels of the page tables for an IOMMU whose
	 * agaw is smaller than the domain's. Unnecessary for PT mode.
2003	 */
2004	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005		pgd = phys_to_virt(dma_pte_addr(pgd));
2006		if (!dma_pte_present(pgd))
2007			return -ENOMEM;
2008	}
2009
2010	level = agaw_to_level(agaw);
2011	if (level != 4 && level != 5)
2012		return -EINVAL;
2013
2014	if (level == 5)
2015		flags |= PASID_FLAG_FL5LP;
2016
2017	if (domain->force_snooping)
2018		flags |= PASID_FLAG_PAGE_SNOOP;
2019
2020	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2021					     domain_id_iommu(domain, iommu),
2022					     flags);
2023}
2024
2025static bool dev_is_real_dma_subdevice(struct device *dev)
2026{
2027	return dev && dev_is_pci(dev) &&
2028	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2029}
2030
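/*
 * Identity-map the pfn range [first_vpfn, last_vpfn] in @domain so that
 * IOVA and physical address are identical, with read/write permission.
 */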
2031static int iommu_domain_identity_map(struct dmar_domain *domain,
2032				     unsigned long first_vpfn,
2033				     unsigned long last_vpfn)
2034{
2035	/*
	 * The RMRR range might overlap with a physical memory range;
	 * clear it first.
2038	 */
2039	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2040
2041	return __domain_mapping(domain, first_vpfn,
2042				first_vpfn, last_vpfn - first_vpfn + 1,
2043				DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2044}
2045
2046static int md_domain_init(struct dmar_domain *domain, int guest_width);
2047
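/*
 * Build the static identity (si) domain: identity-map all online memory
 * and the RMRR regions of devices that may use it. When hardware
 * pass-through is used (@hw != 0), no page table entries are needed.
 */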
2048static int __init si_domain_init(int hw)
2049{
2050	struct dmar_rmrr_unit *rmrr;
2051	struct device *dev;
2052	int i, nid, ret;
2053
2054	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2055	if (!si_domain)
2056		return -EFAULT;
2057
2058	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2059		domain_exit(si_domain);
2060		si_domain = NULL;
2061		return -EFAULT;
2062	}
2063
2064	if (hw)
2065		return 0;
2066
2067	for_each_online_node(nid) {
2068		unsigned long start_pfn, end_pfn;
2069		int i;
2070
2071		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2072			ret = iommu_domain_identity_map(si_domain,
2073					mm_to_dma_pfn_start(start_pfn),
2074					mm_to_dma_pfn_end(end_pfn));
2075			if (ret)
2076				return ret;
2077		}
2078	}
2079
2080	/*
2081	 * Identity map the RMRRs so that devices with RMRRs could also use
2082	 * the si_domain.
2083	 */
2084	for_each_rmrr_units(rmrr) {
2085		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2086					  i, dev) {
2087			unsigned long long start = rmrr->base_address;
2088			unsigned long long end = rmrr->end_address;
2089
2090			if (WARN_ON(end < start ||
2091				    end >> agaw_to_width(si_domain->agaw)))
2092				continue;
2093
2094			ret = iommu_domain_identity_map(si_domain,
2095					mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2096					mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2097			if (ret)
2098				return ret;
2099		}
2100	}
2101
2102	return 0;
2103}
2104
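/*
 * Attach @dev to @domain: allocate a domain ID on the device's IOMMU,
 * assign a cache tag, add the device to the domain's device list and
 * program the context entry or PASID table entry according to the
 * translation mode in use.
 */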
2105static int dmar_domain_attach_device(struct dmar_domain *domain,
2106				     struct device *dev)
2107{
2108	struct device_domain_info *info = dev_iommu_priv_get(dev);
2109	struct intel_iommu *iommu = info->iommu;
2110	unsigned long flags;
2111	int ret;
2112
2113	ret = domain_attach_iommu(domain, iommu);
2114	if (ret)
2115		return ret;
2116
2117	ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
2118	if (ret) {
2119		domain_detach_iommu(domain, iommu);
2120		return ret;
2121	}
2122
2123	info->domain = domain;
2124	spin_lock_irqsave(&domain->lock, flags);
2125	list_add(&info->link, &domain->devices);
2126	spin_unlock_irqrestore(&domain->lock, flags);
2127
2128	if (dev_is_real_dma_subdevice(dev))
2129		return 0;
2130
2131	if (!sm_supported(iommu))
2132		ret = domain_context_mapping(domain, dev);
2133	else if (hw_pass_through && domain_type_is_si(domain))
2134		ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
2135	else if (domain->use_first_level)
2136		ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID);
2137	else
2138		ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID);
2139
2140	if (ret) {
2141		device_block_translation(dev);
2142		return ret;
2143	}
2144
2145	if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2146		iommu_enable_pci_caps(info);
2147
2148	return 0;
2149}
2150
2151/**
2152 * device_rmrr_is_relaxable - Test whether the RMRR of this device
 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2154 * @dev: device handle
2155 *
2156 * We assume that PCI USB devices with RMRRs have them largely
2157 * for historical reasons and that the RMRR space is not actively used post
2158 * boot.  This exclusion may change if vendors begin to abuse it.
2159 *
2160 * The same exception is made for graphics devices, with the requirement that
2161 * any use of the RMRR regions will be torn down before assigning the device
2162 * to a guest.
2163 *
2164 * Return: true if the RMRR is relaxable, false otherwise
2165 */
2166static bool device_rmrr_is_relaxable(struct device *dev)
2167{
2168	struct pci_dev *pdev;
2169
2170	if (!dev_is_pci(dev))
2171		return false;
2172
2173	pdev = to_pci_dev(dev);
2174	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2175		return true;
2176	else
2177		return false;
2178}
2179
2180/*
2181 * Return the required default domain type for a specific device.
2182 *
2183 * @dev: the device in query
 * @dev: the device in question
 *
 * Returns:
 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2190 */
2191static int device_def_domain_type(struct device *dev)
2192{
2193	if (dev_is_pci(dev)) {
2194		struct pci_dev *pdev = to_pci_dev(dev);
2195
2196		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2197			return IOMMU_DOMAIN_IDENTITY;
2198	}
2199
2200	return 0;
2201}
2202
2203static void intel_iommu_init_qi(struct intel_iommu *iommu)
2204{
2205	/*
	 * Start from a sane IOMMU hardware state.
	 * If queued invalidation has already been initialized by us
	 * (for example, while enabling interrupt remapping), then
	 * things are already rolling from a sane state.
2210	 */
2211	if (!iommu->qi) {
2212		/*
2213		 * Clear any previous faults.
2214		 */
2215		dmar_fault(-1, iommu);
2216		/*
2217		 * Disable queued invalidation if supported and already enabled
2218		 * before OS handover.
2219		 */
2220		dmar_disable_qi(iommu);
2221	}
2222
2223	if (dmar_enable_qi(iommu)) {
		/*
		 * Queued invalidation is not enabled; fall back to
		 * register-based invalidation.
		 */
2227		iommu->flush.flush_context = __iommu_flush_context;
2228		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2229		pr_info("%s: Using Register based invalidation\n",
2230			iommu->name);
2231	} else {
2232		iommu->flush.flush_context = qi_flush_context;
2233		iommu->flush.flush_iotlb = qi_flush_iotlb;
2234		pr_info("%s: Using Queued invalidation\n", iommu->name);
2235	}
2236}
2237
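/*
 * Copy the context entries of one bus from the previous kernel's tables
 * (referenced by @old_re) into newly allocated context tables. In
 * scalable (extended) mode each bus is backed by two context tables,
 * addressed through the lower and upper context table pointers.
 */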
2238static int copy_context_table(struct intel_iommu *iommu,
2239			      struct root_entry *old_re,
2240			      struct context_entry **tbl,
2241			      int bus, bool ext)
2242{
2243	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2244	struct context_entry *new_ce = NULL, ce;
2245	struct context_entry *old_ce = NULL;
2246	struct root_entry re;
2247	phys_addr_t old_ce_phys;
2248
2249	tbl_idx = ext ? bus * 2 : bus;
2250	memcpy(&re, old_re, sizeof(re));
2251
2252	for (devfn = 0; devfn < 256; devfn++) {
2253		/* First calculate the correct index */
2254		idx = (ext ? devfn * 2 : devfn) % 256;
2255
2256		if (idx == 0) {
2257			/* First save what we may have and clean up */
2258			if (new_ce) {
2259				tbl[tbl_idx] = new_ce;
2260				__iommu_flush_cache(iommu, new_ce,
2261						    VTD_PAGE_SIZE);
2262				pos = 1;
2263			}
2264
2265			if (old_ce)
2266				memunmap(old_ce);
2267
2268			ret = 0;
2269			if (devfn < 0x80)
2270				old_ce_phys = root_entry_lctp(&re);
2271			else
2272				old_ce_phys = root_entry_uctp(&re);
2273
2274			if (!old_ce_phys) {
2275				if (ext && devfn == 0) {
2276					/* No LCTP, try UCTP */
2277					devfn = 0x7f;
2278					continue;
2279				} else {
2280					goto out;
2281				}
2282			}
2283
2284			ret = -ENOMEM;
2285			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2286					MEMREMAP_WB);
2287			if (!old_ce)
2288				goto out;
2289
2290			new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
2291			if (!new_ce)
2292				goto out_unmap;
2293
2294			ret = 0;
2295		}
2296
2297		/* Now copy the context entry */
2298		memcpy(&ce, old_ce + idx, sizeof(ce));
2299
2300		if (!context_present(&ce))
2301			continue;
2302
2303		did = context_domain_id(&ce);
2304		if (did >= 0 && did < cap_ndoms(iommu->cap))
2305			set_bit(did, iommu->domain_ids);
2306
2307		set_context_copied(iommu, bus, devfn);
2308		new_ce[idx] = ce;
2309	}
2310
2311	tbl[tbl_idx + pos] = new_ce;
2312
2313	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2314
2315out_unmap:
2316	memunmap(old_ce);
2317
2318out:
2319	return ret;
2320}
2321
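/*
 * Take over the translation structures left enabled by the previous
 * kernel (the kdump case): copy its context tables and hook them into
 * this kernel's root entry table so that in-flight DMA keeps working.
 */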
2322static int copy_translation_tables(struct intel_iommu *iommu)
2323{
2324	struct context_entry **ctxt_tbls;
2325	struct root_entry *old_rt;
2326	phys_addr_t old_rt_phys;
2327	int ctxt_table_entries;
2328	u64 rtaddr_reg;
2329	int bus, ret;
2330	bool new_ext, ext;
2331
2332	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2333	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2334	new_ext    = !!sm_supported(iommu);
2335
2336	/*
2337	 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation would open a window for data
2339	 * corruption. So bail out and don't copy anything if we would
2340	 * have to change the bit.
2341	 */
2342	if (new_ext != ext)
2343		return -EINVAL;
2344
2345	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2346	if (!iommu->copied_tables)
2347		return -ENOMEM;
2348
2349	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2350	if (!old_rt_phys)
2351		return -EINVAL;
2352
2353	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2354	if (!old_rt)
2355		return -ENOMEM;
2356
2357	/* This is too big for the stack - allocate it from slab */
2358	ctxt_table_entries = ext ? 512 : 256;
2359	ret = -ENOMEM;
2360	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2361	if (!ctxt_tbls)
2362		goto out_unmap;
2363
2364	for (bus = 0; bus < 256; bus++) {
2365		ret = copy_context_table(iommu, &old_rt[bus],
2366					 ctxt_tbls, bus, ext);
2367		if (ret) {
2368			pr_err("%s: Failed to copy context table for bus %d\n",
2369				iommu->name, bus);
2370			continue;
2371		}
2372	}
2373
2374	spin_lock(&iommu->lock);
2375
2376	/* Context tables are copied, now write them to the root_entry table */
2377	for (bus = 0; bus < 256; bus++) {
2378		int idx = ext ? bus * 2 : bus;
2379		u64 val;
2380
2381		if (ctxt_tbls[idx]) {
2382			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2383			iommu->root_entry[bus].lo = val;
2384		}
2385
2386		if (!ext || !ctxt_tbls[idx + 1])
2387			continue;
2388
2389		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2390		iommu->root_entry[bus].hi = val;
2391	}
2392
2393	spin_unlock(&iommu->lock);
2394
2395	kfree(ctxt_tbls);
2396
2397	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2398
2399	ret = 0;
2400
2401out_unmap:
2402	memunmap(old_rt);
2403
2404	return ret;
2405}
2406
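/*
 * One-time initialization of all DMAR units: set up invalidation and
 * domain IDs, allocate root entries, optionally copy translation tables
 * from the previous kernel, build the si_domain and enable fault
 * reporting on every IOMMU.
 */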
2407static int __init init_dmars(void)
2408{
2409	struct dmar_drhd_unit *drhd;
2410	struct intel_iommu *iommu;
2411	int ret;
2412
2413	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2414	if (ret)
2415		goto free_iommu;
2416
2417	for_each_iommu(iommu, drhd) {
2418		if (drhd->ignored) {
2419			iommu_disable_translation(iommu);
2420			continue;
2421		}
2422
2423		/*
		 * Find the max pasid size of all IOMMUs in the system.
2425		 * We need to ensure the system pasid table is no bigger
2426		 * than the smallest supported.
2427		 */
2428		if (pasid_supported(iommu)) {
2429			u32 temp = 2 << ecap_pss(iommu->ecap);
2430
2431			intel_pasid_max_id = min_t(u32, temp,
2432						   intel_pasid_max_id);
2433		}
2434
2435		intel_iommu_init_qi(iommu);
2436
2437		ret = iommu_init_domains(iommu);
2438		if (ret)
2439			goto free_iommu;
2440
2441		init_translation_status(iommu);
2442
2443		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2444			iommu_disable_translation(iommu);
2445			clear_translation_pre_enabled(iommu);
2446			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2447				iommu->name);
2448		}
2449
2450		/*
2451		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMUs. Needs to be split out later.
2454		 */
2455		ret = iommu_alloc_root_entry(iommu);
2456		if (ret)
2457			goto free_iommu;
2458
2459		if (translation_pre_enabled(iommu)) {
2460			pr_info("Translation already enabled - trying to copy translation structures\n");
2461
2462			ret = copy_translation_tables(iommu);
2463			if (ret) {
2464				/*
2465				 * We found the IOMMU with translation
2466				 * enabled - but failed to copy over the
2467				 * old root-entry table. Try to proceed
2468				 * by disabling translation now and
2469				 * allocating a clean root-entry table.
2470				 * This might cause DMAR faults, but
2471				 * probably the dump will still succeed.
2472				 */
2473				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2474				       iommu->name);
2475				iommu_disable_translation(iommu);
2476				clear_translation_pre_enabled(iommu);
2477			} else {
2478				pr_info("Copied translation tables from previous kernel for %s\n",
2479					iommu->name);
2480			}
2481		}
2482
2483		if (!ecap_pass_through(iommu->ecap))
2484			hw_pass_through = 0;
2485		intel_svm_check(iommu);
2486	}
2487
2488	/*
2489	 * Now that qi is enabled on all iommus, set the root entry and flush
2490	 * caches. This is required on some Intel X58 chipsets, otherwise the
2491	 * flush_context function will loop forever and the boot hangs.
2492	 */
2493	for_each_active_iommu(iommu, drhd) {
2494		iommu_flush_write_buffer(iommu);
2495		iommu_set_root_entry(iommu);
2496	}
2497
2498	check_tylersburg_isoch();
2499
2500	ret = si_domain_init(hw_pass_through);
2501	if (ret)
2502		goto free_iommu;
2503
	/*
	 * For each DRHD unit: flush the write buffer, enable the page
	 * request queue where supported and set up fault reporting.
	 * Translation itself is enabled later, from intel_iommu_init().
	 */
2511	for_each_iommu(iommu, drhd) {
2512		if (drhd->ignored) {
2513			/*
2514			 * we always have to disable PMRs or DMA may fail on
2515			 * this device
2516			 */
2517			if (force_on)
2518				iommu_disable_protect_mem_regions(iommu);
2519			continue;
2520		}
2521
2522		iommu_flush_write_buffer(iommu);
2523
2524#ifdef CONFIG_INTEL_IOMMU_SVM
2525		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2526			/*
			 * Calling dmar_alloc_hwirq() with dmar_global_lock
			 * held could cause a lock race, so drop the lock
			 * around intel_svm_enable_prq().
2529			 */
2530			up_write(&dmar_global_lock);
2531			ret = intel_svm_enable_prq(iommu);
2532			down_write(&dmar_global_lock);
2533			if (ret)
2534				goto free_iommu;
2535		}
2536#endif
2537		ret = dmar_set_interrupt(iommu);
2538		if (ret)
2539			goto free_iommu;
2540	}
2541
2542	return 0;
2543
2544free_iommu:
2545	for_each_active_iommu(iommu, drhd) {
2546		disable_dmar_iommu(iommu);
2547		free_dmar_iommu(iommu);
2548	}
2549	if (si_domain) {
2550		domain_exit(si_domain);
2551		si_domain = NULL;
2552	}
2553
2554	return ret;
2555}
2556
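/*
 * Ignore DMAR units whose device scope is empty, and mark units that
 * cover only graphics devices so they can be bypassed when the graphics
 * IOMMU is disabled.
 */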
2557static void __init init_no_remapping_devices(void)
2558{
2559	struct dmar_drhd_unit *drhd;
2560	struct device *dev;
2561	int i;
2562
2563	for_each_drhd_unit(drhd) {
2564		if (!drhd->include_all) {
2565			for_each_active_dev_scope(drhd->devices,
2566						  drhd->devices_cnt, i, dev)
2567				break;
2568			/* ignore DMAR unit if no devices exist */
2569			if (i == drhd->devices_cnt)
2570				drhd->ignored = 1;
2571		}
2572	}
2573
2574	for_each_active_drhd_unit(drhd) {
2575		if (drhd->include_all)
2576			continue;
2577
2578		for_each_active_dev_scope(drhd->devices,
2579					  drhd->devices_cnt, i, dev)
2580			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2581				break;
2582		if (i < drhd->devices_cnt)
2583			continue;
2584
		/*
		 * This IOMMU has *only* gfx devices. Mark it as dedicated
		 * to graphics and bypass it entirely if the graphics IOMMU
		 * has been disabled.
		 */
2587		drhd->gfx_dedicated = 1;
2588		if (disable_igfx_iommu)
2589			drhd->ignored = 1;
2590	}
2591}
2592
2593#ifdef CONFIG_SUSPEND
2594static int init_iommu_hw(void)
2595{
2596	struct dmar_drhd_unit *drhd;
2597	struct intel_iommu *iommu = NULL;
2598	int ret;
2599
2600	for_each_active_iommu(iommu, drhd) {
2601		if (iommu->qi) {
2602			ret = dmar_reenable_qi(iommu);
2603			if (ret)
2604				return ret;
2605		}
2606	}
2607
2608	for_each_iommu(iommu, drhd) {
2609		if (drhd->ignored) {
2610			/*
2611			 * we always have to disable PMRs or DMA may fail on
2612			 * this device
2613			 */
2614			if (force_on)
2615				iommu_disable_protect_mem_regions(iommu);
2616			continue;
2617		}
2618
2619		iommu_flush_write_buffer(iommu);
2620		iommu_set_root_entry(iommu);
2621		iommu_enable_translation(iommu);
2622		iommu_disable_protect_mem_regions(iommu);
2623	}
2624
2625	return 0;
2626}
2627
2628static void iommu_flush_all(void)
2629{
2630	struct dmar_drhd_unit *drhd;
2631	struct intel_iommu *iommu;
2632
2633	for_each_active_iommu(iommu, drhd) {
2634		iommu->flush.flush_context(iommu, 0, 0, 0,
2635					   DMA_CCMD_GLOBAL_INVL);
2636		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2637					 DMA_TLB_GLOBAL_FLUSH);
2638	}
2639}
2640
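/*
 * System suspend: flush all caches, disable translation and save the
 * fault-event registers of each active IOMMU for restoration on resume.
 */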
2641static int iommu_suspend(void)
2642{
2643	struct dmar_drhd_unit *drhd;
2644	struct intel_iommu *iommu = NULL;
2645	unsigned long flag;
2646
2647	iommu_flush_all();
2648
2649	for_each_active_iommu(iommu, drhd) {
2650		iommu_disable_translation(iommu);
2651
2652		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2653
2654		iommu->iommu_state[SR_DMAR_FECTL_REG] =
2655			readl(iommu->reg + DMAR_FECTL_REG);
2656		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2657			readl(iommu->reg + DMAR_FEDATA_REG);
2658		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2659			readl(iommu->reg + DMAR_FEADDR_REG);
2660		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2661			readl(iommu->reg + DMAR_FEUADDR_REG);
2662
2663		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2664	}
2665	return 0;
2666}
2667
2668static void iommu_resume(void)
2669{
2670	struct dmar_drhd_unit *drhd;
2671	struct intel_iommu *iommu = NULL;
2672	unsigned long flag;
2673
2674	if (init_iommu_hw()) {
2675		if (force_on)
2676			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2677		else
2678			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2679		return;
2680	}
2681
2682	for_each_active_iommu(iommu, drhd) {
2683
2684		raw_spin_lock_irqsave(&iommu->register_lock, flag);
2685
2686		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2687			iommu->reg + DMAR_FECTL_REG);
2688		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2689			iommu->reg + DMAR_FEDATA_REG);
2690		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2691			iommu->reg + DMAR_FEADDR_REG);
2692		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2693			iommu->reg + DMAR_FEUADDR_REG);
2694
2695		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2696	}
2697}
2698
2699static struct syscore_ops iommu_syscore_ops = {
2700	.resume		= iommu_resume,
2701	.suspend	= iommu_suspend,
2702};
2703
2704static void __init init_iommu_pm_ops(void)
2705{
2706	register_syscore_ops(&iommu_syscore_ops);
2707}
2708
2709#else
2710static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
2712
2713static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2714{
2715	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2716	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2717	    rmrr->end_address <= rmrr->base_address ||
2718	    arch_rmrr_sanity_check(rmrr))
2719		return -EINVAL;
2720
2721	return 0;
2722}
2723
2724int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2725{
2726	struct acpi_dmar_reserved_memory *rmrr;
2727	struct dmar_rmrr_unit *rmrru;
2728
2729	rmrr = (struct acpi_dmar_reserved_memory *)header;
2730	if (rmrr_sanity_check(rmrr)) {
2731		pr_warn(FW_BUG
2732			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2733			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2734			   rmrr->base_address, rmrr->end_address,
2735			   dmi_get_system_info(DMI_BIOS_VENDOR),
2736			   dmi_get_system_info(DMI_BIOS_VERSION),
2737			   dmi_get_system_info(DMI_PRODUCT_VERSION));
2738		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2739	}
2740
2741	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2742	if (!rmrru)
2743		goto out;
2744
2745	rmrru->hdr = header;
2746
2747	rmrru->base_address = rmrr->base_address;
2748	rmrru->end_address = rmrr->end_address;
2749
2750	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2751				((void *)rmrr) + rmrr->header.length,
2752				&rmrru->devices_cnt);
2753	if (rmrru->devices_cnt && rmrru->devices == NULL)
2754		goto free_rmrru;
2755
2756	list_add(&rmrru->list, &dmar_rmrr_units);
2757
2758	return 0;
2759free_rmrru:
2760	kfree(rmrru);
2761out:
2762	return -ENOMEM;
2763}
2764
2765static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2766{
2767	struct dmar_atsr_unit *atsru;
2768	struct acpi_dmar_atsr *tmp;
2769
2770	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2771				dmar_rcu_check()) {
2772		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2773		if (atsr->segment != tmp->segment)
2774			continue;
2775		if (atsr->header.length != tmp->header.length)
2776			continue;
2777		if (memcmp(atsr, tmp, atsr->header.length) == 0)
2778			return atsru;
2779	}
2780
2781	return NULL;
2782}
2783
2784int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2785{
2786	struct acpi_dmar_atsr *atsr;
2787	struct dmar_atsr_unit *atsru;
2788
2789	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2790		return 0;
2791
2792	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2793	atsru = dmar_find_atsr(atsr);
2794	if (atsru)
2795		return 0;
2796
2797	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2798	if (!atsru)
2799		return -ENOMEM;
2800
2801	/*
2802	 * If memory is allocated from slab by ACPI _DSM method, we need to
2803	 * copy the memory content because the memory buffer will be freed
2804	 * on return.
2805	 */
2806	atsru->hdr = (void *)(atsru + 1);
2807	memcpy(atsru->hdr, hdr, hdr->length);
2808	atsru->include_all = atsr->flags & 0x1;
2809	if (!atsru->include_all) {
2810		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2811				(void *)atsr + atsr->header.length,
2812				&atsru->devices_cnt);
2813		if (atsru->devices_cnt && atsru->devices == NULL) {
2814			kfree(atsru);
2815			return -ENOMEM;
2816		}
2817	}
2818
2819	list_add_rcu(&atsru->list, &dmar_atsr_units);
2820
2821	return 0;
2822}
2823
2824static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2825{
2826	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2827	kfree(atsru);
2828}
2829
2830int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2831{
2832	struct acpi_dmar_atsr *atsr;
2833	struct dmar_atsr_unit *atsru;
2834
2835	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2836	atsru = dmar_find_atsr(atsr);
2837	if (atsru) {
2838		list_del_rcu(&atsru->list);
2839		synchronize_rcu();
2840		intel_iommu_free_atsr(atsru);
2841	}
2842
2843	return 0;
2844}
2845
2846int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2847{
2848	int i;
2849	struct device *dev;
2850	struct acpi_dmar_atsr *atsr;
2851	struct dmar_atsr_unit *atsru;
2852
2853	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2854	atsru = dmar_find_atsr(atsr);
2855	if (!atsru)
2856		return 0;
2857
2858	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2859		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2860					  i, dev)
2861			return -EBUSY;
2862	}
2863
2864	return 0;
2865}
2866
2867static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2868{
2869	struct dmar_satc_unit *satcu;
2870	struct acpi_dmar_satc *tmp;
2871
2872	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2873				dmar_rcu_check()) {
2874		tmp = (struct acpi_dmar_satc *)satcu->hdr;
2875		if (satc->segment != tmp->segment)
2876			continue;
2877		if (satc->header.length != tmp->header.length)
2878			continue;
2879		if (memcmp(satc, tmp, satc->header.length) == 0)
2880			return satcu;
2881	}
2882
2883	return NULL;
2884}
2885
2886int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2887{
2888	struct acpi_dmar_satc *satc;
2889	struct dmar_satc_unit *satcu;
2890
2891	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2892		return 0;
2893
2894	satc = container_of(hdr, struct acpi_dmar_satc, header);
2895	satcu = dmar_find_satc(satc);
2896	if (satcu)
2897		return 0;
2898
2899	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2900	if (!satcu)
2901		return -ENOMEM;
2902
2903	satcu->hdr = (void *)(satcu + 1);
2904	memcpy(satcu->hdr, hdr, hdr->length);
2905	satcu->atc_required = satc->flags & 0x1;
2906	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2907					      (void *)satc + satc->header.length,
2908					      &satcu->devices_cnt);
2909	if (satcu->devices_cnt && !satcu->devices) {
2910		kfree(satcu);
2911		return -ENOMEM;
2912	}
2913	list_add_rcu(&satcu->list, &dmar_satc_units);
2914
2915	return 0;
2916}
2917
2918static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2919{
2920	int sp, ret;
2921	struct intel_iommu *iommu = dmaru->iommu;
2922
2923	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2924	if (ret)
2925		goto out;
2926
2927	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
2928		pr_warn("%s: Doesn't support hardware pass through.\n",
2929			iommu->name);
2930		return -ENXIO;
2931	}
2932
2933	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
2934	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
2935		pr_warn("%s: Doesn't support large page.\n",
2936			iommu->name);
2937		return -ENXIO;
2938	}
2939
2940	/*
2941	 * Disable translation if already enabled prior to OS handover.
2942	 */
2943	if (iommu->gcmd & DMA_GCMD_TE)
2944		iommu_disable_translation(iommu);
2945
2946	ret = iommu_init_domains(iommu);
2947	if (ret == 0)
2948		ret = iommu_alloc_root_entry(iommu);
2949	if (ret)
2950		goto out;
2951
2952	intel_svm_check(iommu);
2953
2954	if (dmaru->ignored) {
2955		/*
2956		 * we always have to disable PMRs or DMA may fail on this device
2957		 */
2958		if (force_on)
2959			iommu_disable_protect_mem_regions(iommu);
2960		return 0;
2961	}
2962
2963	intel_iommu_init_qi(iommu);
2964	iommu_flush_write_buffer(iommu);
2965
2966#ifdef CONFIG_INTEL_IOMMU_SVM
2967	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2968		ret = intel_svm_enable_prq(iommu);
2969		if (ret)
2970			goto disable_iommu;
2971	}
2972#endif
2973	ret = dmar_set_interrupt(iommu);
2974	if (ret)
2975		goto disable_iommu;
2976
2977	iommu_set_root_entry(iommu);
2978	iommu_enable_translation(iommu);
2979
2980	iommu_disable_protect_mem_regions(iommu);
2981	return 0;
2982
2983disable_iommu:
2984	disable_dmar_iommu(iommu);
2985out:
2986	free_dmar_iommu(iommu);
2987	return ret;
2988}
2989
2990int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2991{
2992	int ret = 0;
2993	struct intel_iommu *iommu = dmaru->iommu;
2994
2995	if (!intel_iommu_enabled)
2996		return 0;
2997	if (iommu == NULL)
2998		return -EINVAL;
2999
3000	if (insert) {
3001		ret = intel_iommu_add(dmaru);
3002	} else {
3003		disable_dmar_iommu(iommu);
3004		free_dmar_iommu(iommu);
3005	}
3006
3007	return ret;
3008}
3009
3010static void intel_iommu_free_dmars(void)
3011{
3012	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3013	struct dmar_atsr_unit *atsru, *atsr_n;
3014	struct dmar_satc_unit *satcu, *satc_n;
3015
3016	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3017		list_del(&rmrru->list);
3018		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3019		kfree(rmrru);
3020	}
3021
3022	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3023		list_del(&atsru->list);
3024		intel_iommu_free_atsr(atsru);
3025	}
3026	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3027		list_del(&satcu->list);
3028		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3029		kfree(satcu);
3030	}
3031}
3032
3033static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3034{
3035	struct dmar_satc_unit *satcu;
3036	struct acpi_dmar_satc *satc;
3037	struct device *tmp;
3038	int i;
3039
3040	dev = pci_physfn(dev);
3041	rcu_read_lock();
3042
3043	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3044		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3045		if (satc->segment != pci_domain_nr(dev->bus))
3046			continue;
3047		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3048			if (to_pci_dev(tmp) == dev)
3049				goto out;
3050	}
3051	satcu = NULL;
3052out:
3053	rcu_read_unlock();
3054	return satcu;
3055}
3056
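/*
 * Decide whether ATS may be enabled for @dev: prefer the SATC table if
 * the device is listed there, otherwise walk up to the root port and
 * match it against the ATSR units of the device's PCI segment.
 */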
3057static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3058{
3059	int i, ret = 1;
3060	struct pci_bus *bus;
3061	struct pci_dev *bridge = NULL;
3062	struct device *tmp;
3063	struct acpi_dmar_atsr *atsr;
3064	struct dmar_atsr_unit *atsru;
3065	struct dmar_satc_unit *satcu;
3066
3067	dev = pci_physfn(dev);
3068	satcu = dmar_find_matched_satc_unit(dev);
3069	if (satcu)
3070		/*
		 * This device supports ATS as it is in the SATC table.
		 * When the IOMMU is in legacy mode, enabling ATS is done
		 * automatically by HW for devices that require ATS, so
		 * the OS should not enable ATS on the device, to avoid
		 * duplicated TLB invalidations.
3076		 */
3077		return !(satcu->atc_required && !sm_supported(iommu));
3078
3079	for (bus = dev->bus; bus; bus = bus->parent) {
3080		bridge = bus->self;
3081		/* If it's an integrated device, allow ATS */
3082		if (!bridge)
3083			return 1;
3084		/* Connected via non-PCIe: no ATS */
3085		if (!pci_is_pcie(bridge) ||
3086		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3087			return 0;
3088		/* If we found the root port, look it up in the ATSR */
3089		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3090			break;
3091	}
3092
3093	rcu_read_lock();
3094	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3095		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3096		if (atsr->segment != pci_domain_nr(dev->bus))
3097			continue;
3098
3099		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3100			if (tmp == &bridge->dev)
3101				goto out;
3102
3103		if (atsru->include_all)
3104			goto out;
3105	}
3106	ret = 0;
3107out:
3108	rcu_read_unlock();
3109
3110	return ret;
3111}
3112
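/*
 * PCI bus notifier: keep the device scopes of the RMRR, ATSR and SATC
 * units up to date as devices are added to or removed from the bus.
 */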
3113int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3114{
3115	int ret;
3116	struct dmar_rmrr_unit *rmrru;
3117	struct dmar_atsr_unit *atsru;
3118	struct dmar_satc_unit *satcu;
3119	struct acpi_dmar_atsr *atsr;
3120	struct acpi_dmar_reserved_memory *rmrr;
3121	struct acpi_dmar_satc *satc;
3122
3123	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3124		return 0;
3125
3126	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3127		rmrr = container_of(rmrru->hdr,
3128				    struct acpi_dmar_reserved_memory, header);
3129		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3130			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3131				((void *)rmrr) + rmrr->header.length,
3132				rmrr->segment, rmrru->devices,
3133				rmrru->devices_cnt);
3134			if (ret < 0)
3135				return ret;
3136		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3137			dmar_remove_dev_scope(info, rmrr->segment,
3138				rmrru->devices, rmrru->devices_cnt);
3139		}
3140	}
3141
3142	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3143		if (atsru->include_all)
3144			continue;
3145
3146		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3147		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3148			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3149					(void *)atsr + atsr->header.length,
3150					atsr->segment, atsru->devices,
3151					atsru->devices_cnt);
3152			if (ret > 0)
3153				break;
3154			else if (ret < 0)
3155				return ret;
3156		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3157			if (dmar_remove_dev_scope(info, atsr->segment,
3158					atsru->devices, atsru->devices_cnt))
3159				break;
3160		}
3161	}
3162	list_for_each_entry(satcu, &dmar_satc_units, list) {
3163		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3164		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3165			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3166					(void *)satc + satc->header.length,
3167					satc->segment, satcu->devices,
3168					satcu->devices_cnt);
3169			if (ret > 0)
3170				break;
3171			else if (ret < 0)
3172				return ret;
3173		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3174			if (dmar_remove_dev_scope(info, satc->segment,
3175					satcu->devices, satcu->devices_cnt))
3176				break;
3177		}
3178	}
3179
3180	return 0;
3181}
3182
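/*
 * Memory hotplug notifier: extend the si_domain identity map when memory
 * goes online and tear the mappings down again when it goes offline.
 */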
3183static int intel_iommu_memory_notifier(struct notifier_block *nb,
3184				       unsigned long val, void *v)
3185{
3186	struct memory_notify *mhp = v;
3187	unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3188	unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3189			mhp->nr_pages - 1);
3190
3191	switch (val) {
3192	case MEM_GOING_ONLINE:
3193		if (iommu_domain_identity_map(si_domain,
3194					      start_vpfn, last_vpfn)) {
3195			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3196				start_vpfn, last_vpfn);
3197			return NOTIFY_BAD;
3198		}
3199		break;
3200
3201	case MEM_OFFLINE:
3202	case MEM_CANCEL_ONLINE:
3203		{
3204			LIST_HEAD(freelist);
3205
3206			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3207			iommu_put_pages_list(&freelist);
3208		}
3209		break;
3210	}
3211
3212	return NOTIFY_OK;
3213}
3214
3215static struct notifier_block intel_iommu_memory_nb = {
3216	.notifier_call = intel_iommu_memory_notifier,
3217	.priority = 0
3218};
3219
3220static void intel_disable_iommus(void)
3221{
3222	struct intel_iommu *iommu = NULL;
3223	struct dmar_drhd_unit *drhd;
3224
3225	for_each_iommu(iommu, drhd)
3226		iommu_disable_translation(iommu);
3227}
3228
3229void intel_iommu_shutdown(void)
3230{
3231	struct dmar_drhd_unit *drhd;
3232	struct intel_iommu *iommu = NULL;
3233
3234	if (no_iommu || dmar_disabled)
3235		return;
3236
3237	down_write(&dmar_global_lock);
3238
3239	/* Disable PMRs explicitly here. */
3240	for_each_iommu(iommu, drhd)
3241		iommu_disable_protect_mem_regions(iommu);
3242
3243	/* Make sure the IOMMUs are switched off */
3244	intel_disable_iommus();
3245
3246	up_write(&dmar_global_lock);
3247}
3248
3249static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3250{
3251	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3252
3253	return container_of(iommu_dev, struct intel_iommu, iommu);
3254}
3255
3256static ssize_t version_show(struct device *dev,
3257			    struct device_attribute *attr, char *buf)
3258{
3259	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3260	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3261	return sysfs_emit(buf, "%d:%d\n",
3262			  DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3263}
3264static DEVICE_ATTR_RO(version);
3265
3266static ssize_t address_show(struct device *dev,
3267			    struct device_attribute *attr, char *buf)
3268{
3269	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3270	return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3271}
3272static DEVICE_ATTR_RO(address);
3273
3274static ssize_t cap_show(struct device *dev,
3275			struct device_attribute *attr, char *buf)
3276{
3277	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3278	return sysfs_emit(buf, "%llx\n", iommu->cap);
3279}
3280static DEVICE_ATTR_RO(cap);
3281
3282static ssize_t ecap_show(struct device *dev,
3283			 struct device_attribute *attr, char *buf)
3284{
3285	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3286	return sysfs_emit(buf, "%llx\n", iommu->ecap);
3287}
3288static DEVICE_ATTR_RO(ecap);
3289
3290static ssize_t domains_supported_show(struct device *dev,
3291				      struct device_attribute *attr, char *buf)
3292{
3293	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3294	return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3295}
3296static DEVICE_ATTR_RO(domains_supported);
3297
3298static ssize_t domains_used_show(struct device *dev,
3299				 struct device_attribute *attr, char *buf)
3300{
3301	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3302	return sysfs_emit(buf, "%d\n",
3303			  bitmap_weight(iommu->domain_ids,
3304					cap_ndoms(iommu->cap)));
3305}
3306static DEVICE_ATTR_RO(domains_used);
3307
3308static struct attribute *intel_iommu_attrs[] = {
3309	&dev_attr_version.attr,
3310	&dev_attr_address.attr,
3311	&dev_attr_cap.attr,
3312	&dev_attr_ecap.attr,
3313	&dev_attr_domains_supported.attr,
3314	&dev_attr_domains_used.attr,
3315	NULL,
3316};
3317
3318static struct attribute_group intel_iommu_group = {
3319	.name = "intel-iommu",
3320	.attrs = intel_iommu_attrs,
3321};
3322
3323const struct attribute_group *intel_iommu_groups[] = {
3324	&intel_iommu_group,
3325	NULL,
3326};
3327
3328static bool has_external_pci(void)
3329{
3330	struct pci_dev *pdev = NULL;
3331
3332	for_each_pci_dev(pdev)
3333		if (pdev->external_facing) {
3334			pci_dev_put(pdev);
3335			return true;
3336		}
3337
3338	return false;
3339}
3340
3341static int __init platform_optin_force_iommu(void)
3342{
3343	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3344		return 0;
3345
3346	if (no_iommu || dmar_disabled)
3347		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3348
3349	/*
3350	 * If Intel-IOMMU is disabled by default, we will apply identity
3351	 * map for all devices except those marked as being untrusted.
3352	 */
3353	if (dmar_disabled)
3354		iommu_set_default_passthrough(false);
3355
3356	dmar_disabled = 0;
3357	no_iommu = 0;
3358
3359	return 1;
3360}
3361
3362static int __init probe_acpi_namespace_devices(void)
3363{
3364	struct dmar_drhd_unit *drhd;
3365	/* To avoid a -Wunused-but-set-variable warning. */
3366	struct intel_iommu *iommu __maybe_unused;
3367	struct device *dev;
3368	int i, ret = 0;
3369
3370	for_each_active_iommu(iommu, drhd) {
3371		for_each_active_dev_scope(drhd->devices,
3372					  drhd->devices_cnt, i, dev) {
3373			struct acpi_device_physical_node *pn;
3374			struct acpi_device *adev;
3375
3376			if (dev->bus != &acpi_bus_type)
3377				continue;
3378
3379			adev = to_acpi_device(dev);
3380			mutex_lock(&adev->physical_node_lock);
3381			list_for_each_entry(pn,
3382					    &adev->physical_node_list, node) {
3383				ret = iommu_probe_device(pn->dev);
3384				if (ret)
3385					break;
3386			}
3387			mutex_unlock(&adev->physical_node_lock);
3388
3389			if (ret)
3390				return ret;
3391		}
3392	}
3393
3394	return 0;
3395}
3396
3397static __init int tboot_force_iommu(void)
3398{
3399	if (!tboot_enabled())
3400		return 0;
3401
3402	if (no_iommu || dmar_disabled)
3403		pr_warn("Forcing Intel-IOMMU to enabled\n");
3404
3405	dmar_disabled = 0;
3406	no_iommu = 0;
3407
3408	return 1;
3409}
3410
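/*
 * Entry point for VT-d initialization: parse the DMAR table, initialize
 * all DMAR units via init_dmars(), register the IOMMUs with the IOMMU
 * core and sysfs, and finally enable DMA translation.
 */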
3411int __init intel_iommu_init(void)
3412{
3413	int ret = -ENODEV;
3414	struct dmar_drhd_unit *drhd;
3415	struct intel_iommu *iommu;
3416
3417	/*
3418	 * Intel IOMMU is required for a TXT/tboot launch or platform
3419	 * opt in, so enforce that.
3420	 */
3421	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3422		    platform_optin_force_iommu();
3423
3424	down_write(&dmar_global_lock);
3425	if (dmar_table_init()) {
3426		if (force_on)
3427			panic("tboot: Failed to initialize DMAR table\n");
3428		goto out_free_dmar;
3429	}
3430
3431	if (dmar_dev_scope_init() < 0) {
3432		if (force_on)
3433			panic("tboot: Failed to initialize DMAR device scope\n");
3434		goto out_free_dmar;
3435	}
3436
3437	up_write(&dmar_global_lock);
3438
3439	/*
	 * The bus notifier takes the dmar_global_lock, so lockdep will
	 * complain if we register it under the lock.
3442	 */
3443	dmar_register_bus_notifier();
3444
3445	down_write(&dmar_global_lock);
3446
3447	if (!no_iommu)
3448		intel_iommu_debugfs_init();
3449
3450	if (no_iommu || dmar_disabled) {
3451		/*
		 * We exit here to ensure the IOMMU's remapping and mempool
		 * aren't set up, which means that the IOMMU's PMRs won't be
		 * disabled via the call to init_dmars(). So disable them
		 * explicitly here. The PMRs were set up by tboot prior to
		 * calling SENTER, but the kernel is expected to reset/tear
		 * down the PMRs.
3458		 */
3459		if (intel_iommu_tboot_noforce) {
3460			for_each_iommu(iommu, drhd)
3461				iommu_disable_protect_mem_regions(iommu);
3462		}
3463
3464		/*
3465		 * Make sure the IOMMUs are switched off, even when we
3466		 * boot into a kexec kernel and the previous kernel left
3467		 * them enabled
3468		 */
3469		intel_disable_iommus();
3470		goto out_free_dmar;
3471	}
3472
3473	if (list_empty(&dmar_rmrr_units))
3474		pr_info("No RMRR found\n");
3475
3476	if (list_empty(&dmar_atsr_units))
3477		pr_info("No ATSR found\n");
3478
3479	if (list_empty(&dmar_satc_units))
3480		pr_info("No SATC found\n");
3481
3482	init_no_remapping_devices();
3483
3484	ret = init_dmars();
3485	if (ret) {
3486		if (force_on)
3487			panic("tboot: Failed to initialize DMARs\n");
3488		pr_err("Initialization failed\n");
3489		goto out_free_dmar;
3490	}
3491	up_write(&dmar_global_lock);
3492
3493	init_iommu_pm_ops();
3494
3495	down_read(&dmar_global_lock);
3496	for_each_active_iommu(iommu, drhd) {
3497		/*
3498		 * The flush queue implementation does not perform
3499		 * page-selective invalidations that are required for efficient
3500		 * TLB flushes in virtual environments.  The benefit of batching
3501		 * is likely to be much lower than the overhead of synchronizing
3502		 * the virtual and physical IOMMU page-tables.
3503		 */
3504		if (cap_caching_mode(iommu->cap) &&
3505		    !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3506			pr_info_once("IOMMU batching disallowed due to virtualization\n");
3507			iommu_set_dma_strict();
3508		}
3509		iommu_device_sysfs_add(&iommu->iommu, NULL,
3510				       intel_iommu_groups,
3511				       "%s", iommu->name);
3512		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3513
3514		iommu_pmu_register(iommu);
3515	}
3516	up_read(&dmar_global_lock);
3517
3518	if (si_domain && !hw_pass_through)
3519		register_memory_notifier(&intel_iommu_memory_nb);
3520
3521	down_read(&dmar_global_lock);
3522	if (probe_acpi_namespace_devices())
3523		pr_warn("ACPI name space devices didn't probe correctly\n");
3524
3525	/* Finally, we enable the DMA remapping hardware. */
3526	for_each_iommu(iommu, drhd) {
3527		if (!drhd->ignored && !translation_pre_enabled(iommu))
3528			iommu_enable_translation(iommu);
3529
3530		iommu_disable_protect_mem_regions(iommu);
3531	}
3532	up_read(&dmar_global_lock);
3533
3534	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3535
3536	intel_iommu_enabled = 1;
3537
3538	return 0;
3539
3540out_free_dmar:
3541	intel_iommu_free_dmars();
3542	up_write(&dmar_global_lock);
3543	return ret;
3544}
3545
3546static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3547{
3548	struct device_domain_info *info = opaque;
3549
3550	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3551	return 0;
3552}
3553
3554/*
3555 * NB - intel-iommu lacks any sort of reference counting for the users of
3556 * dependent devices.  If multiple endpoints have intersecting dependent
3557 * devices, unbinding the driver from any one of them will possibly leave
3558 * the others unable to operate.
3559 */
3560static void domain_context_clear(struct device_domain_info *info)
3561{
	if (!dev_is_pci(info->dev)) {
		domain_context_clear_one(info, info->bus, info->devfn);
		return;
	}

	pci_for_each_dma_alias(to_pci_dev(info->dev),
			       &domain_context_clear_one_cb, info);
3567}
3568
3569/*
3570 * Clear the page table pointer in context or pasid table entries so that
3571 * all DMA requests without PASID from the device are blocked. If the page
3572 * table has been set, clean up the data structures.
3573 */
3574void device_block_translation(struct device *dev)
3575{
3576	struct device_domain_info *info = dev_iommu_priv_get(dev);
3577	struct intel_iommu *iommu = info->iommu;
3578	unsigned long flags;
3579
3580	iommu_disable_pci_caps(info);
3581	if (!dev_is_real_dma_subdevice(dev)) {
3582		if (sm_supported(iommu))
3583			intel_pasid_tear_down_entry(iommu, dev,
3584						    IOMMU_NO_PASID, false);
3585		else
3586			domain_context_clear(info);
3587	}
3588
3589	if (!info->domain)
3590		return;
3591
3592	spin_lock_irqsave(&info->domain->lock, flags);
3593	list_del(&info->link);
3594	spin_unlock_irqrestore(&info->domain->lock, flags);
3595
3596	cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3597	domain_detach_iommu(info->domain, iommu);
3598	info->domain = NULL;
3599}
3600
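/*
 * Finish initializing a dmar_domain for the given guest address width
 * and allocate its top-level page directory.
 */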
3601static int md_domain_init(struct dmar_domain *domain, int guest_width)
3602{
3603	int adjust_width;
3604
3605	/* calculate AGAW */
3606	domain->gaw = guest_width;
3607	adjust_width = guestwidth_to_adjustwidth(guest_width);
3608	domain->agaw = width_to_agaw(adjust_width);
3609
3610	domain->iommu_coherency = false;
3611	domain->iommu_superpage = 0;
3612	domain->max_addr = 0;
3613
3614	/* always allocate the top pgd */
3615	domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
3616	if (!domain->pgd)
3617		return -ENOMEM;
3618	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3619	return 0;
3620}
3621
3622static int blocking_domain_attach_dev(struct iommu_domain *domain,
3623				      struct device *dev)
3624{
3625	device_block_translation(dev);
3626	return 0;
3627}
3628
3629static struct iommu_domain blocking_domain = {
3630	.type = IOMMU_DOMAIN_BLOCKED,
3631	.ops = &(const struct iommu_domain_ops) {
3632		.attach_dev	= blocking_domain_attach_dev,
3633	}
3634};
3635
3636static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3637{
3638	struct dmar_domain *dmar_domain;
3639	struct iommu_domain *domain;
3640
3641	switch (type) {
3642	case IOMMU_DOMAIN_DMA:
3643	case IOMMU_DOMAIN_UNMANAGED:
3644		dmar_domain = alloc_domain(type);
3645		if (!dmar_domain) {
3646			pr_err("Can't allocate dmar_domain\n");
3647			return NULL;
3648		}
3649		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3650			pr_err("Domain initialization failed\n");
3651			domain_exit(dmar_domain);
3652			return NULL;
3653		}
3654
3655		domain = &dmar_domain->domain;
3656		domain->geometry.aperture_start = 0;
3657		domain->geometry.aperture_end   =
3658				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
3659		domain->geometry.force_aperture = true;
3660
3661		return domain;
3662	case IOMMU_DOMAIN_IDENTITY:
3663		return &si_domain->domain;
3664	default:
3665		return NULL;
3666	}
3667
3668	return NULL;
3669}
3670
3671static struct iommu_domain *
3672intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3673			      struct iommu_domain *parent,
3674			      const struct iommu_user_data *user_data)
3675{
3676	struct device_domain_info *info = dev_iommu_priv_get(dev);
3677	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3678	bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3679	struct intel_iommu *iommu = info->iommu;
3680	struct dmar_domain *dmar_domain;
3681	struct iommu_domain *domain;
3682
3683	/* Must be NESTING domain */
3684	if (parent) {
3685		if (!nested_supported(iommu) || flags)
3686			return ERR_PTR(-EOPNOTSUPP);
3687		return intel_nested_domain_alloc(parent, user_data);
3688	}
3689
3690	if (flags &
3691	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3692		return ERR_PTR(-EOPNOTSUPP);
3693	if (nested_parent && !nested_supported(iommu))
3694		return ERR_PTR(-EOPNOTSUPP);
3695	if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3696		return ERR_PTR(-EOPNOTSUPP);
3697
3698	/*
	/*
	 * The domain_alloc_user op needs to fully initialize a domain
	 * before returning, so use iommu_domain_alloc() here for
	 * simplicity.
	 */
3702	domain = iommu_domain_alloc(dev->bus);
3703	if (!domain)
3704		return ERR_PTR(-ENOMEM);
3705
3706	dmar_domain = to_dmar_domain(domain);
3707
3708	if (nested_parent) {
3709		dmar_domain->nested_parent = true;
3710		INIT_LIST_HEAD(&dmar_domain->s1_domains);
3711		spin_lock_init(&dmar_domain->s1_lock);
3712	}
3713
3714	if (dirty_tracking) {
3715		if (dmar_domain->use_first_level) {
3716			iommu_domain_free(domain);
3717			return ERR_PTR(-EOPNOTSUPP);
3718		}
3719		domain->dirty_ops = &intel_dirty_ops;
3720	}
3721
3722	return domain;
3723}
3724
3725static void intel_iommu_domain_free(struct iommu_domain *domain)
3726{
3727	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3728
3729	WARN_ON(dmar_domain->nested_parent &&
3730		!list_empty(&dmar_domain->s1_domains));
3731	if (domain != &si_domain->domain)
3732		domain_exit(dmar_domain);
3733}
3734
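/*
 * Verify that @dev's IOMMU is compatible with @domain (snoop control,
 * dirty tracking, address width) and trim the domain's page table to an
 * AGAW the IOMMU supports before the actual attach.
 */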
3735int prepare_domain_attach_device(struct iommu_domain *domain,
3736				 struct device *dev)
3737{
3738	struct device_domain_info *info = dev_iommu_priv_get(dev);
3739	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3740	struct intel_iommu *iommu = info->iommu;
3741	int addr_width;
3742
3743	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3744		return -EINVAL;
3745
3746	if (domain->dirty_ops && !ssads_supported(iommu))
3747		return -EINVAL;
3748
3749	/* check if this iommu agaw is sufficient for max mapped address */
3750	addr_width = agaw_to_width(iommu->agaw);
3751	if (addr_width > cap_mgaw(iommu->cap))
3752		addr_width = cap_mgaw(iommu->cap);
3753
3754	if (dmar_domain->max_addr > (1LL << addr_width))
3755		return -EINVAL;
3756	dmar_domain->gaw = addr_width;
3757
3758	/*
3759	 * Knock out extra levels of page tables if necessary
3760	 */
3761	while (iommu->agaw < dmar_domain->agaw) {
3762		struct dma_pte *pte;
3763
3764		pte = dmar_domain->pgd;
3765		if (dma_pte_present(pte)) {
3766			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
3767			iommu_free_page(pte);
3768		}
3769		dmar_domain->agaw--;
3770	}
3771
3772	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3773	    context_copied(iommu, info->bus, info->devfn))
3774		return intel_pasid_setup_sm_context(dev);
3775
3776	return 0;
3777}
3778
3779static int intel_iommu_attach_device(struct iommu_domain *domain,
3780				     struct device *dev)
3781{
3782	struct device_domain_info *info = dev_iommu_priv_get(dev);
3783	int ret;
3784
3785	if (info->domain)
3786		device_block_translation(dev);
3787
3788	ret = prepare_domain_attach_device(domain, dev);
3789	if (ret)
3790		return ret;
3791
3792	return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3793}
3794
3795static int intel_iommu_map(struct iommu_domain *domain,
3796			   unsigned long iova, phys_addr_t hpa,
3797			   size_t size, int iommu_prot, gfp_t gfp)
3798{
3799	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3800	u64 max_addr;
3801	int prot = 0;
3802
3803	if (iommu_prot & IOMMU_READ)
3804		prot |= DMA_PTE_READ;
3805	if (iommu_prot & IOMMU_WRITE)
3806		prot |= DMA_PTE_WRITE;
3807	if (dmar_domain->set_pte_snp)
3808		prot |= DMA_PTE_SNP;
3809
3810	max_addr = iova + size;
3811	if (dmar_domain->max_addr < max_addr) {
3812		u64 end;
3813
3814		/* check if minimum agaw is sufficient for mapped address */
3815		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3816		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
3819			       __func__, dmar_domain->gaw, max_addr);
3820			return -EFAULT;
3821		}
3822		dmar_domain->max_addr = max_addr;
3823	}
3824	/* Round up size to next multiple of PAGE_SIZE, if it and
3825	   the low bits of hpa would take us onto the next page */
3826	size = aligned_nrpages(hpa, size);
3827	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3828				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3829}
3830
3831static int intel_iommu_map_pages(struct iommu_domain *domain,
3832				 unsigned long iova, phys_addr_t paddr,
3833				 size_t pgsize, size_t pgcount,
3834				 int prot, gfp_t gfp, size_t *mapped)
3835{
3836	unsigned long pgshift = __ffs(pgsize);
3837	size_t size = pgcount << pgshift;
3838	int ret;
3839
3840	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3841		return -EINVAL;
3842
3843	if (!IS_ALIGNED(iova | paddr, pgsize))
3844		return -EINVAL;
3845
3846	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3847	if (!ret && mapped)
3848		*mapped = size;
3849
3850	return ret;
3851}
3852
3853static size_t intel_iommu_unmap(struct iommu_domain *domain,
3854				unsigned long iova, size_t size,
3855				struct iommu_iotlb_gather *gather)
3856{
3857	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3858	unsigned long start_pfn, last_pfn;
3859	int level = 0;
3860
3861	/* Cope with horrid API which requires us to unmap more than the
3862	   size argument if it happens to be a large-page mapping. */
3863	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3864				     &level, GFP_ATOMIC)))
3865		return 0;
3866
3867	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3868		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3869
3870	start_pfn = iova >> VTD_PAGE_SHIFT;
3871	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3872
3873	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3874
3875	if (dmar_domain->max_addr == iova + size)
3876		dmar_domain->max_addr = iova;
3877
3878	/*
3879	 * We do not use page-selective IOTLB invalidation in flush queue,
3880	 * so there is no need to track page and sync iotlb.
3881	 */
3882	if (!iommu_iotlb_gather_queued(gather))
3883		iommu_iotlb_gather_add_page(domain, gather, iova, size);
3884
3885	return size;
3886}
3887
3888static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3889				      unsigned long iova,
3890				      size_t pgsize, size_t pgcount,
3891				      struct iommu_iotlb_gather *gather)
3892{
3893	unsigned long pgshift = __ffs(pgsize);
3894	size_t size = pgcount << pgshift;
3895
3896	return intel_iommu_unmap(domain, iova, size, gather);
3897}
3898
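/*
 * Flush the caches for the IOVA range accumulated in @gather and
 * release the page-table pages queued on its freelist.
 */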
3899static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3900				 struct iommu_iotlb_gather *gather)
3901{
3902	cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3903			      gather->end, list_empty(&gather->freelist));
3904	iommu_put_pages_list(&gather->freelist);
3905}
3906
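/*
 * Walk the domain's page table and return the host physical address
 * that @iova translates to, or 0 if no present PTE covers it.
 */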
3907static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3908					    dma_addr_t iova)
3909{
3910	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3911	struct dma_pte *pte;
3912	int level = 0;
3913	u64 phys = 0;
3914
3915	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3916			     GFP_ATOMIC);
3917	if (pte && dma_pte_present(pte))
3918		phys = dma_pte_addr(pte) +
3919			(iova & (BIT_MASK(level_to_offset_bits(level) +
3920						VTD_PAGE_SHIFT) - 1));
3921
3922	return phys;
3923}
3924
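/*
 * Return true only if every IOMMU with a device attached to this domain
 * supports snoop control. The caller must hold domain->lock.
 */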
3925static bool domain_support_force_snooping(struct dmar_domain *domain)
3926{
3927	struct device_domain_info *info;
3928	bool support = true;
3929
3930	assert_spin_locked(&domain->lock);
3931	list_for_each_entry(info, &domain->devices, link) {
3932		if (!ecap_sc_support(info->iommu->ecap)) {
3933			support = false;
3934			break;
3935		}
3936	}
3937
3938	return support;
3939}
3940
3941static void domain_set_force_snooping(struct dmar_domain *domain)
3942{
3943	struct device_domain_info *info;
3944
3945	assert_spin_locked(&domain->lock);
3946	/*
3947	 * The second-level page table supports per-PTE snoop control, so
3948	 * the iommu_map() interface handles this by setting the SNP bit.
3949	 */
3950	if (!domain->use_first_level) {
3951		domain->set_pte_snp = true;
3952		return;
3953	}
3954
3955	list_for_each_entry(info, &domain->devices, link)
3956		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3957						     IOMMU_NO_PASID);
3958}
3959
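/*
 * Upgrade @domain to force-snooping if every attached IOMMU supports
 * snoop control. For second-level page tables this must happen before
 * any mapping is created, since existing PTEs would lack the SNP bit.
 */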
3960static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3961{
3962	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3963	unsigned long flags;
3964
3965	if (dmar_domain->force_snooping)
3966		return true;
3967
3968	spin_lock_irqsave(&dmar_domain->lock, flags);
3969	if (!domain_support_force_snooping(dmar_domain) ||
3970	    (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3971		spin_unlock_irqrestore(&dmar_domain->lock, flags);
3972		return false;
3973	}
3974
3975	domain_set_force_snooping(dmar_domain);
3976	dmar_domain->force_snooping = true;
3977	spin_unlock_irqrestore(&dmar_domain->lock, flags);
3978
3979	return true;
3980}
3981
3982static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3983{
3984	struct device_domain_info *info = dev_iommu_priv_get(dev);
3985
3986	switch (cap) {
3987	case IOMMU_CAP_CACHE_COHERENCY:
3988	case IOMMU_CAP_DEFERRED_FLUSH:
3989		return true;
3990	case IOMMU_CAP_PRE_BOOT_PROTECTION:
3991		return dmar_platform_optin();
3992	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3993		return ecap_sc_support(info->iommu->ecap);
3994	case IOMMU_CAP_DIRTY_TRACKING:
3995		return ssads_supported(info->iommu);
3996	default:
3997		return false;
3998	}
3999}
4000
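/*
 * Per-device probe: look up the IOMMU that serves @dev, allocate its
 * device_domain_info, record ATS/PASID/PRI capabilities, and set up the
 * PASID table when the IOMMU is in scalable mode.
 */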
4001static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4002{
4003	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4004	struct device_domain_info *info;
4005	struct intel_iommu *iommu;
4006	u8 bus, devfn;
4007	int ret;
4008
4009	iommu = device_lookup_iommu(dev, &bus, &devfn);
4010	if (!iommu || !iommu->iommu.ops)
4011		return ERR_PTR(-ENODEV);
4012
4013	info = kzalloc(sizeof(*info), GFP_KERNEL);
4014	if (!info)
4015		return ERR_PTR(-ENOMEM);
4016
4017	if (dev_is_real_dma_subdevice(dev)) {
4018		info->bus = pdev->bus->number;
4019		info->devfn = pdev->devfn;
4020		info->segment = pci_domain_nr(pdev->bus);
4021	} else {
4022		info->bus = bus;
4023		info->devfn = devfn;
4024		info->segment = iommu->segment;
4025	}
4026
4027	info->dev = dev;
4028	info->iommu = iommu;
4029	if (dev_is_pci(dev)) {
4030		if (ecap_dev_iotlb_support(iommu->ecap) &&
4031		    pci_ats_supported(pdev) &&
4032		    dmar_ats_supported(pdev, iommu)) {
4033			info->ats_supported = 1;
4034			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4035
4036			/*
4037			 * For an IOMMU that supports device IOTLB throttling (DIT),
4038			 * assign the PFSID in a VF's invalidation descriptors so
4039			 * that the hardware can gauge queue depth at the PF level.
4040			 * If DIT is not supported, the PFSID field is treated as
4041			 * reserved and must be set to 0.
4042			 */
4043			if (ecap_dit(iommu->ecap))
4044				info->pfsid = pci_dev_id(pci_physfn(pdev));
4045			info->ats_qdep = pci_ats_queue_depth(pdev);
4046		}
4047		if (sm_supported(iommu)) {
4048			if (pasid_supported(iommu)) {
4049				int features = pci_pasid_features(pdev);
4050
4051				if (features >= 0)
4052					info->pasid_supported = features | 1;
4053			}
4054
4055			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4056			    pci_pri_supported(pdev))
4057				info->pri_supported = 1;
4058		}
4059	}
4060
4061	dev_iommu_priv_set(dev, info);
4062	if (pdev && pci_ats_supported(pdev)) {
4063		ret = device_rbtree_insert(iommu, info);
4064		if (ret)
4065			goto free;
4066	}
4067
4068	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4069		ret = intel_pasid_alloc_table(dev);
4070		if (ret) {
4071			dev_err(dev, "PASID table allocation failed\n");
4072			goto clear_rbtree;
4073		}
4074
4075		if (!context_copied(iommu, info->bus, info->devfn)) {
4076			ret = intel_pasid_setup_sm_context(dev);
4077			if (ret)
4078				goto free_table;
4079		}
4080	}
4081
4082	intel_iommu_debugfs_create_dev(info);
4083
4084	return &iommu->iommu;
4085free_table:
4086	intel_pasid_free_table(dev);
4087clear_rbtree:
4088	device_rbtree_remove(info);
4089free:
4090	kfree(info);
4091
4092	return ERR_PTR(ret);
4093}
4094
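/*
 * Undo intel_iommu_probe_device(): remove the device from the RID
 * lookup tree, tear down its scalable-mode PASID state, and free the
 * per-device info.
 */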
4095static void intel_iommu_release_device(struct device *dev)
4096{
4097	struct device_domain_info *info = dev_iommu_priv_get(dev);
4098	struct intel_iommu *iommu = info->iommu;
4099
4100	mutex_lock(&iommu->iopf_lock);
4101	if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
4102		device_rbtree_remove(info);
4103	mutex_unlock(&iommu->iopf_lock);
4104
4105	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
4106	    !context_copied(iommu, info->bus, info->devfn))
4107		intel_pasid_teardown_sm_context(dev);
4108
4109	intel_pasid_free_table(dev);
4110	intel_iommu_debugfs_remove_dev(info);
4111	kfree(info);
4112	set_dma_ops(dev, NULL);
4113}
4114
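/*
 * Report the reserved regions for @device: any RMRRs that target it (or
 * a PCI bridge above it), the ISA bridge floppy workaround window when
 * configured, and the IOAPIC MSI range.
 */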
4115static void intel_iommu_get_resv_regions(struct device *device,
4116					 struct list_head *head)
4117{
4118	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4119	struct iommu_resv_region *reg;
4120	struct dmar_rmrr_unit *rmrr;
4121	struct device *i_dev;
4122	int i;
4123
4124	rcu_read_lock();
4125	for_each_rmrr_units(rmrr) {
4126		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4127					  i, i_dev) {
4128			struct iommu_resv_region *resv;
4129			enum iommu_resv_type type;
4130			size_t length;
4131
4132			if (i_dev != device &&
4133			    !is_downstream_to_pci_bridge(device, i_dev))
4134				continue;
4135
4136			length = rmrr->end_address - rmrr->base_address + 1;
4137
4138			type = device_rmrr_is_relaxable(device) ?
4139				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4140
4141			resv = iommu_alloc_resv_region(rmrr->base_address,
4142						       length, prot, type,
4143						       GFP_ATOMIC);
4144			if (!resv)
4145				break;
4146
4147			list_add_tail(&resv->list, head);
4148		}
4149	}
4150	rcu_read_unlock();
4151
4152#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4153	if (dev_is_pci(device)) {
4154		struct pci_dev *pdev = to_pci_dev(device);
4155
4156		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4157			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4158					IOMMU_RESV_DIRECT_RELAXABLE,
4159					GFP_KERNEL);
4160			if (reg)
4161				list_add_tail(&reg->list, head);
4162		}
4163	}
4164#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4165
4166	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4167				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4168				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4169	if (!reg)
4170		return;
4171	list_add_tail(&reg->list, head);
4172}
4173
4174static struct iommu_group *intel_iommu_device_group(struct device *dev)
4175{
4176	if (dev_is_pci(dev))
4177		return pci_device_group(dev);
4178	return generic_device_group(dev);
4179}
4180
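/*
 * Check that @dev is ready for SVA: the IOMMU must be SVM capable and
 * PASID/ATS must already be enabled; PRI, if the device supports it,
 * must be enabled as well.
 */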
4181static int intel_iommu_enable_sva(struct device *dev)
4182{
4183	struct device_domain_info *info = dev_iommu_priv_get(dev);
4184	struct intel_iommu *iommu;
4185
4186	if (!info || dmar_disabled)
4187		return -EINVAL;
4188
4189	iommu = info->iommu;
4190	if (!iommu)
4191		return -EINVAL;
4192
4193	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4194		return -ENODEV;
4195
4196	if (!info->pasid_enabled || !info->ats_enabled)
4197		return -EINVAL;
4198
4199	/*
4200	 * Devices having device-specific I/O fault handling should not
4201	 * support PCI/PRI. The IOMMU side has no means to check the
4202	 * capability of device-specific IOPF. Therefore, the IOMMU can only
4203	 * assume that if the device driver enables SVA on a non-PRI
4204	 * device, it will handle IOPF in its own way.
4205	 */
4206	if (!info->pri_supported)
4207		return 0;
4208
4209	/* Devices supporting PRI should have it enabled. */
4210	if (!info->pri_enabled)
4211		return -EINVAL;
4212
4213	return 0;
4214}
4215
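/*
 * Enable I/O page faults for @dev: reset and enable PCI/PRI and
 * register the device with its IOMMU's page fault queue.
 */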
4216static int intel_iommu_enable_iopf(struct device *dev)
4217{
4218	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4219	struct device_domain_info *info = dev_iommu_priv_get(dev);
4220	struct intel_iommu *iommu;
4221	int ret;
4222
4223	if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4224		return -ENODEV;
4225
4226	if (info->pri_enabled)
4227		return -EBUSY;
4228
4229	iommu = info->iommu;
4230	if (!iommu)
4231		return -EINVAL;
4232
4233	/* PASID is required in PRG Response Message. */
4234	if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4235		return -EINVAL;
4236
4237	ret = pci_reset_pri(pdev);
4238	if (ret)
4239		return ret;
4240
4241	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4242	if (ret)
4243		return ret;
4244
4245	ret = pci_enable_pri(pdev, PRQ_DEPTH);
4246	if (ret) {
4247		iopf_queue_remove_device(iommu->iopf_queue, dev);
4248		return ret;
4249	}
4250
4251	info->pri_enabled = 1;
4252
4253	return 0;
4254}
4255
4256static int intel_iommu_disable_iopf(struct device *dev)
4257{
4258	struct device_domain_info *info = dev_iommu_priv_get(dev);
4259	struct intel_iommu *iommu = info->iommu;
4260
4261	if (!info->pri_enabled)
4262		return -EINVAL;
4263
4264	/*
4265	 * The PCIe spec states that by clearing the PRI enable bit, the Page
4266	 * Request Interface will not issue new page requests, but may still
4267	 * have outstanding page requests that have been transmitted or are
4268	 * queued for transmission. This is supposed to be called after
4269	 * the device driver has stopped DMA, all PASIDs have been
4270	 * unbound and the outstanding PRQs have been drained.
4271	 */
4272	pci_disable_pri(to_pci_dev(dev));
4273	info->pri_enabled = 0;
4274	iopf_queue_remove_device(iommu->iopf_queue, dev);
4275
4276	return 0;
4277}
4278
4279static int
4280intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4281{
4282	switch (feat) {
4283	case IOMMU_DEV_FEAT_IOPF:
4284		return intel_iommu_enable_iopf(dev);
4285
4286	case IOMMU_DEV_FEAT_SVA:
4287		return intel_iommu_enable_sva(dev);
4288
4289	default:
4290		return -ENODEV;
4291	}
4292}
4293
4294static int
4295intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4296{
4297	switch (feat) {
4298	case IOMMU_DEV_FEAT_IOPF:
4299		return intel_iommu_disable_iopf(dev);
4300
4301	case IOMMU_DEV_FEAT_SVA:
4302		return 0;
4303
4304	default:
4305		return -ENODEV;
4306	}
4307}
4308
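/*
 * Defer the domain attach if the IOMMU already had translation enabled
 * when the kernel took over (e.g. kdump) and the device has not yet
 * been attached to a domain.
 */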
4309static bool intel_iommu_is_attach_deferred(struct device *dev)
4310{
4311	struct device_domain_info *info = dev_iommu_priv_get(dev);
4312
4313	return translation_pre_enabled(info->iommu) && !info->domain;
4314}
4315
4316/*
4317 * Check whether the device lives on an external-facing PCI port that is
4318 * marked as untrusted. Such devices must not have quirks applied, since
4319 * quirks could otherwise be used to bypass the IOMMU restrictions.
4320 */
4321static bool risky_device(struct pci_dev *pdev)
4322{
4323	if (pdev->untrusted) {
4324		pci_info(pdev,
4325			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4326			 pdev->vendor, pdev->device);
4327		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4328		return true;
4329	}
4330	return false;
4331}
4332
4333static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4334				      unsigned long iova, size_t size)
4335{
4336	cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4337
4338	return 0;
4339}
4340
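/*
 * Detach @pasid of @dev from @domain: remove it from the domain's PASID
 * list, release the cache tag and the IOMMU reference, tear down the
 * PASID table entry and drain any pending page requests.
 */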
4341static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4342					 struct iommu_domain *domain)
4343{
4344	struct device_domain_info *info = dev_iommu_priv_get(dev);
4345	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4346	struct dev_pasid_info *curr, *dev_pasid = NULL;
4347	struct intel_iommu *iommu = info->iommu;
4348	unsigned long flags;
4349
4350	spin_lock_irqsave(&dmar_domain->lock, flags);
4351	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4352		if (curr->dev == dev && curr->pasid == pasid) {
4353			list_del(&curr->link_domain);
4354			dev_pasid = curr;
4355			break;
4356		}
4357	}
4358	WARN_ON_ONCE(!dev_pasid);
4359	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4360
4361	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4362	domain_detach_iommu(dmar_domain, iommu);
4363	intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4364	kfree(dev_pasid);
4365	intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4366	intel_drain_pasid_prq(dev, pasid);
4367}
4368
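/*
 * Attach @domain to @pasid of @dev. Depending on the domain type this
 * sets up a pass-through, first-level or second-level PASID entry and
 * tracks the attachment on the domain's dev_pasids list.
 */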
4369static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4370				     struct device *dev, ioasid_t pasid)
4371{
4372	struct device_domain_info *info = dev_iommu_priv_get(dev);
4373	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4374	struct intel_iommu *iommu = info->iommu;
4375	struct dev_pasid_info *dev_pasid;
4376	unsigned long flags;
4377	int ret;
4378
4379	if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4380		return -EOPNOTSUPP;
4381
4382	if (domain->dirty_ops)
4383		return -EINVAL;
4384
4385	if (context_copied(iommu, info->bus, info->devfn))
4386		return -EBUSY;
4387
4388	ret = prepare_domain_attach_device(domain, dev);
4389	if (ret)
4390		return ret;
4391
4392	dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4393	if (!dev_pasid)
4394		return -ENOMEM;
4395
4396	ret = domain_attach_iommu(dmar_domain, iommu);
4397	if (ret)
4398		goto out_free;
4399
4400	ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4401	if (ret)
4402		goto out_detach_iommu;
4403
4404	if (domain_type_is_si(dmar_domain))
4405		ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4406	else if (dmar_domain->use_first_level)
4407		ret = domain_setup_first_level(iommu, dmar_domain,
4408					       dev, pasid);
4409	else
4410		ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4411						     dev, pasid);
4412	if (ret)
4413		goto out_unassign_tag;
4414
4415	dev_pasid->dev = dev;
4416	dev_pasid->pasid = pasid;
4417	spin_lock_irqsave(&dmar_domain->lock, flags);
4418	list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4419	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4420
4421	if (domain->type & __IOMMU_DOMAIN_PAGING)
4422		intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4423
4424	return 0;
4425out_unassign_tag:
4426	cache_tag_unassign_domain(dmar_domain, dev, pasid);
4427out_detach_iommu:
4428	domain_detach_iommu(dmar_domain, iommu);
4429out_free:
4430	kfree(dev_pasid);
4431	return ret;
4432}
4433
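/*
 * Export the raw capability and extended capability registers of the
 * IOMMU backing @dev for user space consumers such as IOMMUFD.
 */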
4434static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4435{
4436	struct device_domain_info *info = dev_iommu_priv_get(dev);
4437	struct intel_iommu *iommu = info->iommu;
4438	struct iommu_hw_info_vtd *vtd;
4439
4440	vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4441	if (!vtd)
4442		return ERR_PTR(-ENOMEM);
4443
4444	vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4445	vtd->cap_reg = iommu->cap;
4446	vtd->ecap_reg = iommu->ecap;
4447	*length = sizeof(*vtd);
4448	*type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4449	return vtd;
4450}
4451
4452/*
4453 * Set dirty tracking for the device list of a domain. The caller must
4454 * hold the domain->lock when calling it.
4455 */
4456static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4457{
4458	struct device_domain_info *info;
4459	int ret = 0;
4460
4461	list_for_each_entry(info, devices, link) {
4462		ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4463						       IOMMU_NO_PASID, enable);
4464		if (ret)
4465			break;
4466	}
4467
4468	return ret;
4469}
4470
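/*
 * Propagate a dirty-tracking change on a nested parent domain to the
 * devices of all its first-stage child domains, restoring the previous
 * state on failure.
 */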
4471static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4472					    bool enable)
4473{
4474	struct dmar_domain *s1_domain;
4475	unsigned long flags;
4476	int ret;
4477
4478	spin_lock(&domain->s1_lock);
4479	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4480		spin_lock_irqsave(&s1_domain->lock, flags);
4481		ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4482		spin_unlock_irqrestore(&s1_domain->lock, flags);
4483		if (ret)
4484			goto err_unwind;
4485	}
4486	spin_unlock(&domain->s1_lock);
4487	return 0;
4488
4489err_unwind:
4490	list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4491		spin_lock_irqsave(&s1_domain->lock, flags);
4492		device_set_dirty_tracking(&s1_domain->devices,
4493					  domain->dirty_tracking);
4494		spin_unlock_irqrestore(&s1_domain->lock, flags);
4495	}
4496	spin_unlock(&domain->s1_lock);
4497	return ret;
4498}
4499
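/*
 * Switch dirty tracking on or off for every device attached to the
 * domain and, for a nested parent, for all of its child domains.
 */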
4500static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4501					  bool enable)
4502{
4503	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4504	int ret;
4505
4506	spin_lock(&dmar_domain->lock);
4507	if (dmar_domain->dirty_tracking == enable)
4508		goto out_unlock;
4509
4510	ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4511	if (ret)
4512		goto err_unwind;
4513
4514	if (dmar_domain->nested_parent) {
4515		ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4516		if (ret)
4517			goto err_unwind;
4518	}
4519
4520	dmar_domain->dirty_tracking = enable;
4521out_unlock:
4522	spin_unlock(&dmar_domain->lock);
4523
4524	return 0;
4525
4526err_unwind:
4527	device_set_dirty_tracking(&dmar_domain->devices,
4528				  dmar_domain->dirty_tracking);
4529	spin_unlock(&dmar_domain->lock);
4530	return ret;
4531}
4532
4533static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4534					    unsigned long iova, size_t size,
4535					    unsigned long flags,
4536					    struct iommu_dirty_bitmap *dirty)
4537{
4538	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4539	unsigned long end = iova + size - 1;
4540	unsigned long pgsize;
4541
4542	/*
4543	 * The IOMMUFD core calls into a dirty-tracking-disabled domain without
4544	 * an IOVA bitmap set in order to clear any dirty bits left in the PTEs
4545	 * when dirty tracking was stopped. This ensures that we never inherit
4546	 * dirtied bits from a previous cycle.
4547	 */
4548	if (!dmar_domain->dirty_tracking && dirty->bitmap)
4549		return -EINVAL;
4550
4551	do {
4552		struct dma_pte *pte;
4553		int lvl = 0;
4554
4555		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4556				     GFP_ATOMIC);
4557		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4558		if (!pte || !dma_pte_present(pte)) {
4559			iova += pgsize;
4560			continue;
4561		}
4562
4563		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4564			iommu_dirty_bitmap_record(dirty, iova, pgsize);
4565		iova += pgsize;
4566	} while (iova < end);
4567
4568	return 0;
4569}
4570
4571static const struct iommu_dirty_ops intel_dirty_ops = {
4572	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
4573	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4574};
4575
4576const struct iommu_ops intel_iommu_ops = {
4577	.blocked_domain		= &blocking_domain,
4578	.release_domain		= &blocking_domain,
4579	.capable		= intel_iommu_capable,
4580	.hw_info		= intel_iommu_hw_info,
4581	.domain_alloc		= intel_iommu_domain_alloc,
4582	.domain_alloc_user	= intel_iommu_domain_alloc_user,
4583	.domain_alloc_sva	= intel_svm_domain_alloc,
4584	.probe_device		= intel_iommu_probe_device,
4585	.release_device		= intel_iommu_release_device,
4586	.get_resv_regions	= intel_iommu_get_resv_regions,
4587	.device_group		= intel_iommu_device_group,
4588	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4589	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4590	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4591	.def_domain_type	= device_def_domain_type,
4592	.remove_dev_pasid	= intel_iommu_remove_dev_pasid,
4593	.pgsize_bitmap		= SZ_4K,
4594#ifdef CONFIG_INTEL_IOMMU_SVM
4595	.page_response		= intel_svm_page_response,
4596#endif
4597	.default_domain_ops = &(const struct iommu_domain_ops) {
4598		.attach_dev		= intel_iommu_attach_device,
4599		.set_dev_pasid		= intel_iommu_set_dev_pasid,
4600		.map_pages		= intel_iommu_map_pages,
4601		.unmap_pages		= intel_iommu_unmap_pages,
4602		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4603		.flush_iotlb_all        = intel_flush_iotlb_all,
4604		.iotlb_sync		= intel_iommu_tlb_sync,
4605		.iova_to_phys		= intel_iommu_iova_to_phys,
4606		.free			= intel_iommu_domain_free,
4607		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4608	}
4609};
4610
4611static void quirk_iommu_igfx(struct pci_dev *dev)
4612{
4613	if (risky_device(dev))
4614		return;
4615
4616	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4617	disable_igfx_iommu = 1;
4618}
4619
4620/* G4x/GM45 integrated gfx dmar support is totally busted. */
4621DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4622DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4623DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4624DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4625DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4626DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4627DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4628
4629/* Broadwell igfx malfunctions with dmar */
4630DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4631DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4632DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4633DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4634DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4635DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4636DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4637DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4638DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4639DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4640DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4641DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4642DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4643DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4644DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4645DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4646DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4647DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4648DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4649DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4650DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4651DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4652DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4653DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4654
4655static void quirk_iommu_rwbf(struct pci_dev *dev)
4656{
4657	if (risky_device(dev))
4658		return;
4659
4660	/*
4661	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4662	 * but needs it. Same seems to hold for the desktop versions.
4663	 */
4664	pci_info(dev, "Forcing write-buffer flush capability\n");
4665	rwbf_quirk = 1;
4666}
4667
4668DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4669DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4670DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4671DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4672DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4673DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4674DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4675
4676#define GGC 0x52
4677#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4678#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4679#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4680#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4681#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4682#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4683#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4684#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4685
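/*
 * Ironlake/Calpella graphics: if the BIOS did not enable VT-d space in
 * the GGC register, the IGD cannot be translated at all, so disable the
 * IOMMU for graphics; otherwise force strict IOTLB flushing so the GPU
 * is idle before a flush.
 */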
4686static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4687{
4688	unsigned short ggc;
4689
4690	if (risky_device(dev))
4691		return;
4692
4693	if (pci_read_config_word(dev, GGC, &ggc))
4694		return;
4695
4696	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4697		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4698		disable_igfx_iommu = 1;
4699	} else if (!disable_igfx_iommu) {
4700		/* we have to ensure the gfx device is idle before we flush */
4701		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4702		iommu_set_dma_strict();
4703	}
4704}
4705DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4706DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4707DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4708DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4709
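/*
 * For certain integrated graphics device IDs, set iommu_skip_te_disable
 * so that translation is not torn down for the IOMMU covering the
 * graphics device.
 */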
4710static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4711{
4712	unsigned short ver;
4713
4714	if (!IS_GFX_DEVICE(dev))
4715		return;
4716
4717	ver = (dev->device >> 8) & 0xff;
4718	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4719	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4720	    ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4721		return;
4722
4723	if (risky_device(dev))
4724		return;
4725
4726	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4727	iommu_skip_te_disable = 1;
4728}
4729DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4730
4731/* On Tylersburg chipsets, some BIOSes have been known to enable the
4732   ISOCH DMAR unit for the Azalia sound device, but not give it any
4733   TLB entries, which causes it to deadlock. Check for that.  We do
4734   this in a function called from init_dmars(), instead of in a PCI
4735   quirk, because we don't want to print the obnoxious "BIOS broken"
4736   message if VT-d is actually disabled.
4737*/
4738static void __init check_tylersburg_isoch(void)
4739{
4740	struct pci_dev *pdev;
4741	uint32_t vtisochctrl;
4742
4743	/* If there's no Azalia in the system anyway, forget it. */
4744	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4745	if (!pdev)
4746		return;
4747
4748	if (risky_device(pdev)) {
4749		pci_dev_put(pdev);
4750		return;
4751	}
4752
4753	pci_dev_put(pdev);
4754
4755	/* System Management Registers. Might be hidden, in which case
4756	   we can't do the sanity check. But that's OK, because the
4757	   known-broken BIOSes _don't_ actually hide it, so far. */
4758	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4759	if (!pdev)
4760		return;
4761
4762	if (risky_device(pdev)) {
4763		pci_dev_put(pdev);
4764		return;
4765	}
4766
4767	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4768		pci_dev_put(pdev);
4769		return;
4770	}
4771
4772	pci_dev_put(pdev);
4773
4774	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4775	if (vtisochctrl & 1)
4776		return;
4777
4778	/* Drop all bits other than the number of TLB entries */
4779	vtisochctrl &= 0x1c;
4780
4781	/* If we have the recommended number of TLB entries (16), fine. */
4782	if (vtisochctrl == 0x10)
4783		return;
4784
4785	/* Zero TLB entries? That is guaranteed to deadlock the Azalia device. */
4786	if (!vtisochctrl) {
4787		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4788		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4789		     dmi_get_system_info(DMI_BIOS_VENDOR),
4790		     dmi_get_system_info(DMI_BIOS_VERSION),
4791		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4792		iommu_identity_mapping |= IDENTMAP_AZALIA;
4793		return;
4794	}
4795
4796	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4797	       vtisochctrl);
4798}
4799
4800/*
4801 * Here we deal with a device TLB defect where the device may inadvertently
4802 * issue an ATS invalidation completion before posted writes that were
4803 * initiated with a translated address and used translations matching the
4804 * invalidation address range, violating the invalidation completion ordering.
4805 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4806 * vulnerable to this defect. In other words, any dTLB invalidation initiated
4807 * not under the control of the trusted/privileged host device driver must use
4808 * this quirk.
4809 * Device TLBs are invalidated under the following six conditions:
4810 * 1. Device driver does a DMA API unmap of an IOVA
4811 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4812 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4813 *    exit_mmap() due to crash
4814 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4815 *    VM has to free pages that were unmapped
4816 * 5. Userspace driver unmaps a DMA buffer
4817 * 6. Cache invalidation in vSVA usage (upcoming)
4818 *
4819 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4820 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier
4821 * callback to invalidate the TLB the same way as a normal user unmap,
4822 * which will use this quirk. The dTLB invalidation after a PASID cache flush does not need it.
4823 *
4824 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4825 */
4826void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4827			       unsigned long address, unsigned long mask,
4828			       u32 pasid, u16 qdep)
4829{
4830	u16 sid;
4831
4832	if (likely(!info->dtlb_extra_inval))
4833		return;
4834
4835	sid = PCI_DEVID(info->bus, info->devfn);
4836	if (pasid == IOMMU_NO_PASID) {
4837		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4838				   qdep, address, mask);
4839	} else {
4840		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4841					 pasid, qdep, address, mask);
4842	}
4843}
4844
4845#define ecmd_get_status_code(res)	(((res) & 0xff) >> 1)
4846
4847/*
4848 * Function to submit a command to the enhanced command interface. The
4849 * valid enhanced command descriptions are defined in Table 47 of the
4850 * VT-d spec. The VT-d hardware implementation may support some but not
4851 * all commands, which can be determined by checking the Enhanced
4852 * Command Capability Register.
4853 *
4854 * Return values:
4855 *  - 0: Command successful without any error;
4856 *  - Negative: software error value;
4857 *  - Nonzero positive: failure status code defined in Table 48.
4858 */
4859int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4860{
4861	unsigned long flags;
4862	u64 res;
4863	int ret;
4864
4865	if (!cap_ecmds(iommu->cap))
4866		return -ENODEV;
4867
4868	raw_spin_lock_irqsave(&iommu->register_lock, flags);
4869
4870	res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4871	if (res & DMA_ECMD_ECRSP_IP) {
4872		ret = -EBUSY;
4873		goto err;
4874	}
4875
4876	/*
4877	 * Unconditionally write the operand B, because
4878	 * - There is no side effect if an ecmd doesn't require an
4879	 *   operand B, but we set the register to some value.
4880	 * - It's not invoked in any critical path, so the extra MMIO
4881	 *   write doesn't raise any performance concerns.
4882	 */
4883	dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4884	dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4885
4886	IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4887		      !(res & DMA_ECMD_ECRSP_IP), res);
4888
4889	if (res & DMA_ECMD_ECRSP_IP) {
4890		ret = -ETIMEDOUT;
4891		goto err;
4892	}
4893
4894	ret = ecmd_get_status_code(res);
4895err:
4896	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4897
4898	return ret;
4899}
4900