1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
4 * Author: Joerg Roedel <jroedel@suse.de>
5 *         Leo Duran <leo.duran@amd.com>
6 */
7
8#define pr_fmt(fmt)     "AMD-Vi: " fmt
9#define dev_fmt(fmt)    pr_fmt(fmt)
10
11#include <linux/ratelimit.h>
12#include <linux/pci.h>
13#include <linux/acpi.h>
14#include <linux/pci-ats.h>
15#include <linux/bitmap.h>
16#include <linux/slab.h>
17#include <linux/debugfs.h>
18#include <linux/scatterlist.h>
19#include <linux/dma-map-ops.h>
20#include <linux/dma-direct.h>
21#include <linux/iommu-helper.h>
22#include <linux/delay.h>
23#include <linux/amd-iommu.h>
24#include <linux/notifier.h>
25#include <linux/export.h>
26#include <linux/irq.h>
27#include <linux/msi.h>
28#include <linux/irqdomain.h>
29#include <linux/percpu.h>
30#include <linux/io-pgtable.h>
31#include <linux/cc_platform.h>
32#include <asm/irq_remapping.h>
33#include <asm/io_apic.h>
34#include <asm/apic.h>
35#include <asm/hw_irq.h>
36#include <asm/proto.h>
37#include <asm/iommu.h>
38#include <asm/gart.h>
39#include <asm/dma.h>
40#include <uapi/linux/iommufd.h>
41
42#include "amd_iommu.h"
43#include "../dma-iommu.h"
44#include "../irq_remapping.h"
45#include "../iommu-pages.h"
46
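/* The command type is encoded in bits 31:28 of the second command word */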
47#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
48
49/* Reserved IOVA ranges */
50#define MSI_RANGE_START		(0xfee00000)
51#define MSI_RANGE_END		(0xfeefffff)
52#define HT_RANGE_START		(0xfd00000000ULL)
53#define HT_RANGE_END		(0xffffffffffULL)
54
55#define DEFAULT_PGTABLE_LEVEL	PAGE_MODE_3_LEVEL
56
57static DEFINE_SPINLOCK(pd_bitmap_lock);
58
59LIST_HEAD(ioapic_map);
60LIST_HEAD(hpet_map);
61LIST_HEAD(acpihid_map);
62
63const struct iommu_ops amd_iommu_ops;
64static const struct iommu_dirty_ops amd_dirty_ops;
65
66int amd_iommu_max_glx_val = -1;
67
68/*
 * general struct to manage commands sent to an IOMMU
70 */
71struct iommu_cmd {
72	u32 data[4];
73};
74
75struct kmem_cache *amd_iommu_irq_cache;
76
77static void detach_device(struct device *dev);
78
79static void set_dte_entry(struct amd_iommu *iommu,
80			  struct iommu_dev_data *dev_data);
81
82/****************************************************************************
83 *
84 * Helper functions
85 *
86 ****************************************************************************/
87
88static inline bool pdom_is_v2_pgtbl_mode(struct protection_domain *pdom)
89{
90	return (pdom && (pdom->pd_mode == PD_MODE_V2));
91}
92
93static inline bool pdom_is_in_pt_mode(struct protection_domain *pdom)
94{
95	return (pdom->domain.type == IOMMU_DOMAIN_IDENTITY);
96}
97
98/*
 * We cannot support PASID with an existing v1 page table in the same
 * domain since it would have to be nested. However, an existing domain
 * with a v2 page table or in passthrough mode can be used for PASID.
102 */
103static inline bool pdom_is_sva_capable(struct protection_domain *pdom)
104{
105	return pdom_is_v2_pgtbl_mode(pdom) || pdom_is_in_pt_mode(pdom);
106}
107
108static inline int get_acpihid_device_id(struct device *dev,
109					struct acpihid_map_entry **entry)
110{
111	struct acpi_device *adev = ACPI_COMPANION(dev);
112	struct acpihid_map_entry *p;
113
114	if (!adev)
115		return -ENODEV;
116
117	list_for_each_entry(p, &acpihid_map, list) {
118		if (acpi_dev_hid_uid_match(adev, p->hid,
119					   p->uid[0] ? p->uid : NULL)) {
120			if (entry)
121				*entry = p;
122			return p->devid;
123		}
124	}
125	return -EINVAL;
126}
127
128static inline int get_device_sbdf_id(struct device *dev)
129{
130	int sbdf;
131
132	if (dev_is_pci(dev))
133		sbdf = get_pci_sbdf_id(to_pci_dev(dev));
134	else
135		sbdf = get_acpihid_device_id(dev, NULL);
136
137	return sbdf;
138}
139
140struct dev_table_entry *get_dev_table(struct amd_iommu *iommu)
141{
142	struct dev_table_entry *dev_table;
143	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
144
145	BUG_ON(pci_seg == NULL);
146	dev_table = pci_seg->dev_table;
147	BUG_ON(dev_table == NULL);
148
149	return dev_table;
150}
151
152static inline u16 get_device_segment(struct device *dev)
153{
154	u16 seg;
155
156	if (dev_is_pci(dev)) {
157		struct pci_dev *pdev = to_pci_dev(dev);
158
159		seg = pci_domain_nr(pdev->bus);
160	} else {
161		u32 devid = get_acpihid_device_id(dev, NULL);
162
163		seg = PCI_SBDF_TO_SEGID(devid);
164	}
165
166	return seg;
167}
168
169/* Writes the specific IOMMU for a device into the PCI segment rlookup table */
170void amd_iommu_set_rlookup_table(struct amd_iommu *iommu, u16 devid)
171{
172	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
173
174	pci_seg->rlookup_table[devid] = iommu;
175}
176
177static struct amd_iommu *__rlookup_amd_iommu(u16 seg, u16 devid)
178{
179	struct amd_iommu_pci_seg *pci_seg;
180
181	for_each_pci_segment(pci_seg) {
182		if (pci_seg->id == seg)
183			return pci_seg->rlookup_table[devid];
184	}
185	return NULL;
186}
187
188static struct amd_iommu *rlookup_amd_iommu(struct device *dev)
189{
190	u16 seg = get_device_segment(dev);
191	int devid = get_device_sbdf_id(dev);
192
193	if (devid < 0)
194		return NULL;
195	return __rlookup_amd_iommu(seg, PCI_SBDF_TO_DEVID(devid));
196}
197
198static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid)
199{
200	struct iommu_dev_data *dev_data;
201	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
202
203	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
204	if (!dev_data)
205		return NULL;
206
207	spin_lock_init(&dev_data->lock);
208	dev_data->devid = devid;
209	ratelimit_default_init(&dev_data->rs);
210
211	llist_add(&dev_data->dev_data_list, &pci_seg->dev_data_list);
212	return dev_data;
213}
214
215static struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid)
216{
217	struct iommu_dev_data *dev_data;
218	struct llist_node *node;
219	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
220
221	if (llist_empty(&pci_seg->dev_data_list))
222		return NULL;
223
224	node = pci_seg->dev_data_list.first;
225	llist_for_each_entry(dev_data, node, dev_data_list) {
226		if (dev_data->devid == devid)
227			return dev_data;
228	}
229
230	return NULL;
231}
232
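/*
 * pci_for_each_dma_alias() callback: copy the real device's DTE to the
 * DTE of @alias so that both requester IDs use identical translation
 * settings, and record the IOMMU in the rlookup table for the alias.
 */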
233static int clone_alias(struct pci_dev *pdev, u16 alias, void *data)
234{
235	struct amd_iommu *iommu;
236	struct dev_table_entry *dev_table;
237	u16 devid = pci_dev_id(pdev);
238
239	if (devid == alias)
240		return 0;
241
242	iommu = rlookup_amd_iommu(&pdev->dev);
243	if (!iommu)
244		return 0;
245
246	amd_iommu_set_rlookup_table(iommu, alias);
247	dev_table = get_dev_table(iommu);
248	memcpy(dev_table[alias].data,
249	       dev_table[devid].data,
250	       sizeof(dev_table[alias].data));
251
252	return 0;
253}
254
255static void clone_aliases(struct amd_iommu *iommu, struct device *dev)
256{
257	struct pci_dev *pdev;
258
259	if (!dev_is_pci(dev))
260		return;
261	pdev = to_pci_dev(dev);
262
263	/*
264	 * The IVRS alias stored in the alias table may not be
	 * part of the PCI DMA aliases if its bus differs
266	 * from the original device.
267	 */
268	clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL);
269
270	pci_for_each_dma_alias(pdev, clone_alias, NULL);
271}
272
273static void setup_aliases(struct amd_iommu *iommu, struct device *dev)
274{
275	struct pci_dev *pdev = to_pci_dev(dev);
276	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
277	u16 ivrs_alias;
278
279	/* For ACPI HID devices, there are no aliases */
280	if (!dev_is_pci(dev))
281		return;
282
283	/*
284	 * Add the IVRS alias to the pci aliases if it is on the same
285	 * bus. The IVRS table may know about a quirk that we don't.
286	 */
287	ivrs_alias = pci_seg->alias_table[pci_dev_id(pdev)];
288	if (ivrs_alias != pci_dev_id(pdev) &&
289	    PCI_BUS_NUM(ivrs_alias) == pdev->bus->number)
290		pci_add_dma_alias(pdev, ivrs_alias & 0xff, 1);
291
292	clone_aliases(iommu, dev);
293}
294
295static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid)
296{
297	struct iommu_dev_data *dev_data;
298
299	dev_data = search_dev_data(iommu, devid);
300
301	if (dev_data == NULL) {
302		dev_data = alloc_dev_data(iommu, devid);
303		if (!dev_data)
304			return NULL;
305
306		if (translation_pre_enabled(iommu))
307			dev_data->defer_attach = true;
308	}
309
310	return dev_data;
311}
312
313/*
* Find or create an IOMMU group for an acpihid device.
315*/
316static struct iommu_group *acpihid_device_group(struct device *dev)
317{
318	struct acpihid_map_entry *p, *entry = NULL;
319	int devid;
320
321	devid = get_acpihid_device_id(dev, &entry);
322	if (devid < 0)
323		return ERR_PTR(devid);
324
325	list_for_each_entry(p, &acpihid_map, list) {
326		if ((devid == p->devid) && p->group)
327			entry->group = p->group;
328	}
329
330	if (!entry->group)
331		entry->group = generic_device_group(dev);
332	else
333		iommu_group_ref_get(entry->group);
334
335	return entry->group;
336}
337
338static inline bool pdev_pasid_supported(struct iommu_dev_data *dev_data)
339{
340	return (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP);
341}
342
343static u32 pdev_get_caps(struct pci_dev *pdev)
344{
345	int features;
346	u32 flags = 0;
347
348	if (pci_ats_supported(pdev))
349		flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP;
350
351	if (pci_pri_supported(pdev))
352		flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP;
353
354	features = pci_pasid_features(pdev);
355	if (features >= 0) {
356		flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP;
357
358		if (features & PCI_PASID_CAP_EXEC)
359			flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP;
360
361		if (features & PCI_PASID_CAP_PRIV)
362			flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP;
363	}
364
365	return flags;
366}
367
368static inline int pdev_enable_cap_ats(struct pci_dev *pdev)
369{
370	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
371	int ret = -EINVAL;
372
373	if (dev_data->ats_enabled)
374		return 0;
375
376	if (amd_iommu_iotlb_sup &&
377	    (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP)) {
378		ret = pci_enable_ats(pdev, PAGE_SHIFT);
379		if (!ret) {
380			dev_data->ats_enabled = 1;
381			dev_data->ats_qdep    = pci_ats_queue_depth(pdev);
382		}
383	}
384
385	return ret;
386}
387
388static inline void pdev_disable_cap_ats(struct pci_dev *pdev)
389{
390	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
391
392	if (dev_data->ats_enabled) {
393		pci_disable_ats(pdev);
394		dev_data->ats_enabled = 0;
395	}
396}
397
398static inline int pdev_enable_cap_pri(struct pci_dev *pdev)
399{
400	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
401	int ret = -EINVAL;
402
403	if (dev_data->pri_enabled)
404		return 0;
405
406	if (!dev_data->ats_enabled)
407		return 0;
408
409	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PRI_SUP) {
410		/*
411		 * First reset the PRI state of the device.
412		 * FIXME: Hardcode number of outstanding requests for now
413		 */
414		if (!pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32)) {
415			dev_data->pri_enabled = 1;
416			dev_data->pri_tlp     = pci_prg_resp_pasid_required(pdev);
417
418			ret = 0;
419		}
420	}
421
422	return ret;
423}
424
425static inline void pdev_disable_cap_pri(struct pci_dev *pdev)
426{
427	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
428
429	if (dev_data->pri_enabled) {
430		pci_disable_pri(pdev);
431		dev_data->pri_enabled = 0;
432	}
433}
434
435static inline int pdev_enable_cap_pasid(struct pci_dev *pdev)
436{
437	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
438	int ret = -EINVAL;
439
440	if (dev_data->pasid_enabled)
441		return 0;
442
443	if (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_PASID_SUP) {
444		/* Only allow access to user-accessible pages */
445		ret = pci_enable_pasid(pdev, 0);
446		if (!ret)
447			dev_data->pasid_enabled = 1;
448	}
449
450	return ret;
451}
452
453static inline void pdev_disable_cap_pasid(struct pci_dev *pdev)
454{
455	struct iommu_dev_data *dev_data = dev_iommu_priv_get(&pdev->dev);
456
457	if (dev_data->pasid_enabled) {
458		pci_disable_pasid(pdev);
459		dev_data->pasid_enabled = 0;
460	}
461}
462
463static void pdev_enable_caps(struct pci_dev *pdev)
464{
465	pdev_enable_cap_ats(pdev);
466	pdev_enable_cap_pasid(pdev);
467	pdev_enable_cap_pri(pdev);
468}
469
470static void pdev_disable_caps(struct pci_dev *pdev)
471{
472	pdev_disable_cap_ats(pdev);
473	pdev_disable_cap_pasid(pdev);
474	pdev_disable_cap_pri(pdev);
475}
476
477/*
478 * This function checks if the driver got a valid device from the caller to
479 * avoid dereferencing invalid pointers.
480 */
481static bool check_device(struct device *dev)
482{
483	struct amd_iommu_pci_seg *pci_seg;
484	struct amd_iommu *iommu;
485	int devid, sbdf;
486
487	if (!dev)
488		return false;
489
490	sbdf = get_device_sbdf_id(dev);
491	if (sbdf < 0)
492		return false;
493	devid = PCI_SBDF_TO_DEVID(sbdf);
494
495	iommu = rlookup_amd_iommu(dev);
496	if (!iommu)
497		return false;
498
499	/* Out of our scope? */
500	pci_seg = iommu->pci_seg;
501	if (devid > pci_seg->last_bdf)
502		return false;
503
504	return true;
505}
506
507static int iommu_init_device(struct amd_iommu *iommu, struct device *dev)
508{
509	struct iommu_dev_data *dev_data;
510	int devid, sbdf;
511
512	if (dev_iommu_priv_get(dev))
513		return 0;
514
515	sbdf = get_device_sbdf_id(dev);
516	if (sbdf < 0)
517		return sbdf;
518
519	devid = PCI_SBDF_TO_DEVID(sbdf);
520	dev_data = find_dev_data(iommu, devid);
521	if (!dev_data)
522		return -ENOMEM;
523
524	dev_data->dev = dev;
525	setup_aliases(iommu, dev);
526
527	/*
	 * By default we use passthrough mode for IOMMUv2-capable devices.
529	 * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
530	 * invalid address), we ignore the capability for the device so
531	 * it'll be forced to go into translation mode.
532	 */
533	if ((iommu_default_passthrough() || !amd_iommu_force_isolation) &&
534	    dev_is_pci(dev) && amd_iommu_gt_ppr_supported()) {
535		dev_data->flags = pdev_get_caps(to_pci_dev(dev));
536	}
537
538	dev_iommu_priv_set(dev, dev_data);
539
540	return 0;
541}
542
543static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev)
544{
545	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
546	struct dev_table_entry *dev_table = get_dev_table(iommu);
547	int devid, sbdf;
548
549	sbdf = get_device_sbdf_id(dev);
550	if (sbdf < 0)
551		return;
552
553	devid = PCI_SBDF_TO_DEVID(sbdf);
554	pci_seg->rlookup_table[devid] = NULL;
555	memset(&dev_table[devid], 0, sizeof(struct dev_table_entry));
556
557	setup_aliases(iommu, dev);
558}
559
560static void amd_iommu_uninit_device(struct device *dev)
561{
562	struct iommu_dev_data *dev_data;
563
564	dev_data = dev_iommu_priv_get(dev);
565	if (!dev_data)
566		return;
567
568	if (dev_data->domain)
569		detach_device(dev);
570
571	/*
572	 * We keep dev_data around for unplugged devices and reuse it when the
573	 * device is re-plugged - not doing so would introduce a ton of races.
574	 */
575}
576
577/****************************************************************************
578 *
579 * Interrupt handling functions
580 *
581 ****************************************************************************/
582
583static void dump_dte_entry(struct amd_iommu *iommu, u16 devid)
584{
585	int i;
586	struct dev_table_entry *dev_table = get_dev_table(iommu);
587
588	for (i = 0; i < 4; ++i)
589		pr_err("DTE[%d]: %016llx\n", i, dev_table[devid].data[i]);
590}
591
592static void dump_command(unsigned long phys_addr)
593{
594	struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr);
595	int i;
596
597	for (i = 0; i < 4; ++i)
598		pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
599}
600
601static void amd_iommu_report_rmp_hw_error(struct amd_iommu *iommu, volatile u32 *event)
602{
603	struct iommu_dev_data *dev_data = NULL;
604	int devid, vmg_tag, flags;
605	struct pci_dev *pdev;
606	u64 spa;
607
608	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
609	vmg_tag = (event[1]) & 0xFFFF;
610	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
611	spa     = ((u64)event[3] << 32) | (event[2] & 0xFFFFFFF8);
612
613	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
614					   devid & 0xff);
615	if (pdev)
616		dev_data = dev_iommu_priv_get(&pdev->dev);
617
618	if (dev_data) {
619		if (__ratelimit(&dev_data->rs)) {
620			pci_err(pdev, "Event logged [RMP_HW_ERROR vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
621				vmg_tag, spa, flags);
622		}
623	} else {
624		pr_err_ratelimited("Event logged [RMP_HW_ERROR device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, spa=0x%llx, flags=0x%04x]\n",
625			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
626			vmg_tag, spa, flags);
627	}
628
629	if (pdev)
630		pci_dev_put(pdev);
631}
632
633static void amd_iommu_report_rmp_fault(struct amd_iommu *iommu, volatile u32 *event)
634{
635	struct iommu_dev_data *dev_data = NULL;
636	int devid, flags_rmp, vmg_tag, flags;
637	struct pci_dev *pdev;
638	u64 gpa;
639
640	devid     = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
641	flags_rmp = (event[0] >> EVENT_FLAGS_SHIFT) & 0xFF;
642	vmg_tag   = (event[1]) & 0xFFFF;
643	flags     = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
644	gpa       = ((u64)event[3] << 32) | event[2];
645
646	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
647					   devid & 0xff);
648	if (pdev)
649		dev_data = dev_iommu_priv_get(&pdev->dev);
650
651	if (dev_data) {
652		if (__ratelimit(&dev_data->rs)) {
653			pci_err(pdev, "Event logged [RMP_PAGE_FAULT vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
654				vmg_tag, gpa, flags_rmp, flags);
655		}
656	} else {
657		pr_err_ratelimited("Event logged [RMP_PAGE_FAULT device=%04x:%02x:%02x.%x, vmg_tag=0x%04x, gpa=0x%llx, flags_rmp=0x%04x, flags=0x%04x]\n",
658			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
659			vmg_tag, gpa, flags_rmp, flags);
660	}
661
662	if (pdev)
663		pci_dev_put(pdev);
664}
665
666#define IS_IOMMU_MEM_TRANSACTION(flags)		\
667	(((flags) & EVENT_FLAG_I) == 0)
668
669#define IS_WRITE_REQUEST(flags)			\
670	((flags) & EVENT_FLAG_RW)
671
672static void amd_iommu_report_page_fault(struct amd_iommu *iommu,
673					u16 devid, u16 domain_id,
674					u64 address, int flags)
675{
676	struct iommu_dev_data *dev_data = NULL;
677	struct pci_dev *pdev;
678
679	pdev = pci_get_domain_bus_and_slot(iommu->pci_seg->id, PCI_BUS_NUM(devid),
680					   devid & 0xff);
681	if (pdev)
682		dev_data = dev_iommu_priv_get(&pdev->dev);
683
684	if (dev_data) {
685		/*
686		 * If this is a DMA fault (for which the I(nterrupt)
687		 * bit will be unset), allow report_iommu_fault() to
688		 * prevent logging it.
689		 */
690		if (IS_IOMMU_MEM_TRANSACTION(flags)) {
691			/* Device not attached to domain properly */
692			if (dev_data->domain == NULL) {
693				pr_err_ratelimited("Event logged [Device not attached to domain properly]\n");
694				pr_err_ratelimited("  device=%04x:%02x:%02x.%x domain=0x%04x\n",
695						   iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
696						   PCI_FUNC(devid), domain_id);
697				goto out;
698			}
699
700			if (!report_iommu_fault(&dev_data->domain->domain,
701						&pdev->dev, address,
702						IS_WRITE_REQUEST(flags) ?
703							IOMMU_FAULT_WRITE :
704							IOMMU_FAULT_READ))
705				goto out;
706		}
707
708		if (__ratelimit(&dev_data->rs)) {
709			pci_err(pdev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
710				domain_id, address, flags);
711		}
712	} else {
713		pr_err_ratelimited("Event logged [IO_PAGE_FAULT device=%04x:%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
714			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
715			domain_id, address, flags);
716	}
717
718out:
719	if (pdev)
720		pci_dev_put(pdev);
721}
722
723static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
724{
725	struct device *dev = iommu->iommu.dev;
726	int type, devid, flags, tag;
727	volatile u32 *event = __evt;
728	int count = 0;
729	u64 address;
730	u32 pasid;
731
732retry:
733	type    = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
734	devid   = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
735	pasid   = (event[0] & EVENT_DOMID_MASK_HI) |
736		  (event[1] & EVENT_DOMID_MASK_LO);
737	flags   = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
738	address = (u64)(((u64)event[3]) << 32) | event[2];
739
740	if (type == 0) {
741		/* Did we hit the erratum? */
742		if (++count == LOOP_TIMEOUT) {
743			pr_err("No event written to event log\n");
744			return;
745		}
746		udelay(1);
747		goto retry;
748	}
749
750	if (type == EVENT_TYPE_IO_FAULT) {
751		amd_iommu_report_page_fault(iommu, devid, pasid, address, flags);
752		return;
753	}
754
755	switch (type) {
756	case EVENT_TYPE_ILL_DEV:
757		dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
758			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
759			pasid, address, flags);
760		dump_dte_entry(iommu, devid);
761		break;
762	case EVENT_TYPE_DEV_TAB_ERR:
763		dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x "
764			"address=0x%llx flags=0x%04x]\n",
765			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
766			address, flags);
767		break;
768	case EVENT_TYPE_PAGE_TAB_ERR:
769		dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%04x:%02x:%02x.%x pasid=0x%04x address=0x%llx flags=0x%04x]\n",
770			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
771			pasid, address, flags);
772		break;
773	case EVENT_TYPE_ILL_CMD:
774		dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
775		dump_command(address);
776		break;
777	case EVENT_TYPE_CMD_HARD_ERR:
778		dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
779			address, flags);
780		break;
781	case EVENT_TYPE_IOTLB_INV_TO:
782		dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%04x:%02x:%02x.%x address=0x%llx]\n",
783			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
784			address);
785		break;
786	case EVENT_TYPE_INV_DEV_REQ:
787		dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
788			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
789			pasid, address, flags);
790		break;
791	case EVENT_TYPE_RMP_FAULT:
792		amd_iommu_report_rmp_fault(iommu, event);
793		break;
794	case EVENT_TYPE_RMP_HW_ERR:
795		amd_iommu_report_rmp_hw_error(iommu, event);
796		break;
797	case EVENT_TYPE_INV_PPR_REQ:
798		pasid = PPR_PASID(*((u64 *)__evt));
799		tag = event[1] & 0x03FF;
800		dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%04x:%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x tag=0x%03x]\n",
801			iommu->pci_seg->id, PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
802			pasid, address, flags, tag);
803		break;
804	default:
805		dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
806			event[0], event[1], event[2], event[3]);
807	}
808
809	/*
	 * To detect hardware erratum 732 we need to clear the entry
	 * back to zero. This issue does not exist on SNP-enabled
	 * systems. Also, this buffer is not writable on SNP-enabled
	 * systems.
814	 */
815	if (!amd_iommu_snp_en)
816		memset(__evt, 0, 4 * sizeof(u32));
817}
818
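/* Drain all pending entries between the event log head and tail pointers */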
819static void iommu_poll_events(struct amd_iommu *iommu)
820{
821	u32 head, tail;
822
823	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
824	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
825
826	while (head != tail) {
827		iommu_print_event(iommu, iommu->evt_buf + head);
828		head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE;
829	}
830
831	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
832}
833
834#ifdef CONFIG_IRQ_REMAP
835static int (*iommu_ga_log_notifier)(u32);
836
837int amd_iommu_register_ga_log_notifier(int (*notifier)(u32))
838{
839	iommu_ga_log_notifier = notifier;
840
841	return 0;
842}
843EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier);
844
845static void iommu_poll_ga_log(struct amd_iommu *iommu)
846{
847	u32 head, tail;
848
849	if (iommu->ga_log == NULL)
850		return;
851
852	head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
853	tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET);
854
855	while (head != tail) {
856		volatile u64 *raw;
857		u64 log_entry;
858
859		raw = (u64 *)(iommu->ga_log + head);
860
861		/* Avoid memcpy function-call overhead */
862		log_entry = *raw;
863
864		/* Update head pointer of hardware ring-buffer */
865		head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE;
866		writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET);
867
868		/* Handle GA entry */
869		switch (GA_REQ_TYPE(log_entry)) {
870		case GA_GUEST_NR:
871			if (!iommu_ga_log_notifier)
872				break;
873
874			pr_debug("%s: devid=%#x, ga_tag=%#x\n",
875				 __func__, GA_DEVID(log_entry),
876				 GA_TAG(log_entry));
877
878			if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
879				pr_err("GA log notifier failed.\n");
880			break;
881		default:
882			break;
883		}
884	}
885}
886
887static void
888amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu)
889{
890	if (!irq_remapping_enabled || !dev_is_pci(dev) ||
891	    !pci_dev_has_default_msi_parent_domain(to_pci_dev(dev)))
892		return;
893
894	dev_set_msi_domain(dev, iommu->ir_domain);
895}
896
897#else /* CONFIG_IRQ_REMAP */
898static inline void
899amd_iommu_set_pci_msi_domain(struct device *dev, struct amd_iommu *iommu) { }
900#endif /* !CONFIG_IRQ_REMAP */
901
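/*
 * Common threaded-interrupt helper: acknowledge the given status bits and
 * run the log handler (and, on overflow, the restart handler) until the
 * status register reads back clear. See the ERBT1312 note below for why
 * the register has to be re-read.
 */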
902static void amd_iommu_handle_irq(void *data, const char *evt_type,
903				 u32 int_mask, u32 overflow_mask,
904				 void (*int_handler)(struct amd_iommu *),
905				 void (*overflow_handler)(struct amd_iommu *))
906{
907	struct amd_iommu *iommu = (struct amd_iommu *) data;
908	u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
909	u32 mask = int_mask | overflow_mask;
910
911	while (status & mask) {
912		/* Enable interrupt sources again */
913		writel(mask, iommu->mmio_base + MMIO_STATUS_OFFSET);
914
915		if (int_handler) {
916			pr_devel("Processing IOMMU (ivhd%d) %s Log\n",
917				 iommu->index, evt_type);
918			int_handler(iommu);
919		}
920
921		if ((status & overflow_mask) && overflow_handler)
922			overflow_handler(iommu);
923
924		/*
925		 * Hardware bug: ERBT1312
		 * When re-enabling the interrupt (by writing 1 to clear the
		 * bit), the hardware might also try to set the interrupt bit
		 * in the event status register. In this scenario, the bit
		 * will remain set and disable subsequent interrupts.
		 *
		 * Workaround: the IOMMU driver should read back the status
		 * register and check whether the interrupt bits are cleared.
		 * If not, the driver needs to go through the interrupt
		 * handler again and re-clear the bits.
936		 */
937		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
938	}
939}
940
941irqreturn_t amd_iommu_int_thread_evtlog(int irq, void *data)
942{
943	amd_iommu_handle_irq(data, "Evt", MMIO_STATUS_EVT_INT_MASK,
944			     MMIO_STATUS_EVT_OVERFLOW_MASK,
945			     iommu_poll_events, amd_iommu_restart_event_logging);
946
947	return IRQ_HANDLED;
948}
949
950irqreturn_t amd_iommu_int_thread_pprlog(int irq, void *data)
951{
952	amd_iommu_handle_irq(data, "PPR", MMIO_STATUS_PPR_INT_MASK,
953			     MMIO_STATUS_PPR_OVERFLOW_MASK,
954			     amd_iommu_poll_ppr_log, amd_iommu_restart_ppr_log);
955
956	return IRQ_HANDLED;
957}
958
959irqreturn_t amd_iommu_int_thread_galog(int irq, void *data)
960{
961#ifdef CONFIG_IRQ_REMAP
962	amd_iommu_handle_irq(data, "GA", MMIO_STATUS_GALOG_INT_MASK,
963			     MMIO_STATUS_GALOG_OVERFLOW_MASK,
964			     iommu_poll_ga_log, amd_iommu_restart_ga_log);
965#endif
966
967	return IRQ_HANDLED;
968}
969
970irqreturn_t amd_iommu_int_thread(int irq, void *data)
971{
972	amd_iommu_int_thread_evtlog(irq, data);
973	amd_iommu_int_thread_pprlog(irq, data);
974	amd_iommu_int_thread_galog(irq, data);
975
976	return IRQ_HANDLED;
977}
978
979irqreturn_t amd_iommu_int_handler(int irq, void *data)
980{
981	return IRQ_WAKE_THREAD;
982}
983
984/****************************************************************************
985 *
986 * IOMMU command queuing functions
987 *
988 ****************************************************************************/
989
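/*
 * Completion-wait handling: a COMPLETION_WAIT command (CMD_COMPL_WAIT) asks
 * the IOMMU to store a caller-chosen 64-bit value to iommu->cmd_sem once all
 * previously queued commands have completed; wait_on_sem() then simply polls
 * that memory location for the expected value.
 */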
990static int wait_on_sem(struct amd_iommu *iommu, u64 data)
991{
992	int i = 0;
993
994	while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
995		udelay(1);
996		i += 1;
997	}
998
999	if (i == LOOP_TIMEOUT) {
1000		pr_alert("Completion-Wait loop timed out\n");
1001		return -EIO;
1002	}
1003
1004	return 0;
1005}
1006
1007static void copy_cmd_to_buffer(struct amd_iommu *iommu,
1008			       struct iommu_cmd *cmd)
1009{
1010	u8 *target;
1011	u32 tail;
1012
1013	/* Copy command to buffer */
1014	tail = iommu->cmd_buf_tail;
1015	target = iommu->cmd_buf + tail;
1016	memcpy(target, cmd, sizeof(*cmd));
1017
1018	tail = (tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1019	iommu->cmd_buf_tail = tail;
1020
1021	/* Tell the IOMMU about it */
1022	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
1023}
1024
1025static void build_completion_wait(struct iommu_cmd *cmd,
1026				  struct amd_iommu *iommu,
1027				  u64 data)
1028{
1029	u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem);
1030
1031	memset(cmd, 0, sizeof(*cmd));
1032	cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK;
1033	cmd->data[1] = upper_32_bits(paddr);
1034	cmd->data[2] = lower_32_bits(data);
1035	cmd->data[3] = upper_32_bits(data);
1036	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
1037}
1038
1039static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
1040{
1041	memset(cmd, 0, sizeof(*cmd));
1042	cmd->data[0] = devid;
1043	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
1044}
1045
1046/*
1047 * Builds an invalidation address which is suitable for one page or multiple
 * pages. Sets the size bit (S) if more than one page is flushed.
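 *
 * Example (illustrative): for address 0x101000 and size 0x2000, end is
 * 0x102fff and msb_diff is 13, so the returned address has bit 12 set and
 * bit 13 clear; the IOMMU interprets this as a naturally aligned 16-Kbyte
 * region starting at 0x100000, a superset of the requested range.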
1049 */
1050static inline u64 build_inv_address(u64 address, size_t size)
1051{
1052	u64 pages, end, msb_diff;
1053
1054	pages = iommu_num_pages(address, size, PAGE_SIZE);
1055
1056	if (pages == 1)
1057		return address & PAGE_MASK;
1058
1059	end = address + size - 1;
1060
1061	/*
1062	 * msb_diff would hold the index of the most significant bit that
1063	 * flipped between the start and end.
1064	 */
1065	msb_diff = fls64(end ^ address) - 1;
1066
1067	/*
1068	 * Bits 63:52 are sign extended. If for some reason bit 51 is different
1069	 * between the start and the end, invalidate everything.
1070	 */
1071	if (unlikely(msb_diff > 51)) {
1072		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
1073	} else {
1074		/*
1075		 * The msb-bit must be clear on the address. Just set all the
1076		 * lower bits.
1077		 */
1078		address |= (1ull << msb_diff) - 1;
1079	}
1080
1081	/* Clear bits 11:0 */
1082	address &= PAGE_MASK;
1083
1084	/* Set the size bit - we flush more than one 4kb page */
1085	return address | CMD_INV_IOMMU_PAGES_SIZE_MASK;
1086}
1087
1088static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
1089				  size_t size, u16 domid,
1090				  ioasid_t pasid, bool gn)
1091{
1092	u64 inv_address = build_inv_address(address, size);
1093
1094	memset(cmd, 0, sizeof(*cmd));
1095
1096	cmd->data[1] |= domid;
1097	cmd->data[2]  = lower_32_bits(inv_address);
1098	cmd->data[3]  = upper_32_bits(inv_address);
1099	/* PDE bit - we want to flush everything, not only the PTEs */
1100	cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
1101	if (gn) {
1102		cmd->data[0] |= pasid;
1103		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1104	}
1105	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
1106}
1107
1108static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
1109				  u64 address, size_t size,
1110				  ioasid_t pasid, bool gn)
1111{
1112	u64 inv_address = build_inv_address(address, size);
1113
1114	memset(cmd, 0, sizeof(*cmd));
1115
1116	cmd->data[0]  = devid;
1117	cmd->data[0] |= (qdep & 0xff) << 24;
1118	cmd->data[1]  = devid;
1119	cmd->data[2]  = lower_32_bits(inv_address);
1120	cmd->data[3]  = upper_32_bits(inv_address);
1121	if (gn) {
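		/* The PASID is split: bits 15:8 go to data[0], bits 7:0 to data[1] */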
1122		cmd->data[0] |= ((pasid >> 8) & 0xff) << 16;
1123		cmd->data[1] |= (pasid & 0xff) << 16;
1124		cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK;
1125	}
1126
1127	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
1128}
1129
1130static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, u32 pasid,
1131			       int status, int tag, u8 gn)
1132{
1133	memset(cmd, 0, sizeof(*cmd));
1134
1135	cmd->data[0]  = devid;
1136	if (gn) {
1137		cmd->data[1]  = pasid;
1138		cmd->data[2]  = CMD_INV_IOMMU_PAGES_GN_MASK;
1139	}
1140	cmd->data[3]  = tag & 0x1ff;
1141	cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT;
1142
1143	CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR);
1144}
1145
1146static void build_inv_all(struct iommu_cmd *cmd)
1147{
1148	memset(cmd, 0, sizeof(*cmd));
1149	CMD_SET_TYPE(cmd, CMD_INV_ALL);
1150}
1151
1152static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
1153{
1154	memset(cmd, 0, sizeof(*cmd));
1155	cmd->data[0] = devid;
1156	CMD_SET_TYPE(cmd, CMD_INV_IRT);
1157}
1158
1159/*
 * Writes the command to the IOMMU's command buffer and informs the
1161 * hardware about the new command.
1162 */
1163static int __iommu_queue_command_sync(struct amd_iommu *iommu,
1164				      struct iommu_cmd *cmd,
1165				      bool sync)
1166{
1167	unsigned int count = 0;
1168	u32 left, next_tail;
1169
1170	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
1171again:
1172	left      = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
1173
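	/*
	 * Wait for the hardware to make room: require more than 0x20 bytes
	 * (two command slots) free before copying in the next command.
	 */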
1174	if (left <= 0x20) {
1175		/* Skip udelay() the first time around */
1176		if (count++) {
1177			if (count == LOOP_TIMEOUT) {
1178				pr_err("Command buffer timeout\n");
1179				return -EIO;
1180			}
1181
1182			udelay(1);
1183		}
1184
1185		/* Update head and recheck remaining space */
1186		iommu->cmd_buf_head = readl(iommu->mmio_base +
1187					    MMIO_CMD_HEAD_OFFSET);
1188
1189		goto again;
1190	}
1191
1192	copy_cmd_to_buffer(iommu, cmd);
1193
1194	/* Do we need to make sure all commands are processed? */
1195	iommu->need_sync = sync;
1196
1197	return 0;
1198}
1199
1200static int iommu_queue_command_sync(struct amd_iommu *iommu,
1201				    struct iommu_cmd *cmd,
1202				    bool sync)
1203{
1204	unsigned long flags;
1205	int ret;
1206
1207	raw_spin_lock_irqsave(&iommu->lock, flags);
1208	ret = __iommu_queue_command_sync(iommu, cmd, sync);
1209	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1210
1211	return ret;
1212}
1213
1214static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
1215{
1216	return iommu_queue_command_sync(iommu, cmd, true);
1217}
1218
1219/*
1220 * This function queues a completion wait command into the command
1221 * buffer of an IOMMU
1222 */
1223static int iommu_completion_wait(struct amd_iommu *iommu)
1224{
1225	struct iommu_cmd cmd;
1226	unsigned long flags;
1227	int ret;
1228	u64 data;
1229
1230	if (!iommu->need_sync)
1231		return 0;
1232
1233	data = atomic64_add_return(1, &iommu->cmd_sem_val);
1234	build_completion_wait(&cmd, iommu, data);
1235
1236	raw_spin_lock_irqsave(&iommu->lock, flags);
1237
1238	ret = __iommu_queue_command_sync(iommu, &cmd, false);
1239	if (ret)
1240		goto out_unlock;
1241
1242	ret = wait_on_sem(iommu, data);
1243
1244out_unlock:
1245	raw_spin_unlock_irqrestore(&iommu->lock, flags);
1246
1247	return ret;
1248}
1249
1250static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
1251{
1252	struct iommu_cmd cmd;
1253
1254	build_inv_dte(&cmd, devid);
1255
1256	return iommu_queue_command(iommu, &cmd);
1257}
1258
1259static void amd_iommu_flush_dte_all(struct amd_iommu *iommu)
1260{
1261	u32 devid;
1262	u16 last_bdf = iommu->pci_seg->last_bdf;
1263
1264	for (devid = 0; devid <= last_bdf; ++devid)
1265		iommu_flush_dte(iommu, devid);
1266
1267	iommu_completion_wait(iommu);
1268}
1269
1270/*
1271 * This function uses heavy locking and may disable irqs for some time. But
1272 * this is no issue because it is only called during resume.
1273 */
1274static void amd_iommu_flush_tlb_all(struct amd_iommu *iommu)
1275{
1276	u32 dom_id;
1277	u16 last_bdf = iommu->pci_seg->last_bdf;
1278
1279	for (dom_id = 0; dom_id <= last_bdf; ++dom_id) {
1280		struct iommu_cmd cmd;
1281		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1282				      dom_id, IOMMU_NO_PASID, false);
1283		iommu_queue_command(iommu, &cmd);
1284	}
1285
1286	iommu_completion_wait(iommu);
1287}
1288
1289static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
1290{
1291	struct iommu_cmd cmd;
1292
1293	build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
1294			      dom_id, IOMMU_NO_PASID, false);
1295	iommu_queue_command(iommu, &cmd);
1296
1297	iommu_completion_wait(iommu);
1298}
1299
1300static void amd_iommu_flush_all(struct amd_iommu *iommu)
1301{
1302	struct iommu_cmd cmd;
1303
1304	build_inv_all(&cmd);
1305
1306	iommu_queue_command(iommu, &cmd);
1307	iommu_completion_wait(iommu);
1308}
1309
1310static void iommu_flush_irt(struct amd_iommu *iommu, u16 devid)
1311{
1312	struct iommu_cmd cmd;
1313
1314	build_inv_irt(&cmd, devid);
1315
1316	iommu_queue_command(iommu, &cmd);
1317}
1318
1319static void amd_iommu_flush_irt_all(struct amd_iommu *iommu)
1320{
1321	u32 devid;
1322	u16 last_bdf = iommu->pci_seg->last_bdf;
1323
1324	if (iommu->irtcachedis_enabled)
1325		return;
1326
1327	for (devid = 0; devid <= last_bdf; devid++)
1328		iommu_flush_irt(iommu, devid);
1329
1330	iommu_completion_wait(iommu);
1331}
1332
1333void amd_iommu_flush_all_caches(struct amd_iommu *iommu)
1334{
1335	if (check_feature(FEATURE_IA)) {
1336		amd_iommu_flush_all(iommu);
1337	} else {
1338		amd_iommu_flush_dte_all(iommu);
1339		amd_iommu_flush_irt_all(iommu);
1340		amd_iommu_flush_tlb_all(iommu);
1341	}
1342}
1343
1344/*
1345 * Command send function for flushing on-device TLB
1346 */
1347static int device_flush_iotlb(struct iommu_dev_data *dev_data, u64 address,
1348			      size_t size, ioasid_t pasid, bool gn)
1349{
1350	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1351	struct iommu_cmd cmd;
1352	int qdep = dev_data->ats_qdep;
1353
1354	build_inv_iotlb_pages(&cmd, dev_data->devid, qdep, address,
1355			      size, pasid, gn);
1356
1357	return iommu_queue_command(iommu, &cmd);
1358}
1359
1360static int device_flush_dte_alias(struct pci_dev *pdev, u16 alias, void *data)
1361{
1362	struct amd_iommu *iommu = data;
1363
1364	return iommu_flush_dte(iommu, alias);
1365}
1366
1367/*
1368 * Command send function for invalidating a device table entry
1369 */
1370static int device_flush_dte(struct iommu_dev_data *dev_data)
1371{
1372	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1373	struct pci_dev *pdev = NULL;
1374	struct amd_iommu_pci_seg *pci_seg;
1375	u16 alias;
1376	int ret;
1377
1378	if (dev_is_pci(dev_data->dev))
1379		pdev = to_pci_dev(dev_data->dev);
1380
1381	if (pdev)
1382		ret = pci_for_each_dma_alias(pdev,
1383					     device_flush_dte_alias, iommu);
1384	else
1385		ret = iommu_flush_dte(iommu, dev_data->devid);
1386	if (ret)
1387		return ret;
1388
1389	pci_seg = iommu->pci_seg;
1390	alias = pci_seg->alias_table[dev_data->devid];
1391	if (alias != dev_data->devid) {
1392		ret = iommu_flush_dte(iommu, alias);
1393		if (ret)
1394			return ret;
1395	}
1396
1397	if (dev_data->ats_enabled) {
1398		/* Invalidate the entire contents of an IOTLB */
1399		ret = device_flush_iotlb(dev_data, 0, ~0UL,
1400					 IOMMU_NO_PASID, false);
1401	}
1402
1403	return ret;
1404}
1405
1406static int domain_flush_pages_v2(struct protection_domain *pdom,
1407				 u64 address, size_t size)
1408{
1409	struct iommu_dev_data *dev_data;
1410	struct iommu_cmd cmd;
1411	int ret = 0;
1412
1413	list_for_each_entry(dev_data, &pdom->dev_list, list) {
1414		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1415		u16 domid = dev_data->gcr3_info.domid;
1416
1417		build_inv_iommu_pages(&cmd, address, size,
1418				      domid, IOMMU_NO_PASID, true);
1419
1420		ret |= iommu_queue_command(iommu, &cmd);
1421	}
1422
1423	return ret;
1424}
1425
1426static int domain_flush_pages_v1(struct protection_domain *pdom,
1427				 u64 address, size_t size)
1428{
1429	struct iommu_cmd cmd;
1430	int ret = 0, i;
1431
1432	build_inv_iommu_pages(&cmd, address, size,
1433			      pdom->id, IOMMU_NO_PASID, false);
1434
1435	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1436		if (!pdom->dev_iommu[i])
1437			continue;
1438
1439		/*
1440		 * Devices of this domain are behind this IOMMU
1441		 * We need a TLB flush
1442		 */
1443		ret |= iommu_queue_command(amd_iommus[i], &cmd);
1444	}
1445
1446	return ret;
1447}
1448
1449/*
1450 * TLB invalidation function which is called from the mapping functions.
1451 * It flushes range of PTEs of the domain.
1452 */
1453static void __domain_flush_pages(struct protection_domain *domain,
1454				 u64 address, size_t size)
1455{
1456	struct iommu_dev_data *dev_data;
1457	int ret = 0;
1458	ioasid_t pasid = IOMMU_NO_PASID;
1459	bool gn = false;
1460
1461	if (pdom_is_v2_pgtbl_mode(domain)) {
1462		gn = true;
1463		ret = domain_flush_pages_v2(domain, address, size);
1464	} else {
1465		ret = domain_flush_pages_v1(domain, address, size);
1466	}
1467
1468	list_for_each_entry(dev_data, &domain->dev_list, list) {
1469
1470		if (!dev_data->ats_enabled)
1471			continue;
1472
1473		ret |= device_flush_iotlb(dev_data, address, size, pasid, gn);
1474	}
1475
1476	WARN_ON(ret);
1477}
1478
1479void amd_iommu_domain_flush_pages(struct protection_domain *domain,
1480				  u64 address, size_t size)
1481{
1482	if (likely(!amd_iommu_np_cache)) {
1483		__domain_flush_pages(domain, address, size);
1484
1485		/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1486		amd_iommu_domain_flush_complete(domain);
1487
1488		return;
1489	}
1490
1491	/*
1492	 * When NpCache is on, we infer that we run in a VM and use a vIOMMU.
1493	 * In such setups it is best to avoid flushes of ranges which are not
1494	 * naturally aligned, since it would lead to flushes of unmodified
1495	 * PTEs. Such flushes would require the hypervisor to do more work than
	 * necessary. Therefore, perform repeated flushes of aligned ranges
	 * until the whole range is covered. Each iteration flushes the
	 * smaller of the natural alignment of the address being flushed and
	 * the greatest naturally aligned region that fits in the range.
1500	 */
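	/*
	 * Worked example (illustrative): flushing IOVA 0x3000 with size
	 * 0x5000 takes two iterations: a 4-Kbyte flush at 0x3000 (limited by
	 * the address alignment), then a 16-Kbyte flush at 0x4000, covering
	 * the requested range exactly with naturally aligned flushes.
	 */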
1501	while (size != 0) {
1502		int addr_alignment = __ffs(address);
1503		int size_alignment = __fls(size);
1504		int min_alignment;
1505		size_t flush_size;
1506
1507		/*
1508		 * size is always non-zero, but address might be zero, causing
1509		 * addr_alignment to be negative. As the casting of the
1510		 * argument in __ffs(address) to long might trim the high bits
1511		 * of the address on x86-32, cast to long when doing the check.
1512		 */
1513		if (likely((unsigned long)address != 0))
1514			min_alignment = min(addr_alignment, size_alignment);
1515		else
1516			min_alignment = size_alignment;
1517
1518		flush_size = 1ul << min_alignment;
1519
1520		__domain_flush_pages(domain, address, flush_size);
1521		address += flush_size;
1522		size -= flush_size;
1523	}
1524
1525	/* Wait until IOMMU TLB and all device IOTLB flushes are complete */
1526	amd_iommu_domain_flush_complete(domain);
1527}
1528
1529/* Flush the whole IO/TLB for a given protection domain - including PDE */
1530static void amd_iommu_domain_flush_all(struct protection_domain *domain)
1531{
1532	amd_iommu_domain_flush_pages(domain, 0,
1533				     CMD_INV_IOMMU_ALL_PAGES_ADDRESS);
1534}
1535
1536void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
1537				     ioasid_t pasid, u64 address, size_t size)
1538{
1539	struct iommu_cmd cmd;
1540	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1541
1542	build_inv_iommu_pages(&cmd, address, size,
1543			      dev_data->gcr3_info.domid, pasid, true);
1544	iommu_queue_command(iommu, &cmd);
1545
1546	if (dev_data->ats_enabled)
1547		device_flush_iotlb(dev_data, address, size, pasid, true);
1548
1549	iommu_completion_wait(iommu);
1550}
1551
1552void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data,
1553				   ioasid_t pasid)
1554{
1555	amd_iommu_dev_flush_pasid_pages(dev_data, 0,
1556					CMD_INV_IOMMU_ALL_PAGES_ADDRESS, pasid);
1557}
1558
1559void amd_iommu_domain_flush_complete(struct protection_domain *domain)
1560{
1561	int i;
1562
1563	for (i = 0; i < amd_iommu_get_num_iommus(); ++i) {
1564		if (domain && !domain->dev_iommu[i])
1565			continue;
1566
1567		/*
1568		 * Devices of this domain are behind this IOMMU
1569		 * We need to wait for completion of all commands.
1570		 */
1571		iommu_completion_wait(amd_iommus[i]);
1572	}
1573}
1574
1575/* Flush the not present cache if it exists */
1576static void domain_flush_np_cache(struct protection_domain *domain,
1577		dma_addr_t iova, size_t size)
1578{
1579	if (unlikely(amd_iommu_np_cache)) {
1580		unsigned long flags;
1581
1582		spin_lock_irqsave(&domain->lock, flags);
1583		amd_iommu_domain_flush_pages(domain, iova, size);
1584		spin_unlock_irqrestore(&domain->lock, flags);
1585	}
1586}
1587
1588
1589/*
 * This function flushes the DTEs for all devices in the domain.
1591 */
1592static void domain_flush_devices(struct protection_domain *domain)
1593{
1594	struct iommu_dev_data *dev_data;
1595
1596	list_for_each_entry(dev_data, &domain->dev_list, list)
1597		device_flush_dte(dev_data);
1598}
1599
1600static void update_device_table(struct protection_domain *domain)
1601{
1602	struct iommu_dev_data *dev_data;
1603
1604	list_for_each_entry(dev_data, &domain->dev_list, list) {
1605		struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev);
1606
1607		set_dte_entry(iommu, dev_data);
1608		clone_aliases(iommu, dev_data->dev);
1609	}
1610}
1611
1612void amd_iommu_update_and_flush_device_table(struct protection_domain *domain)
1613{
1614	update_device_table(domain);
1615	domain_flush_devices(domain);
1616}
1617
1618void amd_iommu_domain_update(struct protection_domain *domain)
1619{
1620	/* Update device table */
1621	amd_iommu_update_and_flush_device_table(domain);
1622
1623	/* Flush domain TLB(s) and wait for completion */
1624	amd_iommu_domain_flush_all(domain);
1625}
1626
1627int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag)
1628{
1629	struct iommu_dev_data *dev_data;
1630	struct amd_iommu *iommu;
1631	struct iommu_cmd cmd;
1632
1633	dev_data = dev_iommu_priv_get(dev);
1634	iommu    = get_amd_iommu_from_dev(dev);
1635
1636	build_complete_ppr(&cmd, dev_data->devid, pasid, status,
1637			   tag, dev_data->pri_tlp);
1638
1639	return iommu_queue_command(iommu, &cmd);
1640}
1641
1642/****************************************************************************
1643 *
1644 * The next functions belong to the domain allocation. A domain is
1645 * allocated for every IOMMU as the default domain. If device isolation
 * is enabled, every device gets its own domain. The most important thing
1647 * about domains is the page table mapping the DMA address space they
1648 * contain.
1649 *
1650 ****************************************************************************/
1651
1652static u16 domain_id_alloc(void)
1653{
1654	unsigned long flags;
1655	int id;
1656
1657	spin_lock_irqsave(&pd_bitmap_lock, flags);
1658	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
1659	BUG_ON(id == 0);
1660	if (id > 0 && id < MAX_DOMAIN_ID)
1661		__set_bit(id, amd_iommu_pd_alloc_bitmap);
1662	else
1663		id = 0;
1664	spin_unlock_irqrestore(&pd_bitmap_lock, flags);
1665
1666	return id;
1667}
1668
1669static void domain_id_free(int id)
1670{
1671	unsigned long flags;
1672
1673	spin_lock_irqsave(&pd_bitmap_lock, flags);
1674	if (id > 0 && id < MAX_DOMAIN_ID)
1675		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
1676	spin_unlock_irqrestore(&pd_bitmap_lock, flags);
1677}
1678
1679static void free_gcr3_tbl_level1(u64 *tbl)
1680{
1681	u64 *ptr;
1682	int i;
1683
1684	for (i = 0; i < 512; ++i) {
1685		if (!(tbl[i] & GCR3_VALID))
1686			continue;
1687
1688		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1689
1690		iommu_free_page(ptr);
1691	}
1692}
1693
1694static void free_gcr3_tbl_level2(u64 *tbl)
1695{
1696	u64 *ptr;
1697	int i;
1698
1699	for (i = 0; i < 512; ++i) {
1700		if (!(tbl[i] & GCR3_VALID))
1701			continue;
1702
1703		ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK);
1704
1705		free_gcr3_tbl_level1(ptr);
1706	}
1707}
1708
1709static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info)
1710{
1711	if (gcr3_info->glx == 2)
1712		free_gcr3_tbl_level2(gcr3_info->gcr3_tbl);
1713	else if (gcr3_info->glx == 1)
1714		free_gcr3_tbl_level1(gcr3_info->gcr3_tbl);
1715	else
1716		WARN_ON_ONCE(gcr3_info->glx != 0);
1717
1718	gcr3_info->glx = 0;
1719
1720	/* Free per device domain ID */
1721	domain_id_free(gcr3_info->domid);
1722
1723	iommu_free_page(gcr3_info->gcr3_tbl);
1724	gcr3_info->gcr3_tbl = NULL;
1725}
1726
1727/*
 * Number of GCR3 table levels required. Each level is a 4-Kbyte
 * page and can contain up to 512 entries.
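 *
 * Example (illustrative): a device advertising 65536 PASIDs needs
 * get_count_order(65536) = 16 PASID bits, so this returns
 * DIV_ROUND_UP(16, 9) - 1 = 1, i.e. a two-level table.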
1730 */
1731static int get_gcr3_levels(int pasids)
1732{
1733	int levels;
1734
1735	if (pasids == -1)
1736		return amd_iommu_max_glx_val;
1737
1738	levels = get_count_order(pasids);
1739
1740	return levels ? (DIV_ROUND_UP(levels, 9) - 1) : levels;
1741}
1742
1743static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info,
1744			    struct amd_iommu *iommu, int pasids)
1745{
1746	int levels = get_gcr3_levels(pasids);
1747	int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
1748
1749	if (levels > amd_iommu_max_glx_val)
1750		return -EINVAL;
1751
1752	if (gcr3_info->gcr3_tbl)
1753		return -EBUSY;
1754
1755	/* Allocate per device domain ID */
1756	gcr3_info->domid = domain_id_alloc();
1757
1758	gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC);
1759	if (gcr3_info->gcr3_tbl == NULL) {
1760		domain_id_free(gcr3_info->domid);
1761		return -ENOMEM;
1762	}
1763
1764	gcr3_info->glx = levels;
1765
1766	return 0;
1767}
1768
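/*
 * Walk the GCR3 table for @pasid down to the leaf entry, optionally
 * allocating missing intermediate levels. Each level decodes 9 bits of the
 * PASID, so a table with glx == N resolves 9 * (N + 1) PASID bits.
 */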
1769static u64 *__get_gcr3_pte(struct gcr3_tbl_info *gcr3_info,
1770			   ioasid_t pasid, bool alloc)
1771{
1772	int index;
1773	u64 *pte;
1774	u64 *root = gcr3_info->gcr3_tbl;
1775	int level = gcr3_info->glx;
1776
1777	while (true) {
1778
1779		index = (pasid >> (9 * level)) & 0x1ff;
1780		pte   = &root[index];
1781
1782		if (level == 0)
1783			break;
1784
1785		if (!(*pte & GCR3_VALID)) {
1786			if (!alloc)
1787				return NULL;
1788
1789			root = (void *)get_zeroed_page(GFP_ATOMIC);
1790			if (root == NULL)
1791				return NULL;
1792
1793			*pte = iommu_virt_to_phys(root) | GCR3_VALID;
1794		}
1795
1796		root = iommu_phys_to_virt(*pte & PAGE_MASK);
1797
1798		level -= 1;
1799	}
1800
1801	return pte;
1802}
1803
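/*
 * Install or clear the guest CR3 value for @pasid in the per-device GCR3
 * table and flush the stale translations for that PASID.
 */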
1804static int update_gcr3(struct iommu_dev_data *dev_data,
1805		       ioasid_t pasid, unsigned long gcr3, bool set)
1806{
1807	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1808	u64 *pte;
1809
1810	pte = __get_gcr3_pte(gcr3_info, pasid, true);
1811	if (pte == NULL)
1812		return -ENOMEM;
1813
1814	if (set)
1815		*pte = (gcr3 & PAGE_MASK) | GCR3_VALID;
1816	else
1817		*pte = 0;
1818
1819	amd_iommu_dev_flush_pasid_all(dev_data, pasid);
1820	return 0;
1821}
1822
1823int amd_iommu_set_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid,
1824		       unsigned long gcr3)
1825{
1826	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1827	int ret;
1828
1829	iommu_group_mutex_assert(dev_data->dev);
1830
1831	ret = update_gcr3(dev_data, pasid, gcr3, true);
1832	if (ret)
1833		return ret;
1834
1835	gcr3_info->pasid_cnt++;
1836	return ret;
1837}
1838
1839int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
1840{
1841	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1842	int ret;
1843
1844	iommu_group_mutex_assert(dev_data->dev);
1845
1846	ret = update_gcr3(dev_data, pasid, 0, false);
1847	if (ret)
1848		return ret;
1849
1850	gcr3_info->pasid_cnt--;
1851	return ret;
1852}
1853
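/*
 * Populate the Device Table Entry for @dev_data: host page table root and
 * mode in data[0], domain ID in data[1], the (optional) GCR3 table pointer
 * split across data[0]/data[1], and the guest page table level in data[2].
 */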
1854static void set_dte_entry(struct amd_iommu *iommu,
1855			  struct iommu_dev_data *dev_data)
1856{
1857	u64 pte_root = 0;
1858	u64 flags = 0;
1859	u32 old_domid;
1860	u16 devid = dev_data->devid;
1861	u16 domid;
1862	struct protection_domain *domain = dev_data->domain;
1863	struct dev_table_entry *dev_table = get_dev_table(iommu);
1864	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
1865
1866	if (gcr3_info && gcr3_info->gcr3_tbl)
1867		domid = dev_data->gcr3_info.domid;
1868	else
1869		domid = domain->id;
1870
1871	if (domain->iop.mode != PAGE_MODE_NONE)
1872		pte_root = iommu_virt_to_phys(domain->iop.root);
1873
1874	pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
1875		    << DEV_ENTRY_MODE_SHIFT;
1876
1877	pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
1878
1879	/*
	 * When SNP is enabled, only set the TV bit when IOMMU
	 * page translation is in use.
1882	 */
1883	if (!amd_iommu_snp_en || (domid != 0))
1884		pte_root |= DTE_FLAG_TV;
1885
1886	flags = dev_table[devid].data[1];
1887
1888	if (dev_data->ats_enabled)
1889		flags |= DTE_FLAG_IOTLB;
1890
1891	if (dev_data->ppr)
1892		pte_root |= 1ULL << DEV_ENTRY_PPR;
1893
1894	if (domain->dirty_tracking)
1895		pte_root |= DTE_FLAG_HAD;
1896
1897	if (gcr3_info && gcr3_info->gcr3_tbl) {
1898		u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
1899		u64 glx  = gcr3_info->glx;
1900		u64 tmp;
1901
1902		pte_root |= DTE_FLAG_GV;
1903		pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
1904
1905		/* First mask out possible old values for GCR3 table */
1906		tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
1907		flags    &= ~tmp;
1908
1909		tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
1910		flags    &= ~tmp;
1911
1912		/* Encode GCR3 table into DTE */
1913		tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
1914		pte_root |= tmp;
1915
1916		tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
1917		flags    |= tmp;
1918
1919		tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
1920		flags    |= tmp;
1921
1922		if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
1923			dev_table[devid].data[2] |=
1924				((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
1925		}
1926
1927		/* GIOV is supported with V2 page table mode only */
1928		if (pdom_is_v2_pgtbl_mode(domain))
1929			pte_root |= DTE_FLAG_GIOV;
1930	}
1931
1932	flags &= ~DEV_DOMID_MASK;
1933	flags |= domid;
1934
1935	old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
1936	dev_table[devid].data[1]  = flags;
1937	dev_table[devid].data[0]  = pte_root;
1938
1939	/*
1940	 * A kdump kernel might be replacing a domain ID that was copied from
	 * the previous kernel; if so, it needs to flush the translation cache
	 * entries for the old domain ID that is being overwritten.
1943	 */
1944	if (old_domid) {
1945		amd_iommu_flush_tlb_domid(iommu, old_domid);
1946	}
1947}
1948
1949static void clear_dte_entry(struct amd_iommu *iommu, u16 devid)
1950{
1951	struct dev_table_entry *dev_table = get_dev_table(iommu);
1952
1953	/* remove entry from the device table seen by the hardware */
1954	dev_table[devid].data[0]  = DTE_FLAG_V;
1955
1956	if (!amd_iommu_snp_en)
1957		dev_table[devid].data[0] |= DTE_FLAG_TV;
1958
1959	dev_table[devid].data[1] &= DTE_FLAG_MASK;
1960
1961	amd_iommu_apply_erratum_63(iommu, devid);
1962}
1963
1964/* Update and flush DTE for the given device */
1965void amd_iommu_dev_update_dte(struct iommu_dev_data *dev_data, bool set)
1966{
1967	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
1968
1969	if (set)
1970		set_dte_entry(iommu, dev_data);
1971	else
1972		clear_dte_entry(iommu, dev_data->devid);
1973
1974	clone_aliases(iommu, dev_data->dev);
1975	device_flush_dte(dev_data);
1976	iommu_completion_wait(iommu);
1977}
1978
1979/*
 * If the domain is SVA capable then initialize the GCR3 table. Also, if the
 * domain is in v2 page table mode then update GCR3[0].
1982 */
1983static int init_gcr3_table(struct iommu_dev_data *dev_data,
1984			   struct protection_domain *pdom)
1985{
1986	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
1987	int max_pasids = dev_data->max_pasids;
1988	int ret = 0;
1989
1990	 /*
	  * If the domain is in pt mode then set up the GCR3 table only
	  * if the device is PASID capable.
1993	  */
1994	if (pdom_is_in_pt_mode(pdom) && !pdev_pasid_supported(dev_data))
1995		return ret;
1996
1997	/*
1998	 * By default, setup GCR3 table to support MAX PASIDs
1999	 * supported by the device/IOMMU.
2000	 */
2001	ret = setup_gcr3_table(&dev_data->gcr3_info, iommu,
2002			       max_pasids > 0 ?  max_pasids : 1);
2003	if (ret)
2004		return ret;
2005
2006	/* Setup GCR3[0] only if domain is setup with v2 page table mode */
2007	if (!pdom_is_v2_pgtbl_mode(pdom))
2008		return ret;
2009
2010	ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true);
2011	if (ret)
2012		free_gcr3_table(&dev_data->gcr3_info);
2013
2014	return ret;
2015}
2016
2017static void destroy_gcr3_table(struct iommu_dev_data *dev_data,
2018			       struct protection_domain *pdom)
2019{
2020	struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
2021
2022	if (pdom_is_v2_pgtbl_mode(pdom))
2023		update_gcr3(dev_data, 0, 0, false);
2024
2025	if (gcr3_info->gcr3_tbl == NULL)
2026		return;
2027
2028	free_gcr3_table(gcr3_info);
2029}
2030
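/*
 * Attach @dev_data to @domain: link the device into the domain's device
 * list, account it against the per-IOMMU and per-domain counters and, for
 * SVA-capable domains, set up its GCR3 table.
 */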
2031static int do_attach(struct iommu_dev_data *dev_data,
2032		     struct protection_domain *domain)
2033{
2034	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2035	int ret = 0;
2036
2037	/* Update data structures */
2038	dev_data->domain = domain;
2039	list_add(&dev_data->list, &domain->dev_list);
2040
2041	/* Update NUMA Node ID */
2042	if (domain->nid == NUMA_NO_NODE)
2043		domain->nid = dev_to_node(dev_data->dev);
2044
2045	/* Do reference counting */
2046	domain->dev_iommu[iommu->index] += 1;
2047	domain->dev_cnt                 += 1;
2048
2049	/* Setup GCR3 table */
2050	if (pdom_is_sva_capable(domain)) {
2051		ret = init_gcr3_table(dev_data, domain);
2052		if (ret)
2053			return ret;
2054	}
2055
2056	return ret;
2057}
2058
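/*
 * Undo do_attach(): tear down the GCR3 table for SVA-capable domains,
 * unlink the device, clear and flush its DTE, flush the domain's IOTLB
 * and finally drop the reference counts.
 */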
2059static void do_detach(struct iommu_dev_data *dev_data)
2060{
2061	struct protection_domain *domain = dev_data->domain;
2062	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2063
2064	/* Clear GCR3 table */
2065	if (pdom_is_sva_capable(domain))
2066		destroy_gcr3_table(dev_data, domain);
2067
2068	/* Update data structures */
2069	dev_data->domain = NULL;
2070	list_del(&dev_data->list);
2071
2072	/* Clear DTE and flush the entry */
2073	amd_iommu_dev_update_dte(dev_data, false);
2074
2075	/* Flush IOTLB and wait for the flushes to finish */
2076	amd_iommu_domain_flush_all(domain);
2077
2078	/* decrease reference counters - needs to happen after the flushes */
2079	domain->dev_iommu[iommu->index] -= 1;
2080	domain->dev_cnt                 -= 1;
2081}
2082
2083/*
2084 * If a device is not yet associated with a domain, this function makes the
2085 * device visible in the domain
2086 */
2087static int attach_device(struct device *dev,
2088			 struct protection_domain *domain)
2089{
2090	struct iommu_dev_data *dev_data;
2091	unsigned long flags;
2092	int ret = 0;
2093
2094	spin_lock_irqsave(&domain->lock, flags);
2095
2096	dev_data = dev_iommu_priv_get(dev);
2097
2098	spin_lock(&dev_data->lock);
2099
2100	if (dev_data->domain != NULL) {
2101		ret = -EBUSY;
2102		goto out;
2103	}
2104
2105	ret = do_attach(dev_data, domain);
2106
2107out:
2108	spin_unlock(&dev_data->lock);
2109
2110	spin_unlock_irqrestore(&domain->lock, flags);
2111
2112	return ret;
2113}
2114
2115/*
2116 * Remove a device from its protection domain; takes the required locks itself
2117 */
2118static void detach_device(struct device *dev)
2119{
2120	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2121	struct protection_domain *domain = dev_data->domain;
2122	struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
2123	unsigned long flags;
2124	bool ppr = dev_data->ppr;
2125
2126	spin_lock_irqsave(&domain->lock, flags);
2127
2128	spin_lock(&dev_data->lock);
2129
2130	/*
2131	 * First check if the device is still attached. It might already
2132	 * be detached from its domain because the generic
2133	 * iommu_detach_group code detached it and we try again here in
2134	 * our alias handling.
2135	 */
2136	if (WARN_ON(!dev_data->domain))
2137		goto out;
2138
2139	if (ppr) {
2140		iopf_queue_flush_dev(dev);
2141
2142		/* Cleared here so that the change gets reflected in the DTE */
2143		dev_data->ppr = false;
2144	}
2145
2146	do_detach(dev_data);
2147
2148out:
2149	spin_unlock(&dev_data->lock);
2150
2151	spin_unlock_irqrestore(&domain->lock, flags);
2152
2153	/* Remove IOPF handler */
2154	if (ppr)
2155		amd_iommu_iopf_remove_device(iommu, dev_data);
2156
2157	if (dev_is_pci(dev))
2158		pdev_disable_caps(to_pci_dev(dev));
2160}
2161
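/*
 * IOMMU core callback, invoked once per device: check that the device is
 * handled by an AMD IOMMU, initialize its iommu_dev_data and, when both
 * the IOMMU and the device support PASID, record the usable PASID count.
 */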
2162static struct iommu_device *amd_iommu_probe_device(struct device *dev)
2163{
2164	struct iommu_device *iommu_dev;
2165	struct amd_iommu *iommu;
2166	struct iommu_dev_data *dev_data;
2167	int ret;
2168
2169	if (!check_device(dev))
2170		return ERR_PTR(-ENODEV);
2171
2172	iommu = rlookup_amd_iommu(dev);
2173	if (!iommu)
2174		return ERR_PTR(-ENODEV);
2175
2176	/* Not registered yet? */
2177	if (!iommu->iommu.ops)
2178		return ERR_PTR(-ENODEV);
2179
2180	if (dev_iommu_priv_get(dev))
2181		return &iommu->iommu;
2182
2183	ret = iommu_init_device(iommu, dev);
2184	if (ret) {
2185		dev_err(dev, "Failed to initialize - trying to proceed anyway\n");
2186		iommu_dev = ERR_PTR(ret);
2187		iommu_ignore_device(iommu, dev);
2188	} else {
2189		amd_iommu_set_pci_msi_domain(dev, iommu);
2190		iommu_dev = &iommu->iommu;
2191	}
2192
2193	/*
2194	 * If both the IOMMU and the device support PASID, record the maximum
2195	 * number of usable PASIDs; otherwise dev_data->max_pasids stays zero.
2196	 */
2197	dev_data = dev_iommu_priv_get(dev);
2198	if (amd_iommu_pasid_supported() && dev_is_pci(dev) &&
2199	    pdev_pasid_supported(dev_data)) {
2200		dev_data->max_pasids = min_t(u32, iommu->iommu.max_pasids,
2201					     pci_max_pasids(to_pci_dev(dev)));
2202	}
2203
2204	iommu_completion_wait(iommu);
2205
2206	return iommu_dev;
2207}
2208
2209static void amd_iommu_release_device(struct device *dev)
2210{
2211	struct amd_iommu *iommu;
2212
2213	if (!check_device(dev))
2214		return;
2215
2216	iommu = rlookup_amd_iommu(dev);
2217	if (!iommu)
2218		return;
2219
2220	amd_iommu_uninit_device(dev);
2221	iommu_completion_wait(iommu);
2222}
2223
2224static struct iommu_group *amd_iommu_device_group(struct device *dev)
2225{
2226	if (dev_is_pci(dev))
2227		return pci_device_group(dev);
2228
2229	return acpihid_device_group(dev);
2230}
2231
2232/*****************************************************************************
2233 *
2234 * The following functions belong to the exported interface of AMD IOMMU
2235 *
2236 * This interface allows access to lower level functions of the IOMMU
2237 * like protection domain handling and assignment of devices to domains,
2238 * which is not possible with the dma_ops interface.
2239 *
2240 *****************************************************************************/
2241
2242static void cleanup_domain(struct protection_domain *domain)
2243{
2244	struct iommu_dev_data *entry;
2245
2246	lockdep_assert_held(&domain->lock);
2247
2248	if (!domain->dev_cnt)
2249		return;
2250
2251	while (!list_empty(&domain->dev_list)) {
2252		entry = list_first_entry(&domain->dev_list,
2253					 struct iommu_dev_data, list);
2254		BUG_ON(!entry->domain);
2255		do_detach(entry);
2256	}
2257	WARN_ON(domain->dev_cnt != 0);
2258}
2259
2260void protection_domain_free(struct protection_domain *domain)
2261{
2262	if (!domain)
2263		return;
2264
2265	if (domain->iop.pgtbl_cfg.tlb)
2266		free_io_pgtable_ops(&domain->iop.iop.ops);
2267
2268	if (domain->iop.root)
2269		iommu_free_page(domain->iop.root);
2270
2271	if (domain->id)
2272		domain_id_free(domain->id);
2273
2274	kfree(domain);
2275}
2276
2277static int protection_domain_init_v1(struct protection_domain *domain, int mode)
2278{
2279	u64 *pt_root = NULL;
2280
2281	BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL);
2282
2283	if (mode != PAGE_MODE_NONE) {
2284		pt_root = iommu_alloc_page(GFP_KERNEL);
2285		if (!pt_root)
2286			return -ENOMEM;
2287	}
2288
2289	domain->pd_mode = PD_MODE_V1;
2290	amd_iommu_domain_set_pgtable(domain, pt_root, mode);
2291
2292	return 0;
2293}
2294
2295static int protection_domain_init_v2(struct protection_domain *pdom)
2296{
2297	pdom->pd_mode = PD_MODE_V2;
2298	pdom->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2;
2299
2300	return 0;
2301}
2302
2303struct protection_domain *protection_domain_alloc(unsigned int type)
2304{
2305	struct io_pgtable_ops *pgtbl_ops;
2306	struct protection_domain *domain;
2307	int pgtable;
2308	int ret;
2309
2310	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2311	if (!domain)
2312		return NULL;
2313
2314	domain->id = domain_id_alloc();
2315	if (!domain->id)
2316		goto out_err;
2317
2318	spin_lock_init(&domain->lock);
2319	INIT_LIST_HEAD(&domain->dev_list);
2320	INIT_LIST_HEAD(&domain->dev_data_list);
2321	domain->nid = NUMA_NO_NODE;
2322
2323	switch (type) {
2324	/* No need to allocate io pgtable ops in passthrough mode */
2325	case IOMMU_DOMAIN_IDENTITY:
2326	case IOMMU_DOMAIN_SVA:
2327		return domain;
2328	case IOMMU_DOMAIN_DMA:
2329		pgtable = amd_iommu_pgtable;
2330		break;
2331	/*
2332	 * Force the IOMMU v1 page table when allocating
2333	 * a domain for pass-through devices.
2334	 */
2335	case IOMMU_DOMAIN_UNMANAGED:
2336		pgtable = AMD_IOMMU_V1;
2337		break;
2338	default:
2339		goto out_err;
2340	}
2341
2342	switch (pgtable) {
2343	case AMD_IOMMU_V1:
2344		ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL);
2345		break;
2346	case AMD_IOMMU_V2:
2347		ret = protection_domain_init_v2(domain);
2348		break;
2349	default:
2350		ret = -EINVAL;
2351		break;
2352	}
2353
2354	if (ret)
2355		goto out_err;
2356
2357	pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain);
2358	if (!pgtbl_ops)
2359		goto out_err;
2360
2361	return domain;
2362out_err:
2363	protection_domain_free(domain);
2364	return NULL;
2365}
2366
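/*
 * Highest DMA address a domain can translate: unlimited for the v1 page
 * table, bounded by the 4- or 5-level guest page table for v2.
 */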
2367static inline u64 dma_max_address(void)
2368{
2369	if (amd_iommu_pgtable == AMD_IOMMU_V1)
2370		return ~0ULL;
2371
2372	/* V2 with 4/5 level page table */
2373	return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
2374}
2375
2376static bool amd_iommu_hd_support(struct amd_iommu *iommu)
2377{
2378	return iommu && (iommu->features & FEATURE_HDSUP);
2379}
2380
2381static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
2382						  struct device *dev, u32 flags)
2383{
2384	bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
2385	struct protection_domain *domain;
2386	struct amd_iommu *iommu = NULL;
2387
2388	if (dev)
2389		iommu = get_amd_iommu_from_dev(dev);
2390
2391	/*
2392	 * Since DTE[Mode]=0 is prohibited on SNP-enabled systems, identity
2393	 * domains are not supported; IOMMU_DOMAIN_DMA[_FQ] is used instead.
2394	 */
2395	if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
2396		return ERR_PTR(-EINVAL);
2397
2398	if (dirty_tracking && !amd_iommu_hd_support(iommu))
2399		return ERR_PTR(-EOPNOTSUPP);
2400
2401	domain = protection_domain_alloc(type);
2402	if (!domain)
2403		return ERR_PTR(-ENOMEM);
2404
2405	domain->domain.geometry.aperture_start = 0;
2406	domain->domain.geometry.aperture_end   = dma_max_address();
2407	domain->domain.geometry.force_aperture = true;
2408
2409	if (iommu) {
2410		domain->domain.type = type;
2411		domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
2412		domain->domain.ops = iommu->iommu.ops->default_domain_ops;
2413
2414		if (dirty_tracking)
2415			domain->domain.dirty_ops = &amd_dirty_ops;
2416	}
2417
2418	return &domain->domain;
2419}
2420
2421static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
2422{
2423	struct iommu_domain *domain;
2424
2425	domain = do_iommu_domain_alloc(type, NULL, 0);
2426	if (IS_ERR(domain))
2427		return NULL;
2428
2429	return domain;
2430}
2431
2432static struct iommu_domain *
2433amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
2434			    struct iommu_domain *parent,
2435			    const struct iommu_user_data *user_data)
2437{
2438	unsigned int type = IOMMU_DOMAIN_UNMANAGED;
2439
2440	if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
2441		return ERR_PTR(-EOPNOTSUPP);
2442
2443	return do_iommu_domain_alloc(type, dev, flags);
2444}
2445
2446void amd_iommu_domain_free(struct iommu_domain *dom)
2447{
2448	struct protection_domain *domain;
2449	unsigned long flags;
2450
2451	if (!dom)
2452		return;
2453
2454	domain = to_pdomain(dom);
2455
2456	spin_lock_irqsave(&domain->lock, flags);
2457
2458	cleanup_domain(domain);
2459
2460	spin_unlock_irqrestore(&domain->lock, flags);
2461
2462	protection_domain_free(domain);
2463}
2464
2465static int amd_iommu_attach_device(struct iommu_domain *dom,
2466				   struct device *dev)
2467{
2468	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2469	struct protection_domain *domain = to_pdomain(dom);
2470	struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2471	struct pci_dev *pdev;
2472	int ret;
2473
2474	/*
2475	 * Skip attaching the device to the domain if the new domain is
2476	 * the same as the device's current domain.
2477	 */
2478	if (dev_data->domain == domain)
2479		return 0;
2480
2481	dev_data->defer_attach = false;
2482
2483	/*
2484	 * Restrict to devices with compatible IOMMU hardware support
2485	 * when enforcement of dirty tracking is enabled.
2486	 */
2487	if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
2488		return -EINVAL;
2489
2490	if (dev_data->domain)
2491		detach_device(dev);
2492
2493	ret = attach_device(dev, domain);
2494
2495#ifdef CONFIG_IRQ_REMAP
2496	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
2497		if (dom->type == IOMMU_DOMAIN_UNMANAGED)
2498			dev_data->use_vapic = 1;
2499		else
2500			dev_data->use_vapic = 0;
2501	}
2502#endif
2503
2504	pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL;
2505	if (pdev && pdom_is_sva_capable(domain)) {
2506		pdev_enable_caps(pdev);
2507
2508		/*
2509		 * The device can continue to function even if IOPF
2510		 * enablement fails, so in the error path just
2511		 * disable the device's PRI support.
2512		 */
2513		if (amd_iommu_iopf_add_device(iommu, dev_data))
2514			pdev_disable_cap_pri(pdev);
2515	} else if (pdev) {
2516		pdev_enable_cap_ats(pdev);
2517	}
2518
2519	/* Update device table */
2520	amd_iommu_dev_update_dte(dev_data, true);
2521
2522	return ret;
2523}
2524
2525static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom,
2526				    unsigned long iova, size_t size)
2527{
2528	struct protection_domain *domain = to_pdomain(dom);
2529	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2530
2531	if (ops->map_pages)
2532		domain_flush_np_cache(domain, iova, size);
2533	return 0;
2534}
2535
2536static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova,
2537			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
2538			       int iommu_prot, gfp_t gfp, size_t *mapped)
2539{
2540	struct protection_domain *domain = to_pdomain(dom);
2541	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2542	int prot = 0;
2543	int ret = -EINVAL;
2544
2545	if ((domain->pd_mode == PD_MODE_V1) &&
2546	    (domain->iop.mode == PAGE_MODE_NONE))
2547		return -EINVAL;
2548
2549	if (iommu_prot & IOMMU_READ)
2550		prot |= IOMMU_PROT_IR;
2551	if (iommu_prot & IOMMU_WRITE)
2552		prot |= IOMMU_PROT_IW;
2553
2554	if (ops->map_pages) {
2555		ret = ops->map_pages(ops, iova, paddr, pgsize,
2556				     pgcount, prot, gfp, mapped);
2557	}
2558
2559	return ret;
2560}
2561
2562static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain,
2563					    struct iommu_iotlb_gather *gather,
2564					    unsigned long iova, size_t size)
2565{
2566	/*
2567	 * AMD's IOMMU can flush as many pages as necessary in a single flush.
2568	 * Unless we run in a virtual machine, which can be inferred from
2569	 * whether the "non-present cache" is on, it is probably best to prefer
2570	 * (potentially) too extensive TLB flushing (i.e., more misses) over
2571	 * multiple TLB flushes (i.e., more flushes). For virtual machines the
2572	 * hypervisor needs to synchronize the host IOMMU PTEs with those of
2573	 * the guest, and the trade-off is different: unnecessary TLB flushes
2574	 * should be avoided.
2575	 */
2576	if (amd_iommu_np_cache &&
2577	    iommu_iotlb_gather_is_disjoint(gather, iova, size))
2578		iommu_iotlb_sync(domain, gather);
2579
2580	iommu_iotlb_gather_add_range(gather, iova, size);
2581}
2582
2583static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova,
2584				    size_t pgsize, size_t pgcount,
2585				    struct iommu_iotlb_gather *gather)
2586{
2587	struct protection_domain *domain = to_pdomain(dom);
2588	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2589	size_t r;
2590
2591	if ((domain->pd_mode == PD_MODE_V1) &&
2592	    (domain->iop.mode == PAGE_MODE_NONE))
2593		return 0;
2594
2595	r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0;
2596
2597	if (r)
2598		amd_iommu_iotlb_gather_add_page(dom, gather, iova, r);
2599
2600	return r;
2601}
2602
2603static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
2604					  dma_addr_t iova)
2605{
2606	struct protection_domain *domain = to_pdomain(dom);
2607	struct io_pgtable_ops *ops = &domain->iop.iop.ops;
2608
2609	return ops->iova_to_phys(ops, iova);
2610}
2611
2612static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
2613{
2614	switch (cap) {
2615	case IOMMU_CAP_CACHE_COHERENCY:
2616		return true;
2617	case IOMMU_CAP_NOEXEC:
2618		return false;
2619	case IOMMU_CAP_PRE_BOOT_PROTECTION:
2620		return amdr_ivrs_remap_support;
2621	case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
2622		return true;
2623	case IOMMU_CAP_DEFERRED_FLUSH:
2624		return true;
2625	case IOMMU_CAP_DIRTY_TRACKING: {
2626		struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
2627
2628		return amd_iommu_hd_support(iommu);
2629	}
2630	default:
2631		break;
2632	}
2633
2634	return false;
2635}
2636
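/*
 * Toggle hardware access/dirty (HAD) tracking for every device attached
 * to the domain: update and flush the DTEs first, then flush the IOTLB so
 * that dirty bits are (or stop being) recorded from the next translation.
 */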
2637static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
2638					bool enable)
2639{
2640	struct protection_domain *pdomain = to_pdomain(domain);
2641	struct dev_table_entry *dev_table;
2642	struct iommu_dev_data *dev_data;
2643	bool domain_flush = false;
2644	struct amd_iommu *iommu;
2645	unsigned long flags;
2646	u64 pte_root;
2647
2648	spin_lock_irqsave(&pdomain->lock, flags);
2649	if (!(pdomain->dirty_tracking ^ enable)) {
2650		spin_unlock_irqrestore(&pdomain->lock, flags);
2651		return 0;
2652	}
2653
2654	list_for_each_entry(dev_data, &pdomain->dev_list, list) {
2655		iommu = get_amd_iommu_from_dev_data(dev_data);
2656
2657		dev_table = get_dev_table(iommu);
2658		pte_root = dev_table[dev_data->devid].data[0];
2659
2660		pte_root = (enable ? pte_root | DTE_FLAG_HAD :
2661				     pte_root & ~DTE_FLAG_HAD);
2662
2663		/* Flush device DTE */
2664		dev_table[dev_data->devid].data[0] = pte_root;
2665		device_flush_dte(dev_data);
2666		domain_flush = true;
2667	}
2668
2669	/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
2670	if (domain_flush)
2671		amd_iommu_domain_flush_all(pdomain);
2672
2673	pdomain->dirty_tracking = enable;
2674	spin_unlock_irqrestore(&pdomain->lock, flags);
2675
2676	return 0;
2677}
2678
2679static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
2680					  unsigned long iova, size_t size,
2681					  unsigned long flags,
2682					  struct iommu_dirty_bitmap *dirty)
2683{
2684	struct protection_domain *pdomain = to_pdomain(domain);
2685	struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
2686	unsigned long lflags;
2687
2688	if (!ops || !ops->read_and_clear_dirty)
2689		return -EOPNOTSUPP;
2690
2691	spin_lock_irqsave(&pdomain->lock, lflags);
2692	if (!pdomain->dirty_tracking && dirty->bitmap) {
2693		spin_unlock_irqrestore(&pdomain->lock, lflags);
2694		return -EINVAL;
2695	}
2696	spin_unlock_irqrestore(&pdomain->lock, lflags);
2697
2698	return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
2699}
2700
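/*
 * Report reserved IOVA regions for @dev: the unity-mapped and exclusion
 * ranges described by the IVRS table for this device, plus the MSI and
 * HyperTransport address windows.
 */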
2701static void amd_iommu_get_resv_regions(struct device *dev,
2702				       struct list_head *head)
2703{
2704	struct iommu_resv_region *region;
2705	struct unity_map_entry *entry;
2706	struct amd_iommu *iommu;
2707	struct amd_iommu_pci_seg *pci_seg;
2708	int devid, sbdf;
2709
2710	sbdf = get_device_sbdf_id(dev);
2711	if (sbdf < 0)
2712		return;
2713
2714	devid = PCI_SBDF_TO_DEVID(sbdf);
2715	iommu = get_amd_iommu_from_dev(dev);
2716	pci_seg = iommu->pci_seg;
2717
2718	list_for_each_entry(entry, &pci_seg->unity_map, list) {
2719		int type, prot = 0;
2720		size_t length;
2721
2722		if (devid < entry->devid_start || devid > entry->devid_end)
2723			continue;
2724
2725		type   = IOMMU_RESV_DIRECT;
2726		length = entry->address_end - entry->address_start;
2727		if (entry->prot & IOMMU_PROT_IR)
2728			prot |= IOMMU_READ;
2729		if (entry->prot & IOMMU_PROT_IW)
2730			prot |= IOMMU_WRITE;
2731		if (entry->prot & IOMMU_UNITY_MAP_FLAG_EXCL_RANGE)
2732			/* Exclusion range */
2733			type = IOMMU_RESV_RESERVED;
2734
2735		region = iommu_alloc_resv_region(entry->address_start,
2736						 length, prot, type,
2737						 GFP_KERNEL);
2738		if (!region) {
2739			dev_err(dev, "Out of memory allocating dm-regions\n");
2740			return;
2741		}
2742		list_add_tail(&region->list, head);
2743	}
2744
2745	region = iommu_alloc_resv_region(MSI_RANGE_START,
2746					 MSI_RANGE_END - MSI_RANGE_START + 1,
2747					 0, IOMMU_RESV_MSI, GFP_KERNEL);
2748	if (!region)
2749		return;
2750	list_add_tail(&region->list, head);
2751
2752	region = iommu_alloc_resv_region(HT_RANGE_START,
2753					 HT_RANGE_END - HT_RANGE_START + 1,
2754					 0, IOMMU_RESV_RESERVED, GFP_KERNEL);
2755	if (!region)
2756		return;
2757	list_add_tail(&region->list, head);
2758}
2759
2760bool amd_iommu_is_attach_deferred(struct device *dev)
2761{
2762	struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
2763
2764	return dev_data->defer_attach;
2765}
2766
2767static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
2768{
2769	struct protection_domain *dom = to_pdomain(domain);
2770	unsigned long flags;
2771
2772	spin_lock_irqsave(&dom->lock, flags);
2773	amd_iommu_domain_flush_all(dom);
2774	spin_unlock_irqrestore(&dom->lock, flags);
2775}
2776
2777static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
2778				 struct iommu_iotlb_gather *gather)
2779{
2780	struct protection_domain *dom = to_pdomain(domain);
2781	unsigned long flags;
2782
2783	spin_lock_irqsave(&dom->lock, flags);
2784	amd_iommu_domain_flush_pages(dom, gather->start,
2785				     gather->end - gather->start + 1);
2786	spin_unlock_irqrestore(&dom->lock, flags);
2787}
2788
2789static int amd_iommu_def_domain_type(struct device *dev)
2790{
2791	struct iommu_dev_data *dev_data;
2792
2793	dev_data = dev_iommu_priv_get(dev);
2794	if (!dev_data)
2795		return 0;
2796
2797	/* Always use DMA domain for untrusted device */
2798	if (dev_is_pci(dev) && to_pci_dev(dev)->untrusted)
2799		return IOMMU_DOMAIN_DMA;
2800
2801	/*
2802	 * Do not identity map IOMMUv2 capable devices when:
2803	 *  - memory encryption is active, because some of those devices
2804	 *    (AMD GPUs) don't have the encryption bit in their DMA-mask
2805	 *    and require remapping.
2806	 *  - SNP is enabled, because it prohibits DTE[Mode]=0.
2807	 */
2808	if (pdev_pasid_supported(dev_data) &&
2809	    !cc_platform_has(CC_ATTR_MEM_ENCRYPT) &&
2810	    !amd_iommu_snp_en) {
2811		return IOMMU_DOMAIN_IDENTITY;
2812	}
2813
2814	return 0;
2815}
2816
2817static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
2818{
2819	/* IOMMU_PTE_FC is always set */
2820	return true;
2821}
2822
2823static const struct iommu_dirty_ops amd_dirty_ops = {
2824	.set_dirty_tracking = amd_iommu_set_dirty_tracking,
2825	.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
2826};
2827
2828static int amd_iommu_dev_enable_feature(struct device *dev,
2829					enum iommu_dev_features feat)
2830{
2831	int ret = 0;
2832
2833	switch (feat) {
2834	case IOMMU_DEV_FEAT_IOPF:
2835	case IOMMU_DEV_FEAT_SVA:
2836		break;
2837	default:
2838		ret = -EINVAL;
2839		break;
2840	}
2841	return ret;
2842}
2843
2844static int amd_iommu_dev_disable_feature(struct device *dev,
2845					 enum iommu_dev_features feat)
2846{
2847	int ret = 0;
2848
2849	switch (feat) {
2850	case IOMMU_DEV_FEAT_IOPF:
2851	case IOMMU_DEV_FEAT_SVA:
2852		break;
2853	default:
2854		ret = -EINVAL;
2855		break;
2856	}
2857	return ret;
2858}
2859
2860const struct iommu_ops amd_iommu_ops = {
2861	.capable = amd_iommu_capable,
2862	.domain_alloc = amd_iommu_domain_alloc,
2863	.domain_alloc_user = amd_iommu_domain_alloc_user,
2864	.domain_alloc_sva = amd_iommu_domain_alloc_sva,
2865	.probe_device = amd_iommu_probe_device,
2866	.release_device = amd_iommu_release_device,
2867	.device_group = amd_iommu_device_group,
2868	.get_resv_regions = amd_iommu_get_resv_regions,
2869	.is_attach_deferred = amd_iommu_is_attach_deferred,
2870	.pgsize_bitmap	= AMD_IOMMU_PGSIZES,
2871	.def_domain_type = amd_iommu_def_domain_type,
2872	.dev_enable_feat = amd_iommu_dev_enable_feature,
2873	.dev_disable_feat = amd_iommu_dev_disable_feature,
2874	.remove_dev_pasid = amd_iommu_remove_dev_pasid,
2875	.page_response = amd_iommu_page_response,
2876	.default_domain_ops = &(const struct iommu_domain_ops) {
2877		.attach_dev	= amd_iommu_attach_device,
2878		.map_pages	= amd_iommu_map_pages,
2879		.unmap_pages	= amd_iommu_unmap_pages,
2880		.iotlb_sync_map	= amd_iommu_iotlb_sync_map,
2881		.iova_to_phys	= amd_iommu_iova_to_phys,
2882		.flush_iotlb_all = amd_iommu_flush_iotlb_all,
2883		.iotlb_sync	= amd_iommu_iotlb_sync,
2884		.free		= amd_iommu_domain_free,
2885		.enforce_cache_coherency = amd_iommu_enforce_cache_coherency,
2886	}
2887};
2888
2889#ifdef CONFIG_IRQ_REMAP
2890
2891/*****************************************************************************
2892 *
2893 * Interrupt Remapping Implementation
2894 *
2895 *****************************************************************************/
2896
2897static struct irq_chip amd_ir_chip;
2898static DEFINE_SPINLOCK(iommu_table_lock);
2899
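/*
 * Invalidate the interrupt remapping table cache for @devid and wait for
 * the invalidation to complete. A no-op when IRT caching is disabled.
 */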
2900static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
2901{
2902	int ret;
2903	u64 data;
2904	unsigned long flags;
2905	struct iommu_cmd cmd, cmd2;
2906
2907	if (iommu->irtcachedis_enabled)
2908		return;
2909
2910	build_inv_irt(&cmd, devid);
2911	data = atomic64_add_return(1, &iommu->cmd_sem_val);
2912	build_completion_wait(&cmd2, iommu, data);
2913
2914	raw_spin_lock_irqsave(&iommu->lock, flags);
2915	ret = __iommu_queue_command_sync(iommu, &cmd, true);
2916	if (ret)
2917		goto out;
2918	ret = __iommu_queue_command_sync(iommu, &cmd2, false);
2919	if (ret)
2920		goto out;
2921	wait_on_sem(iommu, data);
2922out:
2923	raw_spin_unlock_irqrestore(&iommu->lock, flags);
2924}
2925
2926static void set_dte_irq_entry(struct amd_iommu *iommu, u16 devid,
2927			      struct irq_remap_table *table)
2928{
2929	u64 dte;
2930	struct dev_table_entry *dev_table = get_dev_table(iommu);
2931
2932	dte	= dev_table[devid].data[2];
2933	dte	&= ~DTE_IRQ_PHYS_ADDR_MASK;
2934	dte	|= iommu_virt_to_phys(table->table);
2935	dte	|= DTE_IRQ_REMAP_INTCTL;
2936	dte	|= DTE_INTTABLEN;
2937	dte	|= DTE_IRQ_REMAP_ENABLE;
2938
2939	dev_table[devid].data[2] = dte;
2940}
2941
2942static struct irq_remap_table *get_irq_table(struct amd_iommu *iommu, u16 devid)
2943{
2944	struct irq_remap_table *table;
2945	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2946
2947	if (WARN_ONCE(!pci_seg->rlookup_table[devid],
2948		      "%s: no iommu for devid %x:%x\n",
2949		      __func__, pci_seg->id, devid))
2950		return NULL;
2951
2952	table = pci_seg->irq_lookup_table[devid];
2953	if (WARN_ONCE(!table, "%s: no table for devid %x:%x\n",
2954		      __func__, pci_seg->id, devid))
2955		return NULL;
2956
2957	return table;
2958}
2959
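/*
 * Allocate an empty interrupt remapping table and zero it according to
 * the active IRTE format (32-bit legacy entries or 128-bit GA entries).
 */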
2960static struct irq_remap_table *__alloc_irq_table(void)
2961{
2962	struct irq_remap_table *table;
2963
2964	table = kzalloc(sizeof(*table), GFP_KERNEL);
2965	if (!table)
2966		return NULL;
2967
2968	table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL);
2969	if (!table->table) {
2970		kfree(table);
2971		return NULL;
2972	}
2973	raw_spin_lock_init(&table->lock);
2974
2975	if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
2976		memset(table->table, 0,
2977		       MAX_IRQS_PER_TABLE * sizeof(u32));
2978	else
2979		memset(table->table, 0,
2980		       (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2)));
2981	return table;
2982}
2983
2984static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid,
2985				  struct irq_remap_table *table)
2986{
2987	struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg;
2988
2989	pci_seg->irq_lookup_table[devid] = table;
2990	set_dte_irq_entry(iommu, devid, table);
2991	iommu_flush_dte(iommu, devid);
2992}
2993
2994static int set_remap_table_entry_alias(struct pci_dev *pdev, u16 alias,
2995				       void *data)
2996{
2997	struct irq_remap_table *table = data;
2998	struct amd_iommu_pci_seg *pci_seg;
2999	struct amd_iommu *iommu = rlookup_amd_iommu(&pdev->dev);
3000
3001	if (!iommu)
3002		return -EINVAL;
3003
3004	pci_seg = iommu->pci_seg;
3005	pci_seg->irq_lookup_table[alias] = table;
3006	set_dte_irq_entry(iommu, alias, table);
3007	iommu_flush_dte(pci_seg->rlookup_table[alias], alias);
3008
3009	return 0;
3010}
3011
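/*
 * Look up the interrupt remapping table for @devid, allocating one if it
 * does not exist yet. The table is shared with the device's PCI DMA
 * alias(es), and the corresponding DTEs are updated to point at it.
 */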
3012static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
3013					       u16 devid, struct pci_dev *pdev)
3014{
3015	struct irq_remap_table *table = NULL;
3016	struct irq_remap_table *new_table = NULL;
3017	struct amd_iommu_pci_seg *pci_seg;
3018	unsigned long flags;
3019	u16 alias;
3020
3021	spin_lock_irqsave(&iommu_table_lock, flags);
3022
3023	pci_seg = iommu->pci_seg;
3024	table = pci_seg->irq_lookup_table[devid];
3025	if (table)
3026		goto out_unlock;
3027
3028	alias = pci_seg->alias_table[devid];
3029	table = pci_seg->irq_lookup_table[alias];
3030	if (table) {
3031		set_remap_table_entry(iommu, devid, table);
3032		goto out_wait;
3033	}
3034	spin_unlock_irqrestore(&iommu_table_lock, flags);
3035
3036	/* Nothing there yet, allocate a new irq remapping table */
3037	new_table = __alloc_irq_table();
3038	if (!new_table)
3039		return NULL;
3040
3041	spin_lock_irqsave(&iommu_table_lock, flags);
3042
3043	table = pci_seg->irq_lookup_table[devid];
3044	if (table)
3045		goto out_unlock;
3046
3047	table = pci_seg->irq_lookup_table[alias];
3048	if (table) {
3049		set_remap_table_entry(iommu, devid, table);
3050		goto out_wait;
3051	}
3052
3053	table = new_table;
3054	new_table = NULL;
3055
3056	if (pdev)
3057		pci_for_each_dma_alias(pdev, set_remap_table_entry_alias,
3058				       table);
3059	else
3060		set_remap_table_entry(iommu, devid, table);
3061
3062	if (devid != alias)
3063		set_remap_table_entry(iommu, alias, table);
3064
3065out_wait:
3066	iommu_completion_wait(iommu);
3067
3068out_unlock:
3069	spin_unlock_irqrestore(&iommu_table_lock, flags);
3070
3071	if (new_table) {
3072		kmem_cache_free(amd_iommu_irq_cache, new_table->table);
3073		kfree(new_table);
3074	}
3075	return table;
3076}
3077
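/*
 * Reserve @count consecutive entries in the device's interrupt remapping
 * table (aligned to a power of two when @align is set, as required for
 * multi-MSI). Returns the first allocated index or a negative error.
 */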
3078static int alloc_irq_index(struct amd_iommu *iommu, u16 devid, int count,
3079			   bool align, struct pci_dev *pdev)
3080{
3081	struct irq_remap_table *table;
3082	int index, c, alignment = 1;
3083	unsigned long flags;
3084
3085	table = alloc_irq_table(iommu, devid, pdev);
3086	if (!table)
3087		return -ENODEV;
3088
3089	if (align)
3090		alignment = roundup_pow_of_two(count);
3091
3092	raw_spin_lock_irqsave(&table->lock, flags);
3093
3094	/* Scan table for free entries */
3095	for (index = ALIGN(table->min_index, alignment), c = 0;
3096	     index < MAX_IRQS_PER_TABLE;) {
3097		if (!iommu->irte_ops->is_allocated(table, index)) {
3098			c += 1;
3099		} else {
3100			c     = 0;
3101			index = ALIGN(index + 1, alignment);
3102			continue;
3103		}
3104
3105		if (c == count) {
3106			for (; c != 0; --c)
3107				iommu->irte_ops->set_allocated(table, index - c + 1);
3108
3109			index -= count - 1;
3110			goto out;
3111		}
3112
3113		index++;
3114	}
3115
3116	index = -ENOSPC;
3117
3118out:
3119	raw_spin_unlock_irqrestore(&table->lock, flags);
3120
3121	return index;
3122}
3123
3124static int __modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3125			    struct irte_ga *irte)
3126{
3127	struct irq_remap_table *table;
3128	struct irte_ga *entry;
3129	unsigned long flags;
3130	u128 old;
3131
3132	table = get_irq_table(iommu, devid);
3133	if (!table)
3134		return -ENOMEM;
3135
3136	raw_spin_lock_irqsave(&table->lock, flags);
3137
3138	entry = (struct irte_ga *)table->table;
3139	entry = &entry[index];
3140
3141	/*
3142	 * We use a 128-bit cmpxchg to atomically update the IRTE. The
3143	 * entry cannot be updated by the hardware or other processors
3144	 * behind our back, so the cmpxchg is expected to succeed and
3145	 * return the old value.
3146	 */
3147	old = entry->irte;
3148	WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
3149
3150	raw_spin_unlock_irqrestore(&table->lock, flags);
3151
3152	return 0;
3153}
3154
3155static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
3156			  struct irte_ga *irte)
3157{
3158	int ret;
3159
3160	ret = __modify_irte_ga(iommu, devid, index, irte);
3161	if (ret)
3162		return ret;
3163
3164	iommu_flush_irt_and_complete(iommu, devid);
3165
3166	return 0;
3167}
3168
3169static int modify_irte(struct amd_iommu *iommu,
3170		       u16 devid, int index, union irte *irte)
3171{
3172	struct irq_remap_table *table;
3173	unsigned long flags;
3174
3175	table = get_irq_table(iommu, devid);
3176	if (!table)
3177		return -ENOMEM;
3178
3179	raw_spin_lock_irqsave(&table->lock, flags);
3180	table->table[index] = irte->val;
3181	raw_spin_unlock_irqrestore(&table->lock, flags);
3182
3183	iommu_flush_irt_and_complete(iommu, devid);
3184
3185	return 0;
3186}
3187
3188static void free_irte(struct amd_iommu *iommu, u16 devid, int index)
3189{
3190	struct irq_remap_table *table;
3191	unsigned long flags;
3192
3193	table = get_irq_table(iommu, devid);
3194	if (!table)
3195		return;
3196
3197	raw_spin_lock_irqsave(&table->lock, flags);
3198	iommu->irte_ops->clear_allocated(table, index);
3199	raw_spin_unlock_irqrestore(&table->lock, flags);
3200
3201	iommu_flush_irt_and_complete(iommu, devid);
3202}
3203
3204static void irte_prepare(void *entry,
3205			 u32 delivery_mode, bool dest_mode,
3206			 u8 vector, u32 dest_apicid, int devid)
3207{
3208	union irte *irte = (union irte *) entry;
3209
3210	irte->val                = 0;
3211	irte->fields.vector      = vector;
3212	irte->fields.int_type    = delivery_mode;
3213	irte->fields.destination = dest_apicid;
3214	irte->fields.dm          = dest_mode;
3215	irte->fields.valid       = 1;
3216}
3217
3218static void irte_ga_prepare(void *entry,
3219			    u32 delivery_mode, bool dest_mode,
3220			    u8 vector, u32 dest_apicid, int devid)
3221{
3222	struct irte_ga *irte = (struct irte_ga *) entry;
3223
3224	irte->lo.val                      = 0;
3225	irte->hi.val                      = 0;
3226	irte->lo.fields_remap.int_type    = delivery_mode;
3227	irte->lo.fields_remap.dm          = dest_mode;
3228	irte->hi.fields.vector            = vector;
3229	irte->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(dest_apicid);
3230	irte->hi.fields.destination       = APICID_TO_IRTE_DEST_HI(dest_apicid);
3231	irte->lo.fields_remap.valid       = 1;
3232}
3233
3234static void irte_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3235{
3236	union irte *irte = (union irte *) entry;
3237
3238	irte->fields.valid = 1;
3239	modify_irte(iommu, devid, index, irte);
3240}
3241
3242static void irte_ga_activate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3243{
3244	struct irte_ga *irte = (struct irte_ga *) entry;
3245
3246	irte->lo.fields_remap.valid = 1;
3247	modify_irte_ga(iommu, devid, index, irte);
3248}
3249
3250static void irte_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3251{
3252	union irte *irte = (union irte *) entry;
3253
3254	irte->fields.valid = 0;
3255	modify_irte(iommu, devid, index, irte);
3256}
3257
3258static void irte_ga_deactivate(struct amd_iommu *iommu, void *entry, u16 devid, u16 index)
3259{
3260	struct irte_ga *irte = (struct irte_ga *) entry;
3261
3262	irte->lo.fields_remap.valid = 0;
3263	modify_irte_ga(iommu, devid, index, irte);
3264}
3265
3266static void irte_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3267			      u8 vector, u32 dest_apicid)
3268{
3269	union irte *irte = (union irte *) entry;
3270
3271	irte->fields.vector = vector;
3272	irte->fields.destination = dest_apicid;
3273	modify_irte(iommu, devid, index, irte);
3274}
3275
3276static void irte_ga_set_affinity(struct amd_iommu *iommu, void *entry, u16 devid, u16 index,
3277				 u8 vector, u32 dest_apicid)
3278{
3279	struct irte_ga *irte = (struct irte_ga *) entry;
3280
3281	if (!irte->lo.fields_remap.guest_mode) {
3282		irte->hi.fields.vector = vector;
3283		irte->lo.fields_remap.destination =
3284					APICID_TO_IRTE_DEST_LO(dest_apicid);
3285		irte->hi.fields.destination =
3286					APICID_TO_IRTE_DEST_HI(dest_apicid);
3287		modify_irte_ga(iommu, devid, index, irte);
3288	}
3289}
3290
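/*
 * Marker for an allocated but not yet programmed 32-bit IRTE: any non-zero
 * value counts as allocated, and keeping the valid bit clear ensures the
 * slot does not remap anything yet.
 */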
3291#define IRTE_ALLOCATED (~1U)
3292static void irte_set_allocated(struct irq_remap_table *table, int index)
3293{
3294	table->table[index] = IRTE_ALLOCATED;
3295}
3296
3297static void irte_ga_set_allocated(struct irq_remap_table *table, int index)
3298{
3299	struct irte_ga *ptr = (struct irte_ga *)table->table;
3300	struct irte_ga *irte = &ptr[index];
3301
3302	memset(&irte->lo.val, 0, sizeof(u64));
3303	memset(&irte->hi.val, 0, sizeof(u64));
3304	irte->hi.fields.vector = 0xff;
3305}
3306
3307static bool irte_is_allocated(struct irq_remap_table *table, int index)
3308{
3309	union irte *ptr = (union irte *)table->table;
3310	union irte *irte = &ptr[index];
3311
3312	return irte->val != 0;
3313}
3314
3315static bool irte_ga_is_allocated(struct irq_remap_table *table, int index)
3316{
3317	struct irte_ga *ptr = (struct irte_ga *)table->table;
3318	struct irte_ga *irte = &ptr[index];
3319
3320	return irte->hi.fields.vector != 0;
3321}
3322
3323static void irte_clear_allocated(struct irq_remap_table *table, int index)
3324{
3325	table->table[index] = 0;
3326}
3327
3328static void irte_ga_clear_allocated(struct irq_remap_table *table, int index)
3329{
3330	struct irte_ga *ptr = (struct irte_ga *)table->table;
3331	struct irte_ga *irte = &ptr[index];
3332
3333	memset(&irte->lo.val, 0, sizeof(u64));
3334	memset(&irte->hi.val, 0, sizeof(u64));
3335}
3336
3337static int get_devid(struct irq_alloc_info *info)
3338{
3339	switch (info->type) {
3340	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3341		return get_ioapic_devid(info->devid);
3342	case X86_IRQ_ALLOC_TYPE_HPET:
3343		return get_hpet_devid(info->devid);
3344	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3345	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3346		return get_device_sbdf_id(msi_desc_to_dev(info->desc));
3347	default:
3348		WARN_ON_ONCE(1);
3349		return -1;
3350	}
3351}
3352
3353struct irq_remap_ops amd_iommu_irq_ops = {
3354	.prepare		= amd_iommu_prepare,
3355	.enable			= amd_iommu_enable,
3356	.disable		= amd_iommu_disable,
3357	.reenable		= amd_iommu_reenable,
3358	.enable_faulting	= amd_iommu_enable_faulting,
3359};
3360
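/*
 * Compose the MSI message for a remapped interrupt: the address points at
 * the standard x86 MSI window and the data field carries the index into
 * the device's interrupt remapping table.
 */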
3361static void fill_msi_msg(struct msi_msg *msg, u32 index)
3362{
3363	msg->data = index;
3364	msg->address_lo = 0;
3365	msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
3366	msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
3367}
3368
3369static void irq_remapping_prepare_irte(struct amd_ir_data *data,
3370				       struct irq_cfg *irq_cfg,
3371				       struct irq_alloc_info *info,
3372				       int devid, int index, int sub_handle)
3373{
3374	struct irq_2_irte *irte_info = &data->irq_2_irte;
3375	struct amd_iommu *iommu = data->iommu;
3376
3377	if (!iommu)
3378		return;
3379
3380	data->irq_2_irte.devid = devid;
3381	data->irq_2_irte.index = index + sub_handle;
3382	iommu->irte_ops->prepare(data->entry, APIC_DELIVERY_MODE_FIXED,
3383				 apic->dest_mode_logical, irq_cfg->vector,
3384				 irq_cfg->dest_apicid, devid);
3385
3386	switch (info->type) {
3387	case X86_IRQ_ALLOC_TYPE_IOAPIC:
3388	case X86_IRQ_ALLOC_TYPE_HPET:
3389	case X86_IRQ_ALLOC_TYPE_PCI_MSI:
3390	case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
3391		fill_msi_msg(&data->msi_entry, irte_info->index);
3392		break;
3393
3394	default:
3395		BUG_ON(1);
3396		break;
3397	}
3398}
3399
3400struct amd_irte_ops irte_32_ops = {
3401	.prepare = irte_prepare,
3402	.activate = irte_activate,
3403	.deactivate = irte_deactivate,
3404	.set_affinity = irte_set_affinity,
3405	.set_allocated = irte_set_allocated,
3406	.is_allocated = irte_is_allocated,
3407	.clear_allocated = irte_clear_allocated,
3408};
3409
3410struct amd_irte_ops irte_128_ops = {
3411	.prepare = irte_ga_prepare,
3412	.activate = irte_ga_activate,
3413	.deactivate = irte_ga_deactivate,
3414	.set_affinity = irte_ga_set_affinity,
3415	.set_allocated = irte_ga_set_allocated,
3416	.is_allocated = irte_ga_is_allocated,
3417	.clear_allocated = irte_ga_clear_allocated,
3418};
3419
3420static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq,
3421			       unsigned int nr_irqs, void *arg)
3422{
3423	struct irq_alloc_info *info = arg;
3424	struct irq_data *irq_data;
3425	struct amd_ir_data *data = NULL;
3426	struct amd_iommu *iommu;
3427	struct irq_cfg *cfg;
3428	int i, ret, devid, seg, sbdf;
3429	int index;
3430
3431	if (!info)
3432		return -EINVAL;
3433	if (nr_irqs > 1 && info->type != X86_IRQ_ALLOC_TYPE_PCI_MSI)
3434		return -EINVAL;
3435
3436	sbdf = get_devid(info);
3437	if (sbdf < 0)
3438		return -EINVAL;
3439
3440	seg = PCI_SBDF_TO_SEGID(sbdf);
3441	devid = PCI_SBDF_TO_DEVID(sbdf);
3442	iommu = __rlookup_amd_iommu(seg, devid);
3443	if (!iommu)
3444		return -EINVAL;
3445
3446	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
3447	if (ret < 0)
3448		return ret;
3449
3450	if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) {
3451		struct irq_remap_table *table;
3452
3453		table = alloc_irq_table(iommu, devid, NULL);
3454		if (table) {
3455			if (!table->min_index) {
3456				/*
3457				 * Keep the first 32 indexes free for IOAPIC
3458				 * interrupts.
3459				 */
3460				table->min_index = 32;
3461				for (i = 0; i < 32; ++i)
3462					iommu->irte_ops->set_allocated(table, i);
3463			}
3464			WARN_ON(table->min_index != 32);
3465			index = info->ioapic.pin;
3466		} else {
3467			index = -ENOMEM;
3468		}
3469	} else if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI ||
3470		   info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX) {
3471		bool align = (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI);
3472
3473		index = alloc_irq_index(iommu, devid, nr_irqs, align,
3474					msi_desc_to_pci_dev(info->desc));
3475	} else {
3476		index = alloc_irq_index(iommu, devid, nr_irqs, false, NULL);
3477	}
3478
3479	if (index < 0) {
3480		pr_warn("Failed to allocate IRTE\n");
3481		ret = index;
3482		goto out_free_parent;
3483	}
3484
3485	for (i = 0; i < nr_irqs; i++) {
3486		irq_data = irq_domain_get_irq_data(domain, virq + i);
3487		cfg = irq_data ? irqd_cfg(irq_data) : NULL;
3488		if (!cfg) {
3489			ret = -EINVAL;
3490			goto out_free_data;
3491		}
3492
3493		ret = -ENOMEM;
3494		data = kzalloc(sizeof(*data), GFP_KERNEL);
3495		if (!data)
3496			goto out_free_data;
3497
3498		if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir))
3499			data->entry = kzalloc(sizeof(union irte), GFP_KERNEL);
3500		else
3501			data->entry = kzalloc(sizeof(struct irte_ga),
3502						     GFP_KERNEL);
3503		if (!data->entry) {
3504			kfree(data);
3505			goto out_free_data;
3506		}
3507
3508		data->iommu = iommu;
3509		irq_data->hwirq = (devid << 16) + i;
3510		irq_data->chip_data = data;
3511		irq_data->chip = &amd_ir_chip;
3512		irq_remapping_prepare_irte(data, cfg, info, devid, index, i);
3513		irq_set_status_flags(virq + i, IRQ_MOVE_PCNTXT);
3514	}
3515
3516	return 0;
3517
3518out_free_data:
3519	for (i--; i >= 0; i--) {
3520		irq_data = irq_domain_get_irq_data(domain, virq + i);
3521		if (irq_data)
3522			kfree(irq_data->chip_data);
3523	}
3524	for (i = 0; i < nr_irqs; i++)
3525		free_irte(iommu, devid, index + i);
3526out_free_parent:
3527	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3528	return ret;
3529}
3530
3531static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
3532			       unsigned int nr_irqs)
3533{
3534	struct irq_2_irte *irte_info;
3535	struct irq_data *irq_data;
3536	struct amd_ir_data *data;
3537	int i;
3538
3539	for (i = 0; i < nr_irqs; i++) {
3540		irq_data = irq_domain_get_irq_data(domain, virq + i);
3541		if (irq_data && irq_data->chip_data) {
3542			data = irq_data->chip_data;
3543			irte_info = &data->irq_2_irte;
3544			free_irte(data->iommu, irte_info->devid, irte_info->index);
3545			kfree(data->entry);
3546			kfree(data);
3547		}
3548	}
3549	irq_domain_free_irqs_common(domain, virq, nr_irqs);
3550}
3551
3552static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3553			       struct amd_ir_data *ir_data,
3554			       struct irq_2_irte *irte_info,
3555			       struct irq_cfg *cfg);
3556
3557static int irq_remapping_activate(struct irq_domain *domain,
3558				  struct irq_data *irq_data, bool reserve)
3559{
3560	struct amd_ir_data *data = irq_data->chip_data;
3561	struct irq_2_irte *irte_info = &data->irq_2_irte;
3562	struct amd_iommu *iommu = data->iommu;
3563	struct irq_cfg *cfg = irqd_cfg(irq_data);
3564
3565	if (!iommu)
3566		return 0;
3567
3568	iommu->irte_ops->activate(iommu, data->entry, irte_info->devid,
3569				  irte_info->index);
3570	amd_ir_update_irte(irq_data, iommu, data, irte_info, cfg);
3571	return 0;
3572}
3573
3574static void irq_remapping_deactivate(struct irq_domain *domain,
3575				     struct irq_data *irq_data)
3576{
3577	struct amd_ir_data *data = irq_data->chip_data;
3578	struct irq_2_irte *irte_info = &data->irq_2_irte;
3579	struct amd_iommu *iommu = data->iommu;
3580
3581	if (iommu)
3582		iommu->irte_ops->deactivate(iommu, data->entry, irte_info->devid,
3583					    irte_info->index);
3584}
3585
3586static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec,
3587				enum irq_domain_bus_token bus_token)
3588{
3589	struct amd_iommu *iommu;
3590	int devid = -1;
3591
3592	if (!amd_iommu_irq_remap)
3593		return 0;
3594
3595	if (x86_fwspec_is_ioapic(fwspec))
3596		devid = get_ioapic_devid(fwspec->param[0]);
3597	else if (x86_fwspec_is_hpet(fwspec))
3598		devid = get_hpet_devid(fwspec->param[0]);
3599
3600	if (devid < 0)
3601		return 0;
3602	iommu = __rlookup_amd_iommu((devid >> 16), (devid & 0xffff));
3603
3604	return iommu && iommu->ir_domain == d;
3605}
3606
3607static const struct irq_domain_ops amd_ir_domain_ops = {
3608	.select = irq_remapping_select,
3609	.alloc = irq_remapping_alloc,
3610	.free = irq_remapping_free,
3611	.activate = irq_remapping_activate,
3612	.deactivate = irq_remapping_deactivate,
3613};
3614
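/*
 * Switch an IRTE into guest (vAPIC) mode: the entry is rewritten so that
 * interrupts are delivered in guest mode using the ga_root_ptr, vector and
 * ga_tag provided by the caller.
 */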
3615int amd_iommu_activate_guest_mode(void *data)
3616{
3617	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3618	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3619	u64 valid;
3620
3621	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
3622		return 0;
3623
3624	valid = entry->lo.fields_vapic.valid;
3625
3626	entry->lo.val = 0;
3627	entry->hi.val = 0;
3628
3629	entry->lo.fields_vapic.valid       = valid;
3630	entry->lo.fields_vapic.guest_mode  = 1;
3631	entry->lo.fields_vapic.ga_log_intr = 1;
3632	entry->hi.fields.ga_root_ptr       = ir_data->ga_root_ptr;
3633	entry->hi.fields.vector            = ir_data->ga_vector;
3634	entry->lo.fields_vapic.ga_tag      = ir_data->ga_tag;
3635
3636	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3637			      ir_data->irq_2_irte.index, entry);
3638}
3639EXPORT_SYMBOL(amd_iommu_activate_guest_mode);
3640
3641int amd_iommu_deactivate_guest_mode(void *data)
3642{
3643	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3644	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3645	struct irq_cfg *cfg = ir_data->cfg;
3646	u64 valid;
3647
3648	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3649	    !entry || !entry->lo.fields_vapic.guest_mode)
3650		return 0;
3651
3652	valid = entry->lo.fields_remap.valid;
3653
3654	entry->lo.val = 0;
3655	entry->hi.val = 0;
3656
3657	entry->lo.fields_remap.valid       = valid;
3658	entry->lo.fields_remap.dm          = apic->dest_mode_logical;
3659	entry->lo.fields_remap.int_type    = APIC_DELIVERY_MODE_FIXED;
3660	entry->hi.fields.vector            = cfg->vector;
3661	entry->lo.fields_remap.destination =
3662				APICID_TO_IRTE_DEST_LO(cfg->dest_apicid);
3663	entry->hi.fields.destination =
3664				APICID_TO_IRTE_DEST_HI(cfg->dest_apicid);
3665
3666	return modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3667			      ir_data->irq_2_irte.index, entry);
3668}
3669EXPORT_SYMBOL(amd_iommu_deactivate_guest_mode);
3670
3671static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
3672{
3673	int ret;
3674	struct amd_iommu_pi_data *pi_data = vcpu_info;
3675	struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data;
3676	struct amd_ir_data *ir_data = data->chip_data;
3677	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3678	struct iommu_dev_data *dev_data;
3679
3680	if (ir_data->iommu == NULL)
3681		return -EINVAL;
3682
3683	dev_data = search_dev_data(ir_data->iommu, irte_info->devid);
3684
3685	/*
3686	 * Note: This device has never been set up for guest mode,
3687	 * so we should not modify the IRTE.
3688	 */
3689	if (!dev_data || !dev_data->use_vapic)
3690		return 0;
3691
3692	ir_data->cfg = irqd_cfg(data);
3693	pi_data->ir_data = ir_data;
3694
3695	/*
3696	 * Note: SVM tries to set up VAPIC mode, but the IOMMU is
3697	 * running in legacy mode, so force legacy mode instead.
3698	 */
3699	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
3700		pr_debug("%s: Fall back to using intr legacy remap\n",
3701			 __func__);
3702		pi_data->is_guest_mode = false;
3703	}
3704
3705	pi_data->prev_ga_tag = ir_data->cached_ga_tag;
3706	if (pi_data->is_guest_mode) {
3707		ir_data->ga_root_ptr = (pi_data->base >> 12);
3708		ir_data->ga_vector = vcpu_pi_info->vector;
3709		ir_data->ga_tag = pi_data->ga_tag;
3710		ret = amd_iommu_activate_guest_mode(ir_data);
3711		if (!ret)
3712			ir_data->cached_ga_tag = pi_data->ga_tag;
3713	} else {
3714		ret = amd_iommu_deactivate_guest_mode(ir_data);
3715
3716		/*
3717		 * This communicates the ga_tag back to the caller
3718		 * so that it can do all the necessary cleanup.
3719		 */
3720		if (!ret)
3721			ir_data->cached_ga_tag = 0;
3722	}
3723
3724	return ret;
3725}
3726
3728static void amd_ir_update_irte(struct irq_data *irqd, struct amd_iommu *iommu,
3729			       struct amd_ir_data *ir_data,
3730			       struct irq_2_irte *irte_info,
3731			       struct irq_cfg *cfg)
3732{
3734	/*
3735	 * Atomically update the IRTE with the new destination and vector,
3736	 * then flush the interrupt entry cache.
3737	 */
3738	iommu->irte_ops->set_affinity(iommu, ir_data->entry, irte_info->devid,
3739				      irte_info->index, cfg->vector,
3740				      cfg->dest_apicid);
3741}
3742
3743static int amd_ir_set_affinity(struct irq_data *data,
3744			       const struct cpumask *mask, bool force)
3745{
3746	struct amd_ir_data *ir_data = data->chip_data;
3747	struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
3748	struct irq_cfg *cfg = irqd_cfg(data);
3749	struct irq_data *parent = data->parent_data;
3750	struct amd_iommu *iommu = ir_data->iommu;
3751	int ret;
3752
3753	if (!iommu)
3754		return -ENODEV;
3755
3756	ret = parent->chip->irq_set_affinity(parent, mask, force);
3757	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
3758		return ret;
3759
3760	amd_ir_update_irte(data, iommu, ir_data, irte_info, cfg);
3761	/*
3762	 * After this point, all the interrupts will start arriving
3763	 * at the new destination, so it is time to clean up the
3764	 * previous vector allocation.
3765	 */
3766	vector_schedule_cleanup(cfg);
3767
3768	return IRQ_SET_MASK_OK_DONE;
3769}
3770
3771static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg)
3772{
3773	struct amd_ir_data *ir_data = irq_data->chip_data;
3774
3775	*msg = ir_data->msi_entry;
3776}
3777
3778static struct irq_chip amd_ir_chip = {
3779	.name			= "AMD-IR",
3780	.irq_ack		= apic_ack_irq,
3781	.irq_set_affinity	= amd_ir_set_affinity,
3782	.irq_set_vcpu_affinity	= amd_ir_set_vcpu_affinity,
3783	.irq_compose_msi_msg	= ir_compose_msi_msg,
3784};
3785
3786static const struct msi_parent_ops amdvi_msi_parent_ops = {
3787	.supported_flags	= X86_VECTOR_MSI_FLAGS_SUPPORTED | MSI_FLAG_MULTI_PCI_MSI,
3788	.prefix			= "IR-",
3789	.init_dev_msi_info	= msi_parent_init_dev_msi_info,
3790};
3791
3792int amd_iommu_create_irq_domain(struct amd_iommu *iommu)
3793{
3794	struct fwnode_handle *fn;
3795
3796	fn = irq_domain_alloc_named_id_fwnode("AMD-IR", iommu->index);
3797	if (!fn)
3798		return -ENOMEM;
3799	iommu->ir_domain = irq_domain_create_hierarchy(arch_get_ir_parent_domain(), 0, 0,
3800						       fn, &amd_ir_domain_ops, iommu);
3801	if (!iommu->ir_domain) {
3802		irq_domain_free_fwnode(fn);
3803		return -ENOMEM;
3804	}
3805
3806	irq_domain_update_bus_token(iommu->ir_domain, DOMAIN_BUS_AMDVI);
3807	iommu->ir_domain->flags |= IRQ_DOMAIN_FLAG_MSI_PARENT |
3808				   IRQ_DOMAIN_FLAG_ISOLATED_MSI;
3809	iommu->ir_domain->msi_parent_ops = &amdvi_msi_parent_ops;
3810
3811	return 0;
3812}
3813
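/*
 * Update the destination CPU and the is_run hint of a guest-mode IRTE.
 * Uses __modify_irte_ga(), so no IRT cache flush is issued here.
 */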
3814int amd_iommu_update_ga(int cpu, bool is_run, void *data)
3815{
3816	struct amd_ir_data *ir_data = (struct amd_ir_data *)data;
3817	struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
3818
3819	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
3820	    !entry || !entry->lo.fields_vapic.guest_mode)
3821		return 0;
3822
3823	if (!ir_data->iommu)
3824		return -ENODEV;
3825
3826	if (cpu >= 0) {
3827		entry->lo.fields_vapic.destination =
3828					APICID_TO_IRTE_DEST_LO(cpu);
3829		entry->hi.fields.destination =
3830					APICID_TO_IRTE_DEST_HI(cpu);
3831	}
3832	entry->lo.fields_vapic.is_run = is_run;
3833
3834	return __modify_irte_ga(ir_data->iommu, ir_data->irq_2_irte.devid,
3835				ir_data->irq_2_irte.index, entry);
3836}
3837EXPORT_SYMBOL(amd_iommu_update_ga);
3838#endif
3839