// SPDX-License-Identifier: GPL-2.0-only
/*
 * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
 *
 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *          Suresh B Siddha <suresh.b.siddha@intel.com>
 *
 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
 *
 * Basic principles:
 *
 * PAT is a CPU feature supported by all modern x86 CPUs that allows the firmware and
 * the kernel to set one of a handful of 'caching type' attributes for physical
 * memory ranges: uncached, write-combining, write-through, write-protected,
 * and the most commonly used and default attribute: write-back caching.
 *
 * PAT support supersedes and augments MTRR support in a compatible fashion: MTRR is
 * a hardware interface to enumerate a limited number of physical memory ranges
 * and set their caching attributes explicitly, programmed into the CPU via MSRs.
 * Even modern CPUs have MTRRs enabled - but these are typically not touched
 * by the kernel or by user-space (such as the X server); we rely on PAT for any
 * additional cache attribute logic.
 *
 * PAT doesn't work via explicit memory ranges, but uses page table entries to add
 * cache attribute information to the mapped memory range: three bits are used
 * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
 * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
 *
 * ( There's a metric ton of finer details, such as compatibility with CPU quirks
 *   that only support 4 types of PAT entries, and interaction with MTRRs, see
 *   below for details. )
 */

#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/debugfs.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/fs.h>
#include <linux/rbtree.h>

#include <asm/cacheflush.h>
#include <asm/cacheinfo.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/fcntl.h>
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/memtype.h>
#include <asm/io.h>

#include "memtype.h"
#include "../mm_internal.h"

#undef pr_fmt
#define pr_fmt(fmt) "" fmt

static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
static u64 __ro_after_init pat_msr_val;

/*
 * PAT support is enabled by default, but can be disabled for
 * various user-requested or hardware-forced reasons:
 */
static void __init pat_disable(const char *msg_reason)
{
	if (pat_disabled)
		return;

	pat_disabled = true;
	pr_info("x86/PAT: %s\n", msg_reason);

	memory_caching_control &= ~CACHE_PAT;
}

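/* Handle the "nopat" boot option: disable PAT before the MSR gets programmed. */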
static int __init nopat(char *str)
{
	pat_disable("PAT support disabled via boot option.");
	return 0;
}
early_param("nopat", nopat);

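/*
 * Query whether PAT is in use, i.e. it was not disabled on the boot
 * command line or due to missing CPU/firmware support.
 */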
bool pat_enabled(void)
{
	return !pat_disabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);

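/*
 * The "debugpat" boot option enables the verbose dprintk() output of
 * the memtype tracking code below:
 */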
int pat_debug_enable;

static int __init pat_debug_setup(char *str)
{
	pat_debug_enable = 1;
	return 1;
}
__setup("debugpat", pat_debug_setup);

#ifdef CONFIG_X86_PAT
/*
 * X86 PAT uses page flags arch_1 and uncached together to keep track of
 * the memory type of pages that have a backing struct page.
 *
 * X86 PAT supports 4 different memory types:
 *  - _PAGE_CACHE_MODE_WB
 *  - _PAGE_CACHE_MODE_WC
 *  - _PAGE_CACHE_MODE_UC_MINUS
 *  - _PAGE_CACHE_MODE_WT
 *
 * _PAGE_CACHE_MODE_WB is the default type.
 */

#define _PGMT_WB		0
#define _PGMT_WC		(1UL << PG_arch_1)
#define _PGMT_UC_MINUS		(1UL << PG_uncached)
#define _PGMT_WT		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)

static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	unsigned long pg_flags = pg->flags & _PGMT_MASK;

	if (pg_flags == _PGMT_WB)
		return _PAGE_CACHE_MODE_WB;
	else if (pg_flags == _PGMT_WC)
		return _PAGE_CACHE_MODE_WC;
	else if (pg_flags == _PGMT_UC_MINUS)
		return _PAGE_CACHE_MODE_UC_MINUS;
	else
		return _PAGE_CACHE_MODE_WT;
}

static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
	unsigned long memtype_flags;
	unsigned long old_flags;
	unsigned long new_flags;

	switch (memtype) {
	case _PAGE_CACHE_MODE_WC:
		memtype_flags = _PGMT_WC;
		break;
	case _PAGE_CACHE_MODE_UC_MINUS:
		memtype_flags = _PGMT_UC_MINUS;
		break;
	case _PAGE_CACHE_MODE_WT:
		memtype_flags = _PGMT_WT;
		break;
	case _PAGE_CACHE_MODE_WB:
	default:
		memtype_flags = _PGMT_WB;
		break;
	}

	old_flags = READ_ONCE(pg->flags);
	do {
		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
	} while (!try_cmpxchg(&pg->flags, &old_flags, new_flags));
}
#else
static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
	return -1;
}
static inline void set_page_memtype(struct page *pg,
				    enum page_cache_mode memtype)
{
}
#endif

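/*
 * Hardware encodings of the memory types, as programmed into the 8-bit
 * fields of MSR_IA32_CR_PAT (see the PAT() macro in pat_bp_init() below):
 */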
enum {
	PAT_UC = 0,		/* uncached */
	PAT_WC = 1,		/* Write combining */
	PAT_WT = 4,		/* Write Through */
	PAT_WP = 5,		/* Write Protected */
	PAT_WB = 6,		/* Write Back (default) */
	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
};

#define CM(c) (_PAGE_CACHE_MODE_ ## c)

static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
						      char *msg)
{
	enum page_cache_mode cache;
	char *cache_mode;

	switch (pat_val) {
	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
	default:           cache = CM(WB);       cache_mode = "WB  "; break;
	}

	memcpy(msg, cache_mode, 4);

	return cache;
}

#undef CM

/*
 * Update the cache mode to pgprot translation tables according to PAT
 * configuration.
 * Using lower indices is preferred, so we start with highest index.
 */
static void __init init_cache_modes(u64 pat)
{
	enum page_cache_mode cache;
	char pat_msg[33];
	int i;

	pat_msg[32] = 0;
	for (i = 7; i >= 0; i--) {
		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
					   pat_msg + 4 * i);
		update_cache_mode_entry(i, cache);
	}
	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
}

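/*
 * Program MSR_IA32_CR_PAT on the current CPU with the value computed in
 * pat_bp_init(). Called (via cache_cpu_init()) on all CPUs.
 */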
void pat_cpu_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_PAT)) {
		/*
		 * If this happens we are on a secondary CPU, but switched to
		 * PAT on the boot CPU. We have no way to undo PAT.
		 */
		panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
	}

	wrmsrl(MSR_IA32_CR_PAT, pat_msr_val);

	__flush_tlb_all();
}

/**
 * pat_bp_init - Initialize the PAT MSR value and PAT table
 *
 * This function initializes PAT MSR value and PAT table with an OS-defined
 * value to enable additional cache attributes, WC, WT and WP.
 *
 * This function prepares the calls of pat_cpu_init() via cache_cpu_init()
 * on all CPUs.
 */
void __init pat_bp_init(void)
{
	struct cpuinfo_x86 *c = &boot_cpu_data;
#define PAT(p0, p1, p2, p3, p4, p5, p6, p7)			\
	(((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) |		\
	((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) |	\
	((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) |	\
	((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56))

	if (!IS_ENABLED(CONFIG_X86_PAT))
		pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");

	if (!cpu_feature_enabled(X86_FEATURE_PAT))
		pat_disable("PAT not supported by the CPU.");
	else
		rdmsrl(MSR_IA32_CR_PAT, pat_msr_val);

	if (!pat_msr_val) {
		pat_disable("PAT support disabled by the firmware.");

		/*
		 * No PAT. Emulate the PAT table that corresponds to the two
		 * cache bits, PWT (Write Through) and PCD (Cache Disable).
		 * This setup is also the same as the BIOS default setup.
		 *
		 * PTE encoding:
		 *
		 *       PCD
		 *       |PWT  PAT
		 *       ||    slot
		 *       00    0    WB : _PAGE_CACHE_MODE_WB
		 *       01    1    WT : _PAGE_CACHE_MODE_WT
		 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *       11    3    UC : _PAGE_CACHE_MODE_UC
		 *
		 * NOTE: When WC or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
	}

	/*
	 * Xen PV doesn't allow to set PAT MSR, but all cache modes are
	 * supported.
	 */
	if (pat_disabled || cpu_feature_enabled(X86_FEATURE_XENPV)) {
		init_cache_modes(pat_msr_val);
		return;
	}

	if ((c->x86_vendor == X86_VENDOR_INTEL) &&
	    (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
	     ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
		/*
		 * PAT support with the lower four entries. Intel Pentium 2,
		 * 3, M, and 4 are affected by PAT errata, which makes the
		 * upper four entries unusable. To be on the safe side, we don't
		 * use those.
		 *
		 *  PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 * PAT bit unused
		 *
		 * NOTE: When WT or WP is used, it is redirected to UC- per
		 * the default setup in __cachemode2pte_tbl[].
		 */
		pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
	} else {
		/*
		 * Full PAT support.  We put WT in slot 7 to improve
		 * robustness in the presence of errata that might cause
		 * the high PAT bit to be ignored.  This way, a buggy slot 7
		 * access will hit slot 3, and slot 3 is UC, so at worst
		 * we lose performance without causing a correctness issue.
		 * Pentium 4 erratum N46 is an example for such an erratum,
		 * although we try not to use PAT at all on affected CPUs.
		 *
		 *  PTE encoding:
		 *      PAT
		 *      |PCD
		 *      ||PWT  PAT
		 *      |||    slot
		 *      000    0    WB : _PAGE_CACHE_MODE_WB
		 *      001    1    WC : _PAGE_CACHE_MODE_WC
		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
		 *      011    3    UC : _PAGE_CACHE_MODE_UC
		 *      100    4    WB : Reserved
		 *      101    5    WP : _PAGE_CACHE_MODE_WP
		 *      110    6    UC-: Reserved
		 *      111    7    WT : _PAGE_CACHE_MODE_WT
		 *
		 * The reserved slots are unused, but mapped to their
		 * corresponding types in the presence of PAT errata.
		 */
		pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
	}

	memory_caching_control |= CACHE_PAT;

	init_cache_modes(pat_msr_val);
#undef PAT
}

static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */

/*
 * Compute the intersection of the PAT memory type and the MTRR memory type
 * and return the resulting memory type as PAT understands it.
 * (The numeric encodings used by PAT and MTRR differ.)
 * The intersection is based on the "Effective Memory Type" tables in the
 * IA-32 SDM, vol 3a.
 */
static unsigned long pat_x_mtrr_type(u64 start, u64 end,
				     enum page_cache_mode req_type)
{
	/*
	 * Look for MTRR hint to get the effective type in case where PAT
	 * request is for WB.
	 */
	if (req_type == _PAGE_CACHE_MODE_WB) {
		u8 mtrr_type, uniform;

		mtrr_type = mtrr_type_lookup(start, end, &uniform);
		if (mtrr_type != MTRR_TYPE_WRBACK)
			return _PAGE_CACHE_MODE_UC_MINUS;

		return _PAGE_CACHE_MODE_WB;
	}

	return req_type;
}

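/*
 * State shared with pagerange_is_ram_callback(): tracks whether the walked
 * range contains RAM pages, non-RAM holes, or both.
 */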
struct pagerange_state {
	unsigned long		cur_pfn;
	int			ram;
	int			not_ram;
};

static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
	struct pagerange_state *state = arg;

	state->not_ram	|= initial_pfn > state->cur_pfn;
	state->ram	|= total_nr_pages > 0;
	state->cur_pfn	 = initial_pfn + total_nr_pages;

	return state->ram && state->not_ram;
}

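/*
 * Returns 1 if the (tracked part of the) range is RAM, 0 if it contains no
 * RAM, and -1 if it mixes RAM and non-RAM pages.
 */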
static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
	int ret = 0;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct pagerange_state state = {start_pfn, 0, 0};

	/*
	 * For legacy reasons, physical address ranges in the legacy ISA
	 * region are tracked as non-RAM. This allows users of /dev/mem
	 * to map portions of the legacy ISA region, even when some of
	 * those portions are listed (or not even listed) with different
	 * e820 types (RAM/reserved/..).
	 */
	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;

	if (start_pfn < end_pfn) {
		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
				&state, pagerange_is_ram_callback);
	}

	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}


/*
 * For RAM pages, we use page flags to mark the pages with appropriate type.
 * The page flags are limited to four types, WB (default), WC, WT and UC-.
 * WP request fails with -EINVAL, and UC gets redirected to UC-.  Setting
 * a new memory type is only allowed for a page mapped with the default WB
 * type.
 *
 * Here we do two passes:
 * - Find the memtype of all the pages in the range, look for any conflicts.
 * - In case of no conflicts, set the new memtype for pages in the range.
 */
static int reserve_ram_pages_type(u64 start, u64 end,
				  enum page_cache_mode req_type,
				  enum page_cache_mode *new_type)
{
	struct page *page;
	u64 pfn;

	if (req_type == _PAGE_CACHE_MODE_WP) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_UC_MINUS;
		return -EINVAL;
	}

	if (req_type == _PAGE_CACHE_MODE_UC) {
		/* We do not support strong UC */
		WARN_ON_ONCE(1);
		req_type = _PAGE_CACHE_MODE_UC_MINUS;
	}

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		enum page_cache_mode type;

		page = pfn_to_page(pfn);
		type = get_page_memtype(page);
		if (type != _PAGE_CACHE_MODE_WB) {
			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
				start, end - 1, type, req_type);
			if (new_type)
				*new_type = type;

			return -EBUSY;
		}
	}

	if (new_type)
		*new_type = req_type;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, req_type);
	}
	return 0;
}

static int free_ram_pages_type(u64 start, u64 end)
{
	struct page *page;
	u64 pfn;

	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
		page = pfn_to_page(pfn);
		set_page_memtype(page, _PAGE_CACHE_MODE_WB);
	}
	return 0;
}

static u64 sanitize_phys(u64 address)
{
	/*
	 * When changing the memtype for pages containing poison allow
	 * for a "decoy" virtual address (bit 63 clear) passed to
	 * set_memory_X(). __pa() on a "decoy" address results in a
	 * physical address with bit 63 set.
	 *
	 * Decoy addresses are not present for 32-bit builds, see
	 * set_mce_nospec().
	 */
	if (IS_ENABLED(CONFIG_X86_64))
		return address & __PHYSICAL_MASK;
	return address;
}

/*
 * req_type typically has one of the following values:
 * - _PAGE_CACHE_MODE_WB
 * - _PAGE_CACHE_MODE_WC
 * - _PAGE_CACHE_MODE_UC_MINUS
 * - _PAGE_CACHE_MODE_UC
 * - _PAGE_CACHE_MODE_WT
 *
 * If new_type is NULL, the function returns an error if it cannot reserve the
 * region with req_type. If new_type is non-NULL, the function returns the
 * available type in new_type in case of no error. In case of any error
 * it returns a negative value.
 */
int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
		    enum page_cache_mode *new_type)
{
	struct memtype *entry_new;
	enum page_cache_mode actual_type;
	int is_range_ram;
	int err = 0;

	start = sanitize_phys(start);

	/*
	 * The end address passed into this function is exclusive, but
	 * sanitize_phys() expects an inclusive address.
	 */
	end = sanitize_phys(end - 1) + 1;
	if (start >= end) {
		WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
				start, end - 1, cattr_name(req_type));
		return -EINVAL;
	}

	if (!pat_enabled()) {
		/* This is identical to page table setting without PAT */
		if (new_type)
			*new_type = req_type;
		return 0;
	}

	/* Low ISA region is always mapped WB in page table. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end)) {
		if (new_type)
			*new_type = _PAGE_CACHE_MODE_WB;
		return 0;
	}

	/*
	 * Call mtrr_lookup to get the type hint. This is an
	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
	 * tools and ACPI tools). Use WB request for WB memory and use
	 * UC_MINUS otherwise.
	 */
	actual_type = pat_x_mtrr_type(start, end, req_type);

	if (new_type)
		*new_type = actual_type;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1) {

		err = reserve_ram_pages_type(start, end, req_type, new_type);

		return err;
	} else if (is_range_ram < 0) {
		return -EINVAL;
	}

	entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!entry_new)
		return -ENOMEM;

	entry_new->start = start;
	entry_new->end	 = end;
	entry_new->type	 = actual_type;

	spin_lock(&memtype_lock);

	err = memtype_check_insert(entry_new, new_type);
	if (err) {
		pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
			start, end - 1,
			cattr_name(entry_new->type), cattr_name(req_type));
		kfree(entry_new);
		spin_unlock(&memtype_lock);

		return err;
	}

	spin_unlock(&memtype_lock);

	dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
		start, end - 1, cattr_name(entry_new->type), cattr_name(req_type),
		new_type ? cattr_name(*new_type) : "-");

	return err;
}

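/*
 * Release a reservation set up by memtype_reserve(): for RAM ranges the
 * per-page tracking is reset to WB, for other ranges the entry is removed
 * from the memtype tree.
 */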
int memtype_free(u64 start, u64 end)
{
	int is_range_ram;
	struct memtype *entry_old;

	if (!pat_enabled())
		return 0;

	start = sanitize_phys(start);
	end = sanitize_phys(end);

	/* Low ISA region is always mapped WB. No need to track */
	if (x86_platform.is_untracked_pat_range(start, end))
		return 0;

	is_range_ram = pat_pagerange_is_ram(start, end);
	if (is_range_ram == 1)
		return free_ram_pages_type(start, end);
	if (is_range_ram < 0)
		return -EINVAL;

	spin_lock(&memtype_lock);
	entry_old = memtype_erase(start, end);
	spin_unlock(&memtype_lock);

	if (IS_ERR(entry_old)) {
		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid, start, end - 1);
		return -EINVAL;
	}

	kfree(entry_old);

	dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1);

	return 0;
}


/**
 * lookup_memtype - Looks up the memory type for a physical address
 * @paddr: physical address of which memory type needs to be looked up
 *
 * Only to be called when PAT is enabled
 *
 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 * or _PAGE_CACHE_MODE_WT.
 */
static enum page_cache_mode lookup_memtype(u64 paddr)
{
	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
	struct memtype *entry;

	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
		return rettype;

	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
		struct page *page;

		page = pfn_to_page(paddr >> PAGE_SHIFT);
		return get_page_memtype(page);
	}

	spin_lock(&memtype_lock);

	entry = memtype_lookup(paddr);
	if (entry != NULL)
		rettype = entry->type;
	else
		rettype = _PAGE_CACHE_MODE_UC_MINUS;

	spin_unlock(&memtype_lock);

	return rettype;
}

/**
 * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type
 * of @pfn cannot be overridden by UC MTRR memory type.
 *
 * Only to be called when PAT is enabled.
 *
 * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC.
 * Returns false in other cases.
 */
bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
{
	enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn));

	return cm == _PAGE_CACHE_MODE_UC ||
	       cm == _PAGE_CACHE_MODE_UC_MINUS ||
	       cm == _PAGE_CACHE_MODE_WC;
}
EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);

/**
 * memtype_reserve_io - Request a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 * @type: A pointer to memtype, with requested type. On success, requested
 * or any other compatible type that was available for the region is returned
 *
 * On success, returns 0
 * On failure, returns non-zero
 */
int memtype_reserve_io(resource_size_t start, resource_size_t end,
			enum page_cache_mode *type)
{
	resource_size_t size = end - start;
	enum page_cache_mode req_type = *type;
	enum page_cache_mode new_type;
	int ret;

	WARN_ON_ONCE(iomem_map_sanity_check(start, size));

	ret = memtype_reserve(start, end, req_type, &new_type);
	if (ret)
		goto out_err;

	if (!is_new_memtype_allowed(start, size, req_type, new_type))
		goto out_free;

	if (memtype_kernel_map_sync(start, size, new_type) < 0)
		goto out_free;

	*type = new_type;
	return 0;

out_free:
	memtype_free(start, end);
	ret = -EBUSY;
out_err:
	return ret;
}

/**
 * memtype_free_io - Release a memory type mapping for a region of memory
 * @start: start (physical address) of the region
 * @end: end (physical address) of the region
 */
void memtype_free_io(resource_size_t start, resource_size_t end)
{
	memtype_free(start, end);
}

#ifdef CONFIG_X86_PAT
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
	enum page_cache_mode type = _PAGE_CACHE_MODE_WC;

	return memtype_reserve_io(start, start + size, &type);
}
EXPORT_SYMBOL(arch_io_reserve_memtype_wc);

void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
{
	memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);
#endif

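/*
 * Used for /dev/mem style mappings: strip the encryption bit from the
 * protection if the physical range must not be mapped encrypted.
 */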
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t vma_prot)
{
	if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size))
		vma_prot = pgprot_decrypted(vma_prot);

	return vma_prot;
}

#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
	u64 from = ((u64)pfn) << PAGE_SHIFT;
	u64 to = from + size;
	u64 cursor = from;

	if (!pat_enabled())
		return 1;

	while (cursor < to) {
		if (!devmem_is_allowed(pfn))
			return 0;
		cursor += PAGE_SIZE;
		pfn++;
	}
	return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */

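/*
 * Access policy for physical memory mappings: reject disallowed ranges and
 * force an uncached (UC-) mapping when the file was opened with O_DSYNC.
 */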
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
				unsigned long size, pgprot_t *vma_prot)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;

	if (!range_is_allowed(pfn, size))
		return 0;

	if (file->f_flags & O_DSYNC)
		pcm = _PAGE_CACHE_MODE_UC_MINUS;

	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
			     cachemode2protval(pcm));
	return 1;
}

/*
 * Change the memory type for the physical address range in kernel identity
 * mapping space if that range is a part of identity map.
 */
int memtype_kernel_map_sync(u64 base, unsigned long size,
			    enum page_cache_mode pcm)
{
	unsigned long id_sz;

	if (base > __pa(high_memory-1))
		return 0;

	/*
	 * Some areas in the middle of the kernel identity range
	 * are not mapped, for example the PCI space.
	 */
	if (!page_is_ram(base >> PAGE_SHIFT))
		return 0;

	id_sz = (__pa(high_memory-1) <= base + size) ?
				__pa(high_memory) - base : size;

	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
			current->comm, current->pid,
			cattr_name(pcm),
			base, (unsigned long long)(base + size-1));
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to reserve a range of physical memory with prot.
 * Reserves non-RAM regions only. After a successful memtype_reserve(), this
 * function also keeps the identity mapping (if any) in sync with the new prot.
 */
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
				int strict_prot)
{
	int is_ram = 0;
	int ret;
	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
	enum page_cache_mode pcm = want_pcm;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);

	/*
	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
	 * track of number of mappings of RAM pages. We can assert that
	 * the type requested matches the type of first page in the range.
	 */
	if (is_ram) {
		if (!pat_enabled())
			return 0;

		pcm = lookup_memtype(paddr);
		if (want_pcm != pcm) {
			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
				current->comm, current->pid,
				cattr_name(want_pcm),
				(unsigned long long)paddr,
				(unsigned long long)(paddr + size - 1),
				cattr_name(pcm));
			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
					     (~_PAGE_CACHE_MASK)) |
					     cachemode2protval(pcm));
		}
		return 0;
	}

	ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm);
	if (ret)
		return ret;

	if (pcm != want_pcm) {
		if (strict_prot ||
		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
			memtype_free(paddr, paddr + size);
			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
			       current->comm, current->pid,
			       cattr_name(want_pcm),
			       (unsigned long long)paddr,
			       (unsigned long long)(paddr + size - 1),
			       cattr_name(pcm));
			return -EINVAL;
		}
		/*
		 * We allow returning different type than the one requested in
		 * non strict case.
		 */
		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
				      (~_PAGE_CACHE_MASK)) |
				     cachemode2protval(pcm));
	}

	if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
		memtype_free(paddr, paddr + size);
		return -EINVAL;
	}
	return 0;
}

/*
 * Internal interface to free a range of physical memory.
 * Frees non RAM regions only.
 */
static void free_pfn_range(u64 paddr, unsigned long size)
{
	int is_ram;

	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
	if (is_ram == 0)
		memtype_free(paddr, paddr + size);
}

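/*
 * Local helper: look up the PTE at the start of the VMA and return its
 * protection bits and the physical address it maps. Fails for anything
 * that is not a plain PFN mapping (e.g. anon folios in COW mappings).
 */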
static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
		resource_size_t *phys)
{
	pte_t *ptep, pte;
	spinlock_t *ptl;

	if (follow_pte(vma, vma->vm_start, &ptep, &ptl))
		return -EINVAL;

	pte = ptep_get(ptep);

	/* Never return PFNs of anon folios in COW mappings. */
	if (vm_normal_folio(vma, vma->vm_start, pte)) {
		pte_unmap_unlock(ptep, ptl);
		return -EINVAL;
	}

	*prot = pgprot_val(pte_pgprot(pte));
	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
	pte_unmap_unlock(ptep, ptl);
	return 0;
}

static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
		pgprot_t *pgprot)
{
	unsigned long prot;

	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_PAT));

	/*
	 * We need the starting PFN and cachemode used for track_pfn_remap()
	 * that covered the whole VMA. For most mappings, we can obtain that
	 * information from the page tables. For COW mappings, we might now
	 * suddenly have anon folios mapped and follow_phys() will fail.
	 *
	 * Fallback to using vma->vm_pgoff, see remap_pfn_range_notrack(), to
	 * detect the PFN. If we need the cachemode as well, we're out of luck
	 * for now and have to fail fork().
	 */
	if (!follow_phys(vma, &prot, paddr)) {
		if (pgprot)
			*pgprot = __pgprot(prot);
		return 0;
	}
	if (is_cow_mapping(vma->vm_flags)) {
		if (pgprot)
			return -EINVAL;
		*paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
		return 0;
	}
	WARN_ON_ONCE(1);
	return -EINVAL;
}

/*
 * track_pfn_copy is called when a VMA covering the pfnmap gets
 * copied through copy_page_range().
 *
 * If the vma has a linear pfn mapping for the entire range, we get the prot
 * from the pte and reserve the entire vma range with a single
 * reserve_pfn_range() call.
 */
int track_pfn_copy(struct vm_area_struct *vma)
{
	resource_size_t paddr;
	unsigned long vma_size = vma->vm_end - vma->vm_start;
	pgprot_t pgprot;

	if (vma->vm_flags & VM_PAT) {
		if (get_pat_info(vma, &paddr, &pgprot))
			return -EINVAL;
		/* reserve the whole chunk covered by vma. */
		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
	}

	return 0;
}

/*
 * prot is passed in as a parameter for the new mapping. If the vma has
 * a linear pfn mapping for the entire range, or no vma is provided,
 * reserve the entire pfn + size range with a single reserve_pfn_range()
 * call.
 */
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
		    unsigned long pfn, unsigned long addr, unsigned long size)
{
	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
	enum page_cache_mode pcm;

	/* reserve the whole chunk starting from paddr */
	if (!vma || (addr == vma->vm_start
				&& size == (vma->vm_end - vma->vm_start))) {
		int ret;

		ret = reserve_pfn_range(paddr, size, prot, 0);
		if (ret == 0 && vma)
			vm_flags_set(vma, VM_PAT);
		return ret;
	}

	if (!pat_enabled())
		return 0;

	/*
	 * For anything smaller than the vma size we set prot based on the
	 * lookup.
	 */
	pcm = lookup_memtype(paddr);

	/* Check memtype for the remaining pages */
	while (size > PAGE_SIZE) {
		size -= PAGE_SIZE;
		paddr += PAGE_SIZE;
		if (pcm != lookup_memtype(paddr))
			return -EINVAL;
	}

	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));

	return 0;
}

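/*
 * Called when inserting a single PFN mapping: set the caching bits in *prot
 * based on the memtype tracked for that pfn.
 */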
void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
{
	enum page_cache_mode pcm;

	if (!pat_enabled())
		return;

	/* Set prot based on lookup */
	pcm = lookup_memtype(pfn_t_to_phys(pfn));
	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
			 cachemode2protval(pcm));
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
		 unsigned long size, bool mm_wr_locked)
{
	resource_size_t paddr;

	if (vma && !(vma->vm_flags & VM_PAT))
		return;

	/* free the chunk starting from pfn or the whole chunk */
	paddr = (resource_size_t)pfn << PAGE_SHIFT;
	if (!paddr && !size) {
		if (get_pat_info(vma, &paddr, NULL))
			return;
		size = vma->vm_end - vma->vm_start;
	}
	free_pfn_range(paddr, size);
	if (vma) {
		if (mm_wr_locked)
			vm_flags_clear(vma, VM_PAT);
		else
			__vm_flags_mod(vma, 0, VM_PAT);
	}
}

/*
 * untrack_pfn_clear is called in the following situations:
 *
 * 1) while mremapping a pfnmap for a new region, with the old vma after
 * its pfnmap page table has been removed.  The new vma has a new pfnmap
 * to the same pfn & cache type with VM_PAT set.
 * 2) while duplicating a vm area, when the new vma fails to copy the pgtable
 * from the old vma.
 */
void untrack_pfn_clear(struct vm_area_struct *vma)
{
	vm_flags_clear(vma, VM_PAT);
}

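/*
 * Helpers for drivers: derive write-combining / write-through variants of a
 * given pgprot by setting the corresponding PAT cache-mode bits.
 */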
pgprot_t pgprot_writecombine(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
				cachemode2protval(_PAGE_CACHE_MODE_WC));
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);

pgprot_t pgprot_writethrough(pgprot_t prot)
{
	return __pgprot(pgprot_val(prot) |
				cachemode2protval(_PAGE_CACHE_MODE_WT));
}
EXPORT_SYMBOL_GPL(pgprot_writethrough);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)

/*
 * We are allocating a temporary printout-entry to be passed
 * between seq_start()/next() and seq_show():
 */
static struct memtype *memtype_get_idx(loff_t pos)
{
	struct memtype *entry_print;
	int ret;

	entry_print  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!entry_print)
		return NULL;

	spin_lock(&memtype_lock);
	ret = memtype_copy_nth_element(entry_print, pos);
	spin_unlock(&memtype_lock);

	/* Free it on error: */
	if (ret) {
		kfree(entry_print);
		return NULL;
	}

	return entry_print;
}

static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos == 0) {
		++*pos;
		seq_puts(seq, "PAT memtype list:\n");
	}

	return memtype_get_idx(*pos);
}

static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	kfree(v);
	++*pos;
	return memtype_get_idx(*pos);
}

static void memtype_seq_stop(struct seq_file *seq, void *v)
{
	kfree(v);
}

static int memtype_seq_show(struct seq_file *seq, void *v)
{
	struct memtype *entry_print = (struct memtype *)v;

	seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n",
			entry_print->start,
			entry_print->end,
			cattr_name(entry_print->type));

	return 0;
}

static const struct seq_operations memtype_seq_ops = {
	.start = memtype_seq_start,
	.next  = memtype_seq_next,
	.stop  = memtype_seq_stop,
	.show  = memtype_seq_show,
};

static int memtype_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &memtype_seq_ops);
}

static const struct file_operations memtype_fops = {
	.open    = memtype_seq_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init pat_memtype_list_init(void)
{
	if (pat_enabled()) {
		debugfs_create_file("pat_memtype_list", S_IRUSR,
				    arch_debugfs_dir, NULL, &memtype_fops);
	}
	return 0;
}
late_initcall(pat_memtype_list_init);

#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
