/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PGTABLE_3LEVEL_H
#define _ASM_X86_PGTABLE_3LEVEL_H

/*
 * Intel Physical Address Extension (PAE) Mode - three-level page
 * tables on PPro+ CPUs.
 *
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 */

#define pte_ERROR(e) \
	pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \
	       __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
#define pmd_ERROR(e) \
	pr_err("%s:%d: bad pmd %p(%016Lx)\n", \
	       __FILE__, __LINE__, &(e), pmd_val(e))
#define pgd_ERROR(e) \
	pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
	       __FILE__, __LINE__, &(e), pgd_val(e))

#define pxx_xchg64(_pxx, _ptr, _val) ({					\
	_pxx##val_t *_p = (_pxx##val_t *)_ptr;				\
	_pxx##val_t _o = *_p;						\
	do { } while (!try_cmpxchg64(_p, &_o, (_val)));			\
	native_make_##_pxx(_o);						\
})

/*
 * Rules for using set_pte: the pte being assigned *must* be
 * either not present or in a state where the hardware will
 * not attempt to update the pte. In places where this is
 * not possible, use pte_get_and_clear to obtain the old pte
 * value and then use set_pte to update it. -ben
 */
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
	WRITE_ONCE(ptep->pte_high, pte.pte_high);
	smp_wmb();
	WRITE_ONCE(ptep->pte_low, pte.pte_low);
}

static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	pxx_xchg64(pte, ptep, native_pte_val(pte));
}

static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
	pxx_xchg64(pmd, pmdp, native_pmd_val(pmd));
}

static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
	pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
#endif
	pxx_xchg64(pud, pudp, native_pud_val(pud));
}
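/*
 * Illustrative sketch (commentary, not an additional kernel interface):
 * for the pte case, the pxx_xchg64() helper used above expands to
 * roughly the following. try_cmpxchg64() reloads "old" with the current
 * memory contents on failure, so the loop retries until the full 64-bit
 * entry has been replaced in a single atomic operation:
 *
 *	pteval_t *p = (pteval_t *)ptep;
 *	pteval_t old = *p;
 *
 *	do { } while (!try_cmpxchg64(p, &old, new));
 *	return native_make_pte(old);
 *
 * That atomicity is why native_set_pte_atomic() is safe even while the
 * hardware walker may be setting accessed/dirty bits, whereas the plain
 * native_set_pte() writes the two 32-bit halves separately (high word
 * first, so the entry only becomes present once the frame bits are in
 * place) and therefore requires the not-present precondition documented
 * above.
 */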
/*
 * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
 * entry, so clear the bottom half first and enforce ordering with a compiler
 * barrier.
 */
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
				    pte_t *ptep)
{
	WRITE_ONCE(ptep->pte_low, 0);
	smp_wmb();
	WRITE_ONCE(ptep->pte_high, 0);
}

static inline void native_pmd_clear(pmd_t *pmdp)
{
	WRITE_ONCE(pmdp->pmd_low, 0);
	smp_wmb();
	WRITE_ONCE(pmdp->pmd_high, 0);
}

static inline void native_pud_clear(pud_t *pudp)
{
}

static inline void pud_clear(pud_t *pudp)
{
	set_pud(pudp, __pud(0));

	/*
	 * According to the Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 *
	 * Currently, all callers of pud_clear() are either followed by a
	 * flush_tlb_mm() or don't need a TLB flush at all (x86_64 code or
	 * pud_clear_bad()), so no TLB flush is needed here.
	 */
}

#ifdef CONFIG_SMP
static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
{
	return pxx_xchg64(pte, ptep, 0ULL);
}

static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
{
	return pxx_xchg64(pmd, pmdp, 0ULL);
}

static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
{
	return pxx_xchg64(pud, pudp, 0ULL);
}
#else
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif

#ifndef pmdp_establish
#define pmdp_establish pmdp_establish
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
	pmd_t old;

	/*
	 * If the incoming pmd has the present bit cleared, we can get away
	 * without the expensive cmpxchg64: we can update pmdp half by half
	 * without racing with anybody else.
	 */
	if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
		/* xchg acts as a barrier before the setting of the high bits */
		old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low);
		old.pmd_high = READ_ONCE(pmdp->pmd_high);
		WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high);

		return old;
	}

	return pxx_xchg64(pmd, pmdp, pmd.pmd);
}
#endif
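/*
 * Worked example (commentary only, made-up values): take a present PTE
 * with pte_high == 0x00000001 and pte_low == 0x2345e025 (P is bit 0 of
 * the low word). Clearing the high half first would briefly expose
 * {pte_high = 0, pte_low = 0x2345e025} to the hardware walker: still
 * marked present, but pointing at the wrong page frame. Clearing
 * pte_low first instead exposes {pte_high = 0x00000001, pte_low = 0},
 * which is not present, so the stale high half is ignored until it is
 * cleared too. This is the low-half-first rule followed by
 * native_pte_clear() and native_pmd_clear() above, and the same logic
 * makes the fast path of pmdp_establish() safe: the xchg() on pmd_low
 * installs a not-present low half first, after which the high half can
 * be rewritten without a 64-bit cmpxchg.
 */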
/*
 * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
 * are !pte_none() && !pte_present().
 *
 * Format of swap PTEs:
 *
 *   6 6 6 6 5 5 5 5 5 5 5 5 5 5 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3
 *   3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2
 *   < type -> <---------------------- offset ----------------------
 *
 *   3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
 *   1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
 *   --------------------------------------------> 0 E 0 0 0 0 0 0 0
 *
 *   E is the exclusive marker that is not stored in swap entries.
 */
#define SWP_TYPE_BITS		5
#define _SWP_TYPE_MASK ((1U << SWP_TYPE_BITS) - 1)

#define SWP_OFFSET_FIRST_BIT	(_PAGE_BIT_PROTNONE + 1)

/* We always extract/encode the offset by shifting it all the way up, and then down again */
#define SWP_OFFSET_SHIFT	(SWP_OFFSET_FIRST_BIT + SWP_TYPE_BITS)

#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
#define __swp_type(x)			(((x).val) & _SWP_TYPE_MASK)
#define __swp_offset(x)			((x).val >> SWP_TYPE_BITS)
#define __swp_entry(type, offset)	((swp_entry_t){((type) & _SWP_TYPE_MASK) \
					| (offset) << SWP_TYPE_BITS})

/*
 * Normally, __swp_entry() converts from arch-independent swp_entry_t to
 * arch-dependent swp_entry_t, and __swp_entry_to_pte() just stores the result
 * to pte. But here we have 32bit swp_entry_t and 64bit pte, and need to use the
 * whole 64 bits. Thus, we shift the "real" arch-dependent conversion to
 * __swp_entry_to_pte() through the following helper macro based on 64bit
 * __swp_entry().
 */
#define __swp_pteval_entry(type, offset) ((pteval_t) { \
	(~(pteval_t)(offset) << SWP_OFFSET_SHIFT >> SWP_TYPE_BITS) \
	| ((pteval_t)(type) << (64 - SWP_TYPE_BITS)) })

#define __swp_entry_to_pte(x)	((pte_t){ .pte = \
		__swp_pteval_entry(__swp_type(x), __swp_offset(x)) })
/*
 * Analogously, __pte_to_swp_entry() doesn't just extract the arch-dependent
 * swp_entry_t, but also has to convert it from 64bit to the 32bit
 * intermediate representation, using the following macros based on 64bit
 * __swp_type() and __swp_offset().
 */
#define __pteval_swp_type(x) ((unsigned long)((x).pte >> (64 - SWP_TYPE_BITS)))
#define __pteval_swp_offset(x) ((unsigned long)(~((x).pte) << SWP_TYPE_BITS >> SWP_OFFSET_SHIFT))

#define __pte_to_swp_entry(pte)	(__swp_entry(__pteval_swp_type(pte), \
					     __pteval_swp_offset(pte)))

/* We borrow bit 7 to store the exclusive marker in swap PTEs. */
#define _PAGE_SWP_EXCLUSIVE	_PAGE_PSE
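/*
 * Worked example (commentary only, made-up numbers): __swp_pteval_entry()
 * places the type in bits 63..59 and the offset, bit-complemented by the
 * ~, in bits 58..9. Encoding type 2 with offset 0x1234 stores 0b00010 in
 * the type field and the complement of 0x1234 in the offset field;
 * __pteval_swp_offset() complements those bits again and recovers
 * 0x1234. Bits 8..0 stay clear, so the result is !pte_present() (P is
 * bit 0) and bit 7 remains free for the _PAGE_SWP_EXCLUSIVE marker.
 * Storing the offset inverted keeps the high physical-address bits of a
 * swap PTE set, which is part of the L1TF mitigation implemented by the
 * header included below.
 */

#include <asm/pgtable-invert.h>

#endif /* _ASM_X86_PGTABLE_3LEVEL_H */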