1/*
2 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 *  School of Computer Science
50 *  Carnegie Mellon University
51 *  Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58
59/*
60 *	File:	pmap.c
61 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
62 *	(These guys wrote the Vax version)
63 *
64 *	Physical Map management code for Intel i386, i486, and i860.
65 *
66 *	Manages physical address maps.
67 *
68 *	In addition to hardware address maps, this
69 *	module is called upon to provide software-use-only
70 *	maps which may or may not be stored in the same
71 *	form as hardware maps.  These pseudo-maps are
72 *	used to store intermediate results from copy
73 *	operations to and from address spaces.
74 *
75 *	Since the information managed by this module is
76 *	also stored by the logical address mapping module,
77 *	this module may throw away valid virtual-to-physical
78 *	mappings at almost any time.  However, invalidations
79 *	of virtual-to-physical mappings must be done as
80 *	requested.
81 *
82 *	In order to cope with hardware architectures which
83 *	make virtual-to-physical map invalidates expensive,
84 *	this module may delay invalidate or reduced protection
85 *	operations until such time as they are actually
86 *	necessary.  This module is given full information as
87 *	to which processors are currently using which maps,
88 *	and to when physical maps must be made correct.
89 */
90
91#include <string.h>
92#include <mach_ldebug.h>
93
94#include <libkern/OSAtomic.h>
95
96#include <mach/machine/vm_types.h>
97
98#include <mach/boolean.h>
99#include <kern/thread.h>
100#include <kern/zalloc.h>
101#include <kern/queue.h>
102#include <kern/ledger.h>
103#include <kern/mach_param.h>
104
105#include <kern/lock.h>
106#include <kern/kalloc.h>
107#include <kern/spl.h>
108
109#include <vm/pmap.h>
110#include <vm/vm_map.h>
111#include <vm/vm_kern.h>
112#include <mach/vm_param.h>
113#include <mach/vm_prot.h>
114#include <vm/vm_object.h>
115#include <vm/vm_page.h>
116
117#include <mach/machine/vm_param.h>
118#include <machine/thread.h>
119
120#include <kern/misc_protos.h>			/* prototyping */
121#include <i386/misc_protos.h>
122#include <i386/i386_lowmem.h>
123#include <x86_64/lowglobals.h>
124
125#include <i386/cpuid.h>
126#include <i386/cpu_data.h>
127#include <i386/cpu_number.h>
128#include <i386/machine_cpu.h>
129#include <i386/seg.h>
130#include <i386/serial_io.h>
131#include <i386/cpu_capabilities.h>
132#include <i386/machine_routines.h>
133#include <i386/proc_reg.h>
134#include <i386/tsc.h>
135#include <i386/pmap_internal.h>
136#include <i386/pmap_pcid.h>
137
138#include <vm/vm_protos.h>
139
140#include <i386/mp.h>
141#include <i386/mp_desc.h>
142#include <libkern/kernel_mach_header.h>
143
144#include <pexpert/i386/efi.h>
145
146
147#ifdef IWANTTODEBUG
148#undef	DEBUG
149#define DEBUG 1
150#define POSTCODE_DELAY 1
151#include <i386/postcode.h>
152#endif /* IWANTTODEBUG */
153
154#ifdef	PMAP_DEBUG
155#define DBG(x...)	kprintf("DBG: " x)
156#else
157#define DBG(x...)
158#endif
159/* Compile time assert to ensure adjacency/alignment of per-CPU data fields used
160 * in the trampolines for kernel/user boundary TLB coherency.
161 */
162char pmap_cpu_data_assert[(((offsetof(cpu_data_t, cpu_tlb_invalid) - offsetof(cpu_data_t, cpu_active_cr3)) == 8) && (offsetof(cpu_data_t, cpu_active_cr3) % 64 == 0)) ? 1 : -1];
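/*
 * The assert works by sizing a char array: if either condition fails, the
 * array size collapses to -1 and compilation stops.  The conditions pin
 * cpu_tlb_invalid at exactly 8 bytes past cpu_active_cr3 and require
 * cpu_active_cr3 to be 64-byte (cache-line) aligned within cpu_data_t.
 */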
163boolean_t pmap_trace = FALSE;
164
165boolean_t	no_shared_cr3 = DEBUG;		/* TRUE for DEBUG by default */
166
167int nx_enabled = 1;			/* enable no-execute protection */
168int allow_data_exec  = VM_ABI_32;	/* 32-bit apps may execute data by default, 64-bit apps may not */
169int allow_stack_exec = 0;		/* No apps may execute from the stack by default */
170
171const boolean_t cpu_64bit  = TRUE; /* Mais oui! */
172
173uint64_t max_preemption_latency_tsc = 0;
174
175pv_hashed_entry_t     *pv_hash_table;  /* hash lists */
176
177uint32_t npvhash = 0;
178
179pv_hashed_entry_t	pv_hashed_free_list = PV_HASHED_ENTRY_NULL;
180pv_hashed_entry_t	pv_hashed_kern_free_list = PV_HASHED_ENTRY_NULL;
181decl_simple_lock_data(,pv_hashed_free_list_lock)
182decl_simple_lock_data(,pv_hashed_kern_free_list_lock)
183decl_simple_lock_data(,pv_hash_table_lock)
184
185zone_t		pv_hashed_list_zone;	/* zone of pv_hashed_entry structures */
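/*
 * Physical-to-virtual ("pv") bookkeeping: each managed physical page has a
 * rooted pv entry recording one mapping, and any additional mappings of the
 * same page hang on pv_hashed_entry chains reached through pv_hash_table.
 * The free lists above recycle hashed entries for user and kernel use.
 */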
186
187/*
188 *	First and last physical addresses that we maintain any information
189 *	for.  Initialized to zero so that pmap operations done before
190 *	pmap_init won't touch any non-existent structures.
191 */
192boolean_t	pmap_initialized = FALSE;/* Has pmap_init completed? */
193
194static struct vm_object kptobj_object_store;
195static struct vm_object kpml4obj_object_store;
196static struct vm_object kpdptobj_object_store;
197
198/*
 *	Array of physical page attributes for managed pages.
200 *	One byte per physical page.
201 */
202char		*pmap_phys_attributes;
203ppnum_t		last_managed_page = 0;
204
205/*
206 *	Amount of virtual memory mapped by one
207 *	page-directory entry.
208 */
209
210uint64_t pde_mapped_size = PDE_MAPPED_SIZE;
211
212unsigned pmap_memory_region_count;
213unsigned pmap_memory_region_current;
214
215pmap_memory_region_t pmap_memory_regions[PMAP_MEMORY_REGIONS_SIZE];
216
217/*
218 *	Other useful macros.
219 */
220#define current_pmap()		(vm_map_pmap(current_thread()->map))
221
222struct pmap	kernel_pmap_store;
223pmap_t		kernel_pmap;
224
225struct zone	*pmap_zone;		/* zone of pmap structures */
226
227struct zone	*pmap_anchor_zone;
228int		pmap_debug = 0;		/* flag for debugging prints */
229
230unsigned int	inuse_ptepages_count = 0;
231long long	alloc_ptepages_count __attribute__((aligned(8))) = 0; /* aligned for atomic access */
232unsigned int	bootstrap_wired_pages = 0;
233int		pt_fake_zone_index = -1;
234
235extern 	long	NMIPI_acks;
236
237boolean_t	kernel_text_ps_4K = TRUE;
238boolean_t	wpkernel = TRUE;
239
240extern char	end;
241
242static int	nkpt;
243
244pt_entry_t     *DMAP1, *DMAP2;
245caddr_t         DADDR1;
246caddr_t         DADDR2;
247
248const boolean_t	pmap_disable_kheap_nx = FALSE;
249const boolean_t	pmap_disable_kstack_nx = FALSE;
250extern boolean_t doconstro_override;
251
252extern long __stack_chk_guard[];
253
254/*
255 *	Map memory at initialization.  The physical addresses being
256 *	mapped are not managed and are never unmapped.
257 *
258 *	For now, VM is already on, we only need to map the
259 *	specified memory.
260 */
261vm_offset_t
262pmap_map(
263	vm_offset_t	virt,
264	vm_map_offset_t	start_addr,
265	vm_map_offset_t	end_addr,
266	vm_prot_t	prot,
267	unsigned int	flags)
268{
269	int		ps;
270
271	ps = PAGE_SIZE;
272	while (start_addr < end_addr) {
273		pmap_enter(kernel_pmap, (vm_map_offset_t)virt,
274			   (ppnum_t) i386_btop(start_addr), prot, VM_PROT_NONE, flags, TRUE);
275		virt += ps;
276		start_addr += ps;
277	}
278	return(virt);
279}
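/*
 * Illustrative sketch only (hypothetical addresses and flags): early boot
 * code could wire a one-page physical window into the kernel map with
 *
 *	vm_offset_t next_va = pmap_map(va, 0xfee00000ULL, 0xfee01000ULL,
 *				       VM_PROT_READ | VM_PROT_WRITE, 0);
 *
 * which enters one wired mapping per page and returns the next free
 * virtual address after the range.
 */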
280
281extern	char			*first_avail;
282extern	vm_offset_t		virtual_avail, virtual_end;
283extern	pmap_paddr_t		avail_start, avail_end;
284extern  vm_offset_t		sHIB;
285extern  vm_offset_t		eHIB;
286extern  vm_offset_t		stext;
287extern  vm_offset_t		etext;
288extern  vm_offset_t		sdata, edata;
289extern  vm_offset_t		sconstdata, econstdata;
290
291extern void			*KPTphys;
292
293boolean_t pmap_smep_enabled = FALSE;
294
295void
296pmap_cpu_init(void)
297{
298	cpu_data_t	*cdp = current_cpu_datap();
299	/*
300	 * Here early in the life of a processor (from cpu_mode_init()).
301	 * Ensure global page feature is disabled at this point.
302	 */
303
304	set_cr4(get_cr4() &~ CR4_PGE);
305
306	/*
307	 * Initialize the per-cpu, TLB-related fields.
308	 */
309	cdp->cpu_kernel_cr3 = kernel_pmap->pm_cr3;
310	cdp->cpu_active_cr3 = kernel_pmap->pm_cr3;
311	cdp->cpu_tlb_invalid = FALSE;
312	cdp->cpu_task_map = TASK_MAP_64BIT;
313	pmap_pcid_configure();
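	/*
	 * SMEP is turned on unless the "-pmap_smep_disable" boot-arg is
	 * present; only the argument's presence matters, the parsed value
	 * in nsmep is never examined.
	 */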
314	if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_SMEP) {
315		boolean_t nsmep;
316		if (!PE_parse_boot_argn("-pmap_smep_disable", &nsmep, sizeof(nsmep))) {
317			set_cr4(get_cr4() | CR4_SMEP);
318			pmap_smep_enabled = TRUE;
319		}
320	}
321
322	if (cdp->cpu_fixed_pmcs_enabled) {
323		boolean_t enable = TRUE;
324		cpu_pmc_control(&enable);
325	}
326}
327
328
329
330/*
331 *	Bootstrap the system enough to run with virtual memory.
332 *	Map the kernel's code and data, and allocate the system page table.
333 *	Called with mapping OFF.  Page_size must already be set.
334 */
335
336void
337pmap_bootstrap(
338	__unused vm_offset_t	load_start,
339	__unused boolean_t	IA32e)
340{
341#if NCOPY_WINDOWS > 0
342	vm_offset_t	va;
343	int i;
344#endif
345	assert(IA32e);
346
347	vm_last_addr = VM_MAX_KERNEL_ADDRESS;	/* Set the highest address
348						 * known to VM */
349	/*
350	 *	The kernel's pmap is statically allocated so we don't
351	 *	have to use pmap_create, which is unlikely to work
352	 *	correctly at this part of the boot sequence.
353	 */
354
355	kernel_pmap = &kernel_pmap_store;
356	kernel_pmap->ref_count = 1;
357	kernel_pmap->nx_enabled = TRUE;
358	kernel_pmap->pm_task_map = TASK_MAP_64BIT;
359	kernel_pmap->pm_obj = (vm_object_t) NULL;
360	kernel_pmap->dirbase = (pd_entry_t *)((uintptr_t)IdlePTD);
361	kernel_pmap->pm_pdpt = (pd_entry_t *) ((uintptr_t)IdlePDPT);
362	kernel_pmap->pm_pml4 = IdlePML4;
363	kernel_pmap->pm_cr3 = (uintptr_t)ID_MAP_VTOP(IdlePML4);
364	pmap_pcid_initialize_kernel(kernel_pmap);
365
366
367
368	current_cpu_datap()->cpu_kernel_cr3 = (addr64_t) kernel_pmap->pm_cr3;
369
370	nkpt = NKPT;
371	OSAddAtomic(NKPT,  &inuse_ptepages_count);
372	OSAddAtomic64(NKPT,  &alloc_ptepages_count);
373	bootstrap_wired_pages = NKPT;
374
375	virtual_avail = (vm_offset_t)(VM_MIN_KERNEL_ADDRESS) + (vm_offset_t)first_avail;
376	virtual_end = (vm_offset_t)(VM_MAX_KERNEL_ADDRESS);
377
378#if NCOPY_WINDOWS > 0
379	/*
380	 * Reserve some special page table entries/VA space for temporary
381	 * mapping of pages.
382	 */
383#define	SYSMAP(c, p, v, n)	\
384	v = (c)va; va += ((n)*INTEL_PGBYTES);
385
386	va = virtual_avail;
387
388        for (i=0; i<PMAP_NWINDOWS; i++) {
389#if 1
390	    kprintf("trying to do SYSMAP idx %d %p\n", i,
391	 	current_cpu_datap());
392	    kprintf("cpu_pmap %p\n", current_cpu_datap()->cpu_pmap);
393	    kprintf("mapwindow %p\n", current_cpu_datap()->cpu_pmap->mapwindow);
394	    kprintf("two stuff %p %p\n",
395		   (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
396                   (void *)(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR));
397#endif
398            SYSMAP(caddr_t,
399		   (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP),
400                   (current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CADDR),
401		   1);
402	    current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP =
403	        &(current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP_store);
404            *current_cpu_datap()->cpu_pmap->mapwindow[i].prv_CMAP = 0;
405        }
406
	/* DMAP entries used by the debugger */
408	SYSMAP(caddr_t, DMAP1, DADDR1, 1);
409	SYSMAP(caddr_t, DMAP2, DADDR2, 1);  /* XXX temporary - can remove */
410
411	virtual_avail = va;
412#endif
413
414	if (PE_parse_boot_argn("npvhash", &npvhash, sizeof (npvhash))) {
415		if (0 != ((npvhash + 1) & npvhash)) {
416			kprintf("invalid hash %d, must be ((2^N)-1), "
417				"using default %d\n", npvhash, NPVHASH);
418			npvhash = NPVHASH;
419		}
420	} else {
421		npvhash = NPVHASH;
422	}
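	/*
	 * The mask test above relies on npvhash being of the form (2^N)-1:
	 * e.g. 4095 passes since (4095 + 1) & 4095 == 0, whereas 4000 fails
	 * since 4001 & 4000 != 0 and the default NPVHASH is used instead.
	 */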
423
424	simple_lock_init(&kernel_pmap->lock, 0);
425	simple_lock_init(&pv_hashed_free_list_lock, 0);
426	simple_lock_init(&pv_hashed_kern_free_list_lock, 0);
427	simple_lock_init(&pv_hash_table_lock,0);
428
429	pmap_cpu_init();
430
431	if (pmap_pcid_ncpus)
432		printf("PMAP: PCID enabled\n");
433
434	if (pmap_smep_enabled)
435		printf("PMAP: Supervisor Mode Execute Protection enabled\n");
436
437#if	DEBUG
438	printf("Stack canary: 0x%lx\n", __stack_chk_guard[0]);
439	printf("ml_early_random(): 0x%qx\n", ml_early_random());
440#endif
441	boolean_t ptmp;
442	/* Check if the user has requested disabling stack or heap no-execute
443	 * enforcement. These are "const" variables; that qualifier is cast away
444	 * when altering them. The TEXT/DATA const sections are marked
445	 * write protected later in the kernel startup sequence, so altering
446	 * them is possible at this point, in pmap_bootstrap().
447	 */
448	if (PE_parse_boot_argn("-pmap_disable_kheap_nx", &ptmp, sizeof(ptmp))) {
449		boolean_t *pdknxp = (boolean_t *) &pmap_disable_kheap_nx;
450		*pdknxp = TRUE;
451	}
452
453	if (PE_parse_boot_argn("-pmap_disable_kstack_nx", &ptmp, sizeof(ptmp))) {
454		boolean_t *pdknhp = (boolean_t *) &pmap_disable_kstack_nx;
455		*pdknhp = TRUE;
456	}
457
458	boot_args *args = (boot_args *)PE_state.bootArgs;
459	if (args->efiMode == kBootArgsEfiMode32) {
460		printf("EFI32: kernel virtual space limited to 4GB\n");
461		virtual_end = VM_MAX_KERNEL_ADDRESS_EFI32;
462	}
463	kprintf("Kernel virtual space from 0x%lx to 0x%lx.\n",
464			(long)KERNEL_BASE, (long)virtual_end);
465	kprintf("Available physical space from 0x%llx to 0x%llx\n",
466			avail_start, avail_end);
467
468	/*
469	 * The -no_shared_cr3 boot-arg is a debugging feature (set by default
470	 * in the DEBUG kernel) to force the kernel to switch to its own map
471	 * (and cr3) when control is in kernelspace. The kernel's map does not
472	 * include (i.e. share) userspace so wild references will cause
473	 * a panic. Only copyin and copyout are exempt from this.
474	 */
475	(void) PE_parse_boot_argn("-no_shared_cr3",
476				  &no_shared_cr3, sizeof (no_shared_cr3));
477	if (no_shared_cr3)
478		kprintf("Kernel not sharing user map\n");
479
480#ifdef	PMAP_TRACES
481	if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
482		kprintf("Kernel traces for pmap operations enabled\n");
483	}
484#endif	/* PMAP_TRACES */
485}
486
487void
488pmap_virtual_space(
489	vm_offset_t *startp,
490	vm_offset_t *endp)
491{
492	*startp = virtual_avail;
493	*endp = virtual_end;
494}
495
496/*
497 *	Initialize the pmap module.
498 *	Called by vm_init, to initialize any structures that the pmap
499 *	system needs to map virtual memory.
500 */
501void
502pmap_init(void)
503{
504	long			npages;
505	vm_offset_t		addr;
506	vm_size_t		s, vsize;
507	vm_map_offset_t		vaddr;
508	ppnum_t ppn;
509
510
511	kernel_pmap->pm_obj_pml4 = &kpml4obj_object_store;
512	_vm_object_allocate((vm_object_size_t)NPML4PGS, &kpml4obj_object_store);
513
514	kernel_pmap->pm_obj_pdpt = &kpdptobj_object_store;
515	_vm_object_allocate((vm_object_size_t)NPDPTPGS, &kpdptobj_object_store);
516
517	kernel_pmap->pm_obj = &kptobj_object_store;
518	_vm_object_allocate((vm_object_size_t)NPDEPGS, &kptobj_object_store);
519
520	/*
521	 *	Allocate memory for the pv_head_table and its lock bits,
522	 *	the modify bit array, and the pte_page table.
523	 */
524
525	/*
	 * Zero-bias all these arrays (index from physical page 0 rather than
	 * from avail_start) so that they cover all of physical memory.
528	 */
529
530	npages = i386_btop(avail_end);
531	s = (vm_size_t) (sizeof(struct pv_rooted_entry) * npages
532			 + (sizeof (struct pv_hashed_entry_t *) * (npvhash+1))
533			 + pv_lock_table_size(npages)
534			 + pv_hash_lock_table_size((npvhash+1))
535				+ npages);
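	/*
	 * This single allocation is carved up below into the per-page rooted
	 * pv entries, the pv hash buckets, the two lock-bit arrays, and one
	 * attribute byte per physical page (the trailing "+ npages" term).
	 */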
536
537	s = round_page(s);
538	if (kernel_memory_allocate(kernel_map, &addr, s, 0,
539				   KMA_KOBJECT | KMA_PERMANENT)
540	    != KERN_SUCCESS)
541		panic("pmap_init");
542
543	memset((char *)addr, 0, s);
544
545	vaddr = addr;
546	vsize = s;
547
548#if PV_DEBUG
549	if (0 == npvhash) panic("npvhash not initialized");
550#endif
551
552	/*
553	 *	Allocate the structures first to preserve word-alignment.
554	 */
555	pv_head_table = (pv_rooted_entry_t) addr;
556	addr = (vm_offset_t) (pv_head_table + npages);
557
558	pv_hash_table = (pv_hashed_entry_t *)addr;
559	addr = (vm_offset_t) (pv_hash_table + (npvhash + 1));
560
561	pv_lock_table = (char *) addr;
562	addr = (vm_offset_t) (pv_lock_table + pv_lock_table_size(npages));
563
564	pv_hash_lock_table = (char *) addr;
565	addr = (vm_offset_t) (pv_hash_lock_table + pv_hash_lock_table_size((npvhash+1)));
566
567	pmap_phys_attributes = (char *) addr;
568
569	ppnum_t  last_pn = i386_btop(avail_end);
570        unsigned int i;
571	pmap_memory_region_t *pmptr = pmap_memory_regions;
572	for (i = 0; i < pmap_memory_region_count; i++, pmptr++) {
573		if (pmptr->type != kEfiConventionalMemory)
574			continue;
575		ppnum_t pn;
576		for (pn = pmptr->base; pn <= pmptr->end; pn++) {
577			if (pn < last_pn) {
578				pmap_phys_attributes[pn] |= PHYS_MANAGED;
579
580				if (pn > last_managed_page)
581					last_managed_page = pn;
582
583				if (pn >= lowest_hi && pn <= highest_hi)
584					pmap_phys_attributes[pn] |= PHYS_NOENCRYPT;
585			}
586		}
587	}
588	while (vsize) {
589		ppn = pmap_find_phys(kernel_pmap, vaddr);
590
591		pmap_phys_attributes[ppn] |= PHYS_NOENCRYPT;
592
593		vaddr += PAGE_SIZE;
594		vsize -= PAGE_SIZE;
595	}
596	/*
597	 *	Create the zone of physical maps,
598	 *	and of the physical-to-virtual entries.
599	 */
600	s = (vm_size_t) sizeof(struct pmap);
601	pmap_zone = zinit(s, 400*s, 4096, "pmap"); /* XXX */
602        zone_change(pmap_zone, Z_NOENCRYPT, TRUE);
603
604	pmap_anchor_zone = zinit(PAGE_SIZE, task_max, PAGE_SIZE, "pagetable anchors");
605	zone_change(pmap_anchor_zone, Z_NOENCRYPT, TRUE);
606
607	/* The anchor is required to be page aligned. Zone debugging adds
608	 * padding which may violate that requirement. Tell the zone
609	 * subsystem that alignment is required.
610	 */
611
612	zone_change(pmap_anchor_zone, Z_ALIGNMENT_REQUIRED, TRUE);
613
614	s = (vm_size_t) sizeof(struct pv_hashed_entry);
615	pv_hashed_list_zone = zinit(s, 10000*s /* Expandable zone */,
616	    4096 * 3 /* LCM x86_64*/, "pv_list");
617	zone_change(pv_hashed_list_zone, Z_NOENCRYPT, TRUE);
618
619	/* create pv entries for kernel pages mapped by low level
620	   startup code.  these have to exist so we can pmap_remove()
621	   e.g. kext pages from the middle of our addr space */
622
623	vaddr = (vm_map_offset_t) VM_MIN_KERNEL_ADDRESS;
624	for (ppn = VM_MIN_KERNEL_PAGE; ppn < i386_btop(avail_start); ppn++) {
625		pv_rooted_entry_t pv_e;
626
627		pv_e = pai_to_pvh(ppn);
628		pv_e->va = vaddr;
629		vaddr += PAGE_SIZE;
630		pv_e->pmap = kernel_pmap;
631		queue_init(&pv_e->qlink);
632	}
633	pmap_initialized = TRUE;
634
635	max_preemption_latency_tsc = tmrCvt((uint64_t)MAX_PREEMPTION_LATENCY_NS, tscFCvtn2t);
636
637	/*
638	 * Ensure the kernel's PML4 entry exists for the basement
639	 * before this is shared with any user.
640	 */
641	pmap_expand_pml4(kernel_pmap, KERNEL_BASEMENT, PMAP_EXPAND_OPTIONS_NONE);
642}
643
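/*
 * pmap_mark_range: apply NX and/or read-only attributes to the page-aligned
 * virtual range [sv, sv + nxrosz) in npmap.  2MB superpage mappings are
 * adjusted at their PDE, 4KB mappings at each PTE; entries are modified in
 * place and no TLB flush is issued here, so callers must take care of that.
 */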
644static
645void pmap_mark_range(pmap_t npmap, uint64_t sv, uint64_t nxrosz, boolean_t NX, boolean_t ro) {
646	uint64_t ev = sv + nxrosz, cv = sv;
647	pd_entry_t *pdep;
648	pt_entry_t *ptep = NULL;
649
650	assert(((sv & 0xFFFULL) | (nxrosz & 0xFFFULL)) == 0);
651
652	for (pdep = pmap_pde(npmap, cv); pdep != NULL && (cv < ev);) {
653		uint64_t pdev = (cv & ~((uint64_t)PDEMASK));
654
655		if (*pdep & INTEL_PTE_PS) {
656			if (NX)
657				*pdep |= INTEL_PTE_NX;
658			if (ro)
659				*pdep &= ~INTEL_PTE_WRITE;
660			cv += NBPD;
661			cv &= ~((uint64_t) PDEMASK);
662			pdep = pmap_pde(npmap, cv);
663			continue;
664		}
665
666		for (ptep = pmap_pte(npmap, cv); ptep != NULL && (cv < (pdev + NBPD)) && (cv < ev);) {
667			if (NX)
668				*ptep |= INTEL_PTE_NX;
669			if (ro)
670				*ptep &= ~INTEL_PTE_WRITE;
671			cv += NBPT;
672			ptep = pmap_pte(npmap, cv);
673		}
674	}
675	DPRINTF("%s(0x%llx, 0x%llx, %u, %u): 0x%llx, 0x%llx\n", __FUNCTION__, sv, nxrosz, NX, ro, cv, ptep ? *ptep: 0);
676}
677
678/*
679 * Called once VM is fully initialized so that we can release unused
680 * sections of low memory to the general pool.
681 * Also complete the set-up of identity-mapped sections of the kernel:
682 *  1) write-protect kernel text
683 *  2) map kernel text using large pages if possible
684 *  3) read and write-protect page zero (for K32)
685 *  4) map the global page at the appropriate virtual address.
686 *
687 * Use of large pages
688 * ------------------
689 * To effectively map and write-protect all kernel text pages, the text
690 * must be 2M-aligned at the base, and the data section above must also be
691 * 2M-aligned. That is, there's padding below and above. This is achieved
692 * through linker directives. Large pages are used only if this alignment
 * exists (and not overridden by the -kernel_text_ps_4K boot-arg). The
694 * memory layout is:
695 *
696 *                       :                :
697 *                       |     __DATA     |
698 *               sdata:  ==================  2Meg
699 *                       |                |
700 *                       |  zero-padding  |
701 *                       |                |
702 *               etext:  ------------------
703 *                       |                |
704 *                       :                :
705 *                       |                |
706 *                       |     __TEXT     |
707 *                       |                |
708 *                       :                :
709 *                       |                |
710 *               stext:  ==================  2Meg
711 *                       |                |
712 *                       |  zero-padding  |
713 *                       |                |
714 *               eHIB:   ------------------
715 *                       |     __HIB      |
716 *                       :                :
717 *
718 * Prior to changing the mapping from 4K to 2M, the zero-padding pages
719 * [eHIB,stext] and [etext,sdata] are ml_static_mfree()'d. Then all the
720 * 4K pages covering [stext,etext] are coalesced as 2M large pages.
721 * The now unused level-1 PTE pages are also freed.
722 */
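/*
 * For scale: one 2MB large page replaces 512 4KB PTEs, so remapping
 * [stext, sdata) with I386_LPGBYTES entries lets pmap_lowmem_finalize()
 * free one 4KB level-1 page-table page for every 2MB of kernel text.
 */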
723extern ppnum_t	vm_kernel_base_page;
724void
725pmap_lowmem_finalize(void)
726{
727	spl_t           spl;
728	int		i;
729
730	/*
731	 * Update wired memory statistics for early boot pages
732	 */
733	PMAP_ZINFO_PALLOC(kernel_pmap, bootstrap_wired_pages * PAGE_SIZE);
734
735	/*
736	 * Free pages in pmap regions below the base:
737	 * rdar://6332712
738	 *	We can't free all the pages to VM that EFI reports available.
739	 *	Pages in the range 0xc0000-0xff000 aren't safe over sleep/wake.
740	 *	There's also a size miscalculation here: pend is one page less
741	 *	than it should be but this is not fixed to be backwards
742	 *	compatible.
 * This is important for KASLR because up to 256*2MB = 512MB of space
 * has to be released to VM.
745	 */
746	for (i = 0;
747	     pmap_memory_regions[i].end < vm_kernel_base_page;
748	     i++) {
749		vm_offset_t	pbase = i386_ptob(pmap_memory_regions[i].base);
750		vm_offset_t	pend  = i386_ptob(pmap_memory_regions[i].end+1);
751
752		DBG("pmap region %d [%p..[%p\n",
753		    i, (void *) pbase, (void *) pend);
754
755		if (pmap_memory_regions[i].attribute & EFI_MEMORY_KERN_RESERVED)
756			continue;
757		/*
758		 * rdar://6332712
759		 * Adjust limits not to free pages in range 0xc0000-0xff000.
760		 */
761		if (pbase >= 0xc0000 && pend <= 0x100000)
762			continue;
763		if (pbase < 0xc0000 && pend > 0x100000) {
			/* the reserved range lies entirely inside this region: free the part below it */
765			DBG("- ml_static_mfree(%p,%p)\n",
766			    (void *) ml_static_ptovirt(pbase),
767			    (void *) (0xc0000-pbase));
768			ml_static_mfree(ml_static_ptovirt(pbase),0xc0000-pbase);
769			pbase = 0x100000;
770		}
771		if (pbase < 0xc0000)
772			pend = MIN(pend, 0xc0000);
773		if (pend  > 0x100000)
774			pbase = MAX(pbase, 0x100000);
775		DBG("- ml_static_mfree(%p,%p)\n",
776		    (void *) ml_static_ptovirt(pbase),
777		    (void *) (pend - pbase));
778		ml_static_mfree(ml_static_ptovirt(pbase), pend - pbase);
779	}
780
781	/* A final pass to get rid of all initial identity mappings to
782	 * low pages.
783	 */
784	DPRINTF("%s: Removing mappings from 0->0x%lx\n", __FUNCTION__, vm_kernel_base);
785
786	/* Remove all mappings past the descriptor aliases and low globals */
787	pmap_remove(kernel_pmap, LOWGLOBAL_ALIAS + PAGE_SIZE, vm_kernel_base);
788
789	/*
790	 * If text and data are both 2MB-aligned,
791	 * we can map text with large-pages,
792	 * unless the -kernel_text_ps_4K boot-arg overrides.
793	 */
794	if ((stext & I386_LPGMASK) == 0 && (sdata & I386_LPGMASK) == 0) {
795		kprintf("Kernel text is 2MB aligned");
796		kernel_text_ps_4K = FALSE;
797		if (PE_parse_boot_argn("-kernel_text_ps_4K",
798				       &kernel_text_ps_4K,
799				       sizeof (kernel_text_ps_4K)))
800			kprintf(" but will be mapped with 4K pages\n");
801		else
802			kprintf(" and will be mapped with 2M pages\n");
803	}
804
805	(void) PE_parse_boot_argn("wpkernel", &wpkernel, sizeof (wpkernel));
806	if (wpkernel)
807		kprintf("Kernel text %p-%p to be write-protected\n",
808			(void *) stext, (void *) etext);
809
810	spl = splhigh();
811
812	/*
813	 * Scan over text if mappings are to be changed:
814	 * - Remap kernel text readonly unless the "wpkernel" boot-arg is 0
	 * - Change to large-pages if possible and not overridden.
816	 */
817	if (kernel_text_ps_4K && wpkernel) {
818		vm_offset_t     myva;
819		for (myva = stext; myva < etext; myva += PAGE_SIZE) {
820			pt_entry_t     *ptep;
821
822			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
823			if (ptep)
824				pmap_store_pte(ptep, *ptep & ~INTEL_PTE_WRITE);
825		}
826	}
827
828	if (!kernel_text_ps_4K) {
829		vm_offset_t     myva;
830
831		/*
832		 * Release zero-filled page padding used for 2M-alignment.
833		 */
834		DBG("ml_static_mfree(%p,%p) for padding below text\n",
835			(void *) eHIB, (void *) (stext - eHIB));
836		ml_static_mfree(eHIB, stext - eHIB);
837		DBG("ml_static_mfree(%p,%p) for padding above text\n",
838			(void *) etext, (void *) (sdata - etext));
839		ml_static_mfree(etext, sdata - etext);
840
841		/*
842		 * Coalesce text pages into large pages.
843		 */
844		for (myva = stext; myva < sdata; myva += I386_LPGBYTES) {
845			pt_entry_t	*ptep;
846			vm_offset_t	pte_phys;
847			pt_entry_t	*pdep;
848			pt_entry_t	pde;
849
850			pdep = pmap_pde(kernel_pmap, (vm_map_offset_t)myva);
851			ptep = pmap_pte(kernel_pmap, (vm_map_offset_t)myva);
852			DBG("myva: %p pdep: %p ptep: %p\n",
853				(void *) myva, (void *) pdep, (void *) ptep);
854			if ((*ptep & INTEL_PTE_VALID) == 0)
855				continue;
856			pte_phys = (vm_offset_t)(*ptep & PG_FRAME);
857			pde = *pdep & PTMASK;	/* page attributes from pde */
858			pde |= INTEL_PTE_PS;	/* make it a 2M entry */
859			pde |= pte_phys;	/* take page frame from pte */
860
861			if (wpkernel)
862				pde &= ~INTEL_PTE_WRITE;
863			DBG("pmap_store_pte(%p,0x%llx)\n",
864				(void *)pdep, pde);
865			pmap_store_pte(pdep, pde);
866
867			/*
868			 * Free the now-unused level-1 pte.
869			 * Note: ptep is a virtual address to the pte in the
870			 *   recursive map. We can't use this address to free
871			 *   the page. Instead we need to compute its address
872			 *   in the Idle PTEs in "low memory".
873			 */
874			vm_offset_t vm_ptep = (vm_offset_t) KPTphys
875						+ (pte_phys >> PTPGSHIFT);
876			DBG("ml_static_mfree(%p,0x%x) for pte\n",
877				(void *) vm_ptep, PAGE_SIZE);
878			ml_static_mfree(vm_ptep, PAGE_SIZE);
879		}
880
881		/* Change variable read by sysctl machdep.pmap */
882		pmap_kernel_text_ps = I386_LPGBYTES;
883	}
884
885	boolean_t doconstro = TRUE;
886
887	(void) PE_parse_boot_argn("dataconstro", &doconstro, sizeof(doconstro));
888
889	if ((sconstdata | econstdata) & PAGE_MASK) {
890		kprintf("Const DATA misaligned 0x%lx 0x%lx\n", sconstdata, econstdata);
891		if ((sconstdata & PAGE_MASK) || (doconstro_override == FALSE))
892			doconstro = FALSE;
893	}
894
895	if ((sconstdata > edata) || (sconstdata < sdata) || ((econstdata - sconstdata) >= (edata - sdata))) {
896		kprintf("Const DATA incorrect size 0x%lx 0x%lx 0x%lx 0x%lx\n", sconstdata, econstdata, sdata, edata);
897		doconstro = FALSE;
898	}
899
900	if (doconstro)
901		kprintf("Marking const DATA read-only\n");
902
903	vm_offset_t dva;
904
905	for (dva = sdata; dva < edata; dva += I386_PGBYTES) {
906		assert(((sdata | edata) & PAGE_MASK) == 0);
907		if ( (sdata | edata) & PAGE_MASK) {
908			kprintf("DATA misaligned, 0x%lx, 0x%lx\n", sdata, edata);
909			break;
910		}
911
912		pt_entry_t dpte, *dptep = pmap_pte(kernel_pmap, dva);
913
914		dpte = *dptep;
915
916		assert((dpte & INTEL_PTE_VALID));
917		if ((dpte & INTEL_PTE_VALID) == 0) {
918			kprintf("Missing data mapping 0x%lx 0x%lx 0x%lx\n", dva, sdata, edata);
919			continue;
920		}
921
922		dpte |= INTEL_PTE_NX;
923		if (doconstro && (dva >= sconstdata) && (dva < econstdata)) {
924			dpte &= ~INTEL_PTE_WRITE;
925		}
926		pmap_store_pte(dptep, dpte);
927	}
928	kernel_segment_command_t * seg;
929	kernel_section_t         * sec;
930
931	for (seg = firstseg(); seg != NULL; seg = nextsegfromheader(&_mh_execute_header, seg)) {
932		if (!strcmp(seg->segname, "__TEXT") ||
933		    !strcmp(seg->segname, "__DATA")) {
934			continue;
935		}
936		//XXX
937		if (!strcmp(seg->segname, "__KLD")) {
938			continue;
939		}
940		if (!strcmp(seg->segname, "__HIB")) {
941			for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
942				if (sec->addr & PAGE_MASK)
943					panic("__HIB segment's sections misaligned");
944				if (!strcmp(sec->sectname, "__text")) {
945					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), FALSE, TRUE);
946				} else {
947					pmap_mark_range(kernel_pmap, sec->addr, round_page(sec->size), TRUE, FALSE);
948				}
949			}
950		} else {
951			pmap_mark_range(kernel_pmap, seg->vmaddr, round_page_64(seg->vmsize), TRUE, FALSE);
952		}
953	}
954
955	/*
956	 * If we're debugging, map the low global vector page at the fixed
957	 * virtual address.  Otherwise, remove the mapping for this.
958	 */
959	if (debug_boot_arg) {
960		pt_entry_t *pte = NULL;
961		if (0 == (pte = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS)))
962			panic("lowmem pte");
963		/* make sure it is defined on page boundary */
964		assert(0 == ((vm_offset_t) &lowGlo & PAGE_MASK));
965		pmap_store_pte(pte, kvtophys((vm_offset_t)&lowGlo)
966					| INTEL_PTE_REF
967					| INTEL_PTE_MOD
968					| INTEL_PTE_WIRED
969					| INTEL_PTE_VALID
970					| INTEL_PTE_WRITE
971					| INTEL_PTE_NX);
972	} else {
973		pmap_remove(kernel_pmap,
974			    LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE);
975	}
976
977	splx(spl);
978	if (pmap_pcid_ncpus)
979		tlb_flush_global();
980	else
981		flush_tlb_raw();
982}
983
984/*
 * this function is only used for debugging from the vm layer
986 */
987boolean_t
988pmap_verify_free(
989		 ppnum_t pn)
990{
991	pv_rooted_entry_t	pv_h;
992	int		pai;
993	boolean_t	result;
994
995	assert(pn != vm_page_fictitious_addr);
996
997	if (!pmap_initialized)
998		return(TRUE);
999
1000	if (pn == vm_page_guard_addr)
1001		return TRUE;
1002
1003	pai = ppn_to_pai(pn);
1004	if (!IS_MANAGED_PAGE(pai))
1005		return(FALSE);
1006	pv_h = pai_to_pvh(pn);
1007	result = (pv_h->pmap == PMAP_NULL);
1008	return(result);
1009}
1010
1011boolean_t
1012pmap_is_empty(
1013       pmap_t          pmap,
1014       vm_map_offset_t va_start,
1015       vm_map_offset_t va_end)
1016{
1017	vm_map_offset_t offset;
1018	ppnum_t         phys_page;
1019
1020	if (pmap == PMAP_NULL) {
1021		return TRUE;
1022	}
1023
1024	/*
1025	 * Check the resident page count
1026	 * - if it's zero, the pmap is completely empty.
1027	 * This short-circuit test prevents a virtual address scan which is
1028	 * painfully slow for 64-bit spaces.
	 * This assumes the count is correct; the debug kernel ought to verify
	 * it, perhaps by walking the page tables.
1031	 */
1032	if (pmap->stats.resident_count == 0)
1033		return TRUE;
1034
1035	for (offset = va_start;
1036	     offset < va_end;
1037	     offset += PAGE_SIZE_64) {
1038		phys_page = pmap_find_phys(pmap, offset);
1039		if (phys_page) {
1040			kprintf("pmap_is_empty(%p,0x%llx,0x%llx): "
1041				"page %d at 0x%llx\n",
1042				pmap, va_start, va_end, phys_page, offset);
1043			return FALSE;
1044		}
1045	}
1046
1047	return TRUE;
1048}
1049
1050
1051/*
1052 *	Create and return a physical map.
1053 *
1054 *	If the size specified for the map
1055 *	is zero, the map is an actual physical
1056 *	map, and may be referenced by the
1057 *	hardware.
1058 *
1059 *	If the size specified is non-zero,
1060 *	the map will be used in software only, and
1061 *	is bounded by that size.
1062 */
1063pmap_t
1064pmap_create(
1065	ledger_t		ledger,
1066	    vm_map_size_t	sz,
1067	    boolean_t		is_64bit)
1068{
1069	pmap_t		p;
1070	vm_size_t	size;
1071	pml4_entry_t    *pml4;
1072	pml4_entry_t    *kpml4;
1073
1074	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START,
1075		   (uint32_t) (sz>>32), (uint32_t) sz, is_64bit, 0, 0);
1076
1077	size = (vm_size_t) sz;
1078
1079	/*
	 *	A software use-only map doesn't even need a pmap.
1081	 */
1082
1083	if (size != 0) {
1084		return(PMAP_NULL);
1085	}
1086
1087	p = (pmap_t) zalloc(pmap_zone);
1088	if (PMAP_NULL == p)
1089		panic("pmap_create zalloc");
1090	/* Zero all fields */
1091	bzero(p, sizeof(*p));
1092	/* init counts now since we'll be bumping some */
1093	simple_lock_init(&p->lock, 0);
1094	p->stats.resident_count = 0;
1095	p->stats.resident_max = 0;
1096	p->stats.wired_count = 0;
1097	p->ref_count = 1;
1098	p->nx_enabled = 1;
1099	p->pm_shared = FALSE;
1100	ledger_reference(ledger);
1101	p->ledger = ledger;
1102
	p->pm_task_map = is_64bit ? TASK_MAP_64BIT : TASK_MAP_32BIT;
1104	if (pmap_pcid_ncpus)
1105		pmap_pcid_initialize(p);
1106
1107	p->pm_pml4 = zalloc(pmap_anchor_zone);
1108
1109	pmap_assert((((uintptr_t)p->pm_pml4) & PAGE_MASK) == 0);
1110
1111	memset((char *)p->pm_pml4, 0, PAGE_SIZE);
1112
1113	p->pm_cr3 = (pmap_paddr_t)kvtophys((vm_offset_t)p->pm_pml4);
1114
1115	/* allocate the vm_objs to hold the pdpt, pde and pte pages */
1116
1117	p->pm_obj_pml4 = vm_object_allocate((vm_object_size_t)(NPML4PGS));
1118	if (NULL == p->pm_obj_pml4)
		panic("pmap_create pml4 obj");
1120
1121	p->pm_obj_pdpt = vm_object_allocate((vm_object_size_t)(NPDPTPGS));
1122	if (NULL == p->pm_obj_pdpt)
1123		panic("pmap_create pdpt obj");
1124
1125	p->pm_obj = vm_object_allocate((vm_object_size_t)(NPDEPGS));
1126	if (NULL == p->pm_obj)
1127		panic("pmap_create pte obj");
1128
1129	/* All pmaps share the kernel's pml4 */
1130	pml4 = pmap64_pml4(p, 0ULL);
1131	kpml4 = kernel_pmap->pm_pml4;
1132	pml4[KERNEL_PML4_INDEX]    = kpml4[KERNEL_PML4_INDEX];
1133	pml4[KERNEL_KEXTS_INDEX]   = kpml4[KERNEL_KEXTS_INDEX];
1134	pml4[KERNEL_PHYSMAP_PML4_INDEX] = kpml4[KERNEL_PHYSMAP_PML4_INDEX];
1135
	PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
1137		   p, is_64bit, 0, 0, 0);
1138
1139	return(p);
1140}
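/*
 * Illustrative sketch (not an actual call site): a new 64-bit address space
 * would typically obtain its hardware map as
 *
 *	pmap_t pmap = pmap_create(ledger, 0, TRUE);
 *
 * with size 0 selecting a real, hardware-referenced map; the map is released
 * with pmap_destroy() once its last reference is dropped.
 */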
1141
1142/*
1143 *	Retire the given physical map from service.
1144 *	Should only be called if the map contains
1145 *	no valid mappings.
1146 */
1147
1148void
1149pmap_destroy(pmap_t	p)
1150{
1151	int		c;
1152
1153	if (p == PMAP_NULL)
1154		return;
1155
1156	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
1157		   p, 0, 0, 0, 0);
1158
1159	PMAP_LOCK(p);
1160
1161	c = --p->ref_count;
1162
1163	pmap_assert((current_thread() && (current_thread()->map)) ? (current_thread()->map->pmap != p) : TRUE);
1164
1165	if (c == 0) {
1166		/*
1167		 * If some cpu is not using the physical pmap pointer that it
1168		 * is supposed to be (see set_dirbase), we might be using the
1169		 * pmap that is being destroyed! Make sure we are
1170		 * physically on the right pmap:
1171		 */
1172		PMAP_UPDATE_TLBS(p, 0x0ULL, 0xFFFFFFFFFFFFF000ULL);
1173		if (pmap_pcid_ncpus)
1174			pmap_destroy_pcid_sync(p);
1175	}
1176
1177	PMAP_UNLOCK(p);
1178
1179	if (c != 0) {
1180		PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1181			   p, 1, 0, 0, 0);
1182		pmap_assert(p == kernel_pmap);
1183	        return;	/* still in use */
1184	}
1185
1186	/*
1187	 *	Free the memory maps, then the
1188	 *	pmap structure.
1189	 */
1190	int inuse_ptepages = 0;
1191
1192	zfree(pmap_anchor_zone, p->pm_pml4);
1193
1194	inuse_ptepages += p->pm_obj_pml4->resident_page_count;
1195	vm_object_deallocate(p->pm_obj_pml4);
1196
1197	inuse_ptepages += p->pm_obj_pdpt->resident_page_count;
1198	vm_object_deallocate(p->pm_obj_pdpt);
1199
1200	inuse_ptepages += p->pm_obj->resident_page_count;
1201	vm_object_deallocate(p->pm_obj);
1202
1203	OSAddAtomic(-inuse_ptepages,  &inuse_ptepages_count);
1204	PMAP_ZINFO_PFREE(p, inuse_ptepages * PAGE_SIZE);
1205	ledger_dereference(p->ledger);
1206	zfree(pmap_zone, p);
1207
1208	PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END,
1209		   0, 0, 0, 0, 0);
1210}
1211
1212/*
1213 *	Add a reference to the specified pmap.
1214 */
1215
1216void
1217pmap_reference(pmap_t	p)
1218{
1219	if (p != PMAP_NULL) {
1220	        PMAP_LOCK(p);
1221		p->ref_count++;
		PMAP_UNLOCK(p);
1223	}
1224}
1225
1226/*
1227 *	Remove phys addr if mapped in specified map
1228 *
1229 */
1230void
1231pmap_remove_some_phys(
1232	__unused pmap_t		map,
1233	__unused ppnum_t         pn)
1234{
1235
1236/* Implement to support working set code */
1237
1238}
1239
1240/*
1241 *	Set the physical protection on the
1242 *	specified range of this map as requested.
1243 *	Will not increase permissions.
1244 */
1245void
1246pmap_protect(
1247	pmap_t		map,
1248	vm_map_offset_t	sva,
1249	vm_map_offset_t	eva,
1250	vm_prot_t	prot)
1251{
1252	pt_entry_t	*pde;
1253	pt_entry_t	*spte, *epte;
1254	vm_map_offset_t lva;
1255	vm_map_offset_t orig_sva;
1256	boolean_t       set_NX;
1257	int             num_found = 0;
1258
1259	pmap_intr_assert();
1260
1261	if (map == PMAP_NULL)
1262		return;
1263
1264	if (prot == VM_PROT_NONE) {
1265		pmap_remove(map, sva, eva);
1266		return;
1267	}
1268	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
1269		   map,
1270		   (uint32_t) (sva >> 32), (uint32_t) sva,
1271		   (uint32_t) (eva >> 32), (uint32_t) eva);
1272
1273	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !map->nx_enabled)
1274		set_NX = FALSE;
1275	else
1276		set_NX = TRUE;
1277
1278	PMAP_LOCK(map);
1279
1280	orig_sva = sva;
1281	while (sva < eva) {
1282		lva = (sva + pde_mapped_size) & ~(pde_mapped_size - 1);
1283		if (lva > eva)
1284			lva = eva;
1285		pde = pmap_pde(map, sva);
1286		if (pde && (*pde & INTEL_PTE_VALID)) {
1287			if (*pde & INTEL_PTE_PS) {
1288				/* superpage */
1289				spte = pde;
1290				epte = spte+1; /* excluded */
1291			} else {
1292				spte = pmap_pte(map, (sva & ~(pde_mapped_size - 1)));
1293				spte = &spte[ptenum(sva)];
1294				epte = &spte[intel_btop(lva - sva)];
1295			}
1296
1297			for (; spte < epte; spte++) {
1298				if (!(*spte & INTEL_PTE_VALID))
1299					continue;
1300
1301				if (prot & VM_PROT_WRITE)
1302					pmap_update_pte(spte, 0, INTEL_PTE_WRITE);
1303				else
1304					pmap_update_pte(spte, INTEL_PTE_WRITE, 0);
1305
1306				if (set_NX)
1307					pmap_update_pte(spte, 0, INTEL_PTE_NX);
1308				else
1309					pmap_update_pte(spte, INTEL_PTE_NX, 0);
1310				num_found++;
1311			}
1312		}
1313		sva = lva;
1314	}
1315	if (num_found)
1316		PMAP_UPDATE_TLBS(map, orig_sva, eva);
1317
1318	PMAP_UNLOCK(map);
1319
1320	PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END,
1321		   0, 0, 0, 0, 0);
1322
1323}
1324
1325/* Map a (possibly) autogenned block */
1326void
1327pmap_map_block(
1328	pmap_t		pmap,
1329	addr64_t	va,
1330	ppnum_t 	pa,
1331	uint32_t	size,
1332	vm_prot_t	prot,
1333	int		attr,
1334	__unused unsigned int	flags)
1335{
1336	uint32_t        page;
1337	int		cur_page_size;
1338
1339	if (attr & VM_MEM_SUPERPAGE)
1340		cur_page_size =  SUPERPAGE_SIZE;
1341	else
1342		cur_page_size =  PAGE_SIZE;
1343
1344	for (page = 0; page < size; page+=cur_page_size/PAGE_SIZE) {
1345		pmap_enter(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE);
1346		va += cur_page_size;
1347		pa+=cur_page_size/PAGE_SIZE;
1348	}
1349}
1350
1351kern_return_t
1352pmap_expand_pml4(
1353	pmap_t		map,
1354	vm_map_offset_t	vaddr,
1355	unsigned int options)
1356{
1357	vm_page_t	m;
1358	pmap_paddr_t	pa;
1359	uint64_t	i;
1360	ppnum_t		pn;
1361	pml4_entry_t	*pml4p;
1362
1363	DBG("pmap_expand_pml4(%p,%p)\n", map, (void *)vaddr);
1364
1365	/*
1366	 *	Allocate a VM page for the pml4 page
1367	 */
1368	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1369		if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1370			return KERN_RESOURCE_SHORTAGE;
1371		VM_PAGE_WAIT();
1372	}
1373	/*
1374	 *	put the page into the pmap's obj list so it
1375	 *	can be found later.
1376	 */
1377	pn = m->phys_page;
1378	pa = i386_ptob(pn);
1379	i = pml4idx(map, vaddr);
1380
1381	/*
1382	 *	Zero the page.
1383	 */
1384	pmap_zero_page(pn);
1385
1386	vm_page_lockspin_queues();
1387	vm_page_wire(m);
1388	vm_page_unlock_queues();
1389
1390	OSAddAtomic(1,  &inuse_ptepages_count);
1391	OSAddAtomic64(1,  &alloc_ptepages_count);
1392	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1393
	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1395	vm_object_lock(map->pm_obj_pml4);
1396
1397	PMAP_LOCK(map);
1398	/*
1399	 *	See if someone else expanded us first
1400	 */
1401	if (pmap64_pdpt(map, vaddr) != PDPT_ENTRY_NULL) {
1402	        PMAP_UNLOCK(map);
1403		vm_object_unlock(map->pm_obj_pml4);
1404
1405		VM_PAGE_FREE(m);
1406
1407		OSAddAtomic(-1,  &inuse_ptepages_count);
1408		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1409		return KERN_SUCCESS;
1410	}
1411
1412#if 0 /* DEBUG */
1413       if (0 != vm_page_lookup(map->pm_obj_pml4, (vm_object_offset_t)i)) {
1414	       panic("pmap_expand_pml4: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1415		     map, map->pm_obj_pml4, vaddr, i);
1416       }
1417#endif
1418	vm_page_insert(m, map->pm_obj_pml4, (vm_object_offset_t)i);
1419	vm_object_unlock(map->pm_obj_pml4);
1420
1421	/*
1422	 *	Set the page directory entry for this page table.
1423	 */
1424	pml4p = pmap64_pml4(map, vaddr); /* refetch under lock */
1425
1426	pmap_store_pte(pml4p, pa_to_pte(pa)
1427				| INTEL_PTE_VALID
1428				| INTEL_PTE_USER
1429				| INTEL_PTE_WRITE);
1430
1431	PMAP_UNLOCK(map);
1432
1433	return KERN_SUCCESS;
1434}
1435
1436kern_return_t
1437pmap_expand_pdpt(pmap_t map, vm_map_offset_t vaddr, unsigned int options)
1438{
1439	vm_page_t	m;
1440	pmap_paddr_t	pa;
1441	uint64_t	i;
1442	ppnum_t		pn;
1443	pdpt_entry_t	*pdptp;
1444
1445	DBG("pmap_expand_pdpt(%p,%p)\n", map, (void *)vaddr);
1446
1447	while ((pdptp = pmap64_pdpt(map, vaddr)) == PDPT_ENTRY_NULL) {
1448		kern_return_t pep4kr = pmap_expand_pml4(map, vaddr, options);
1449		if (pep4kr != KERN_SUCCESS)
1450			return pep4kr;
1451	}
1452
1453	/*
1454	 *	Allocate a VM page for the pdpt page
1455	 */
1456	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1457		if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1458			return KERN_RESOURCE_SHORTAGE;
1459		VM_PAGE_WAIT();
1460	}
1461
1462	/*
1463	 *	put the page into the pmap's obj list so it
1464	 *	can be found later.
1465	 */
1466	pn = m->phys_page;
1467	pa = i386_ptob(pn);
1468	i = pdptidx(map, vaddr);
1469
1470	/*
1471	 *	Zero the page.
1472	 */
1473	pmap_zero_page(pn);
1474
1475	vm_page_lockspin_queues();
1476	vm_page_wire(m);
1477	vm_page_unlock_queues();
1478
1479	OSAddAtomic(1,  &inuse_ptepages_count);
1480	OSAddAtomic64(1,  &alloc_ptepages_count);
1481	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1482
	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1484	vm_object_lock(map->pm_obj_pdpt);
1485
1486	PMAP_LOCK(map);
1487	/*
1488	 *	See if someone else expanded us first
1489	 */
1490	if (pmap64_pde(map, vaddr) != PD_ENTRY_NULL) {
1491		PMAP_UNLOCK(map);
1492		vm_object_unlock(map->pm_obj_pdpt);
1493
1494		VM_PAGE_FREE(m);
1495
1496		OSAddAtomic(-1,  &inuse_ptepages_count);
1497		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1498		return KERN_SUCCESS;
1499	}
1500
1501#if 0 /* DEBUG */
1502       if (0 != vm_page_lookup(map->pm_obj_pdpt, (vm_object_offset_t)i)) {
1503	       panic("pmap_expand_pdpt: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1504		     map, map->pm_obj_pdpt, vaddr, i);
1505       }
1506#endif
1507	vm_page_insert(m, map->pm_obj_pdpt, (vm_object_offset_t)i);
1508	vm_object_unlock(map->pm_obj_pdpt);
1509
1510	/*
1511	 *	Set the page directory entry for this page table.
1512	 */
1513	pdptp = pmap64_pdpt(map, vaddr); /* refetch under lock */
1514
1515	pmap_store_pte(pdptp, pa_to_pte(pa)
1516				| INTEL_PTE_VALID
1517				| INTEL_PTE_USER
1518				| INTEL_PTE_WRITE);
1519
1520	PMAP_UNLOCK(map);
1521
1522	return KERN_SUCCESS;
1523
1524}
1525
1526
1527
1528/*
1529 *	Routine:	pmap_expand
1530 *
1531 *	Expands a pmap to be able to map the specified virtual address.
1532 *
 *	Allocates a new page-table page covering the region that contains
 *	vaddr and installs it in the pmap's page directory, expanding the
 *	upper paging levels first if necessary.
1536 *
1537 *	Must be called with the pmap system and the pmap unlocked,
1538 *	since these must be unlocked to use vm_allocate or vm_deallocate.
1539 *	Thus it must be called in a loop that checks whether the map
1540 *	has been expanded enough.
1541 *	(We won't loop forever, since page tables aren't shrunk.)
1542 */
1543kern_return_t
1544pmap_expand(
1545	pmap_t		map,
1546	vm_map_offset_t	vaddr,
1547	unsigned int options)
1548{
1549	pt_entry_t		*pdp;
1550	register vm_page_t	m;
1551	register pmap_paddr_t	pa;
1552	uint64_t		i;
1553	ppnum_t                 pn;
1554
1555
1556	/*
1557 	 * For the kernel, the virtual address must be in or above the basement
	 * which is for kexts and is in the 512GB immediately below the kernel.
1559	 * XXX - should use VM_MIN_KERNEL_AND_KEXT_ADDRESS not KERNEL_BASEMENT
1560	 */
1561	if (map == kernel_pmap &&
1562	    !(vaddr >= KERNEL_BASEMENT && vaddr <= VM_MAX_KERNEL_ADDRESS))
1563		panic("pmap_expand: bad vaddr 0x%llx for kernel pmap", vaddr);
1564
1565
1566	while ((pdp = pmap64_pde(map, vaddr)) == PD_ENTRY_NULL) {
1567		kern_return_t pepkr = pmap_expand_pdpt(map, vaddr, options);
1568		if (pepkr != KERN_SUCCESS)
1569			return pepkr;
1570	}
1571
1572	/*
1573	 *	Allocate a VM page for the pde entries.
1574	 */
1575	while ((m = vm_page_grab()) == VM_PAGE_NULL) {
1576		if (options & PMAP_EXPAND_OPTIONS_NOWAIT)
1577			return KERN_RESOURCE_SHORTAGE;
1578		VM_PAGE_WAIT();
1579	}
1580
1581	/*
1582	 *	put the page into the pmap's obj list so it
1583	 *	can be found later.
1584	 */
1585	pn = m->phys_page;
1586	pa = i386_ptob(pn);
1587	i = pdeidx(map, vaddr);
1588
1589	/*
1590	 *	Zero the page.
1591	 */
1592	pmap_zero_page(pn);
1593
1594	vm_page_lockspin_queues();
1595	vm_page_wire(m);
1596	vm_page_unlock_queues();
1597
1598	OSAddAtomic(1,  &inuse_ptepages_count);
1599	OSAddAtomic64(1,  &alloc_ptepages_count);
1600	PMAP_ZINFO_PALLOC(map, PAGE_SIZE);
1601
	/* Take the object lock (mutex) before the PMAP_LOCK (spinlock) */
1603	vm_object_lock(map->pm_obj);
1604
1605	PMAP_LOCK(map);
1606
1607	/*
1608	 *	See if someone else expanded us first
1609	 */
1610	if (pmap_pte(map, vaddr) != PT_ENTRY_NULL) {
1611		PMAP_UNLOCK(map);
1612		vm_object_unlock(map->pm_obj);
1613
1614		VM_PAGE_FREE(m);
1615
1616		OSAddAtomic(-1,  &inuse_ptepages_count);
1617		PMAP_ZINFO_PFREE(map, PAGE_SIZE);
1618		return KERN_SUCCESS;
1619	}
1620
1621#if 0 /* DEBUG */
1622       if (0 != vm_page_lookup(map->pm_obj, (vm_object_offset_t)i)) {
	       panic("pmap_expand: obj not empty, pmap %p pm_obj %p vaddr 0x%llx i 0x%llx\n",
1624		     map, map->pm_obj, vaddr, i);
1625       }
1626#endif
1627	vm_page_insert(m, map->pm_obj, (vm_object_offset_t)i);
1628	vm_object_unlock(map->pm_obj);
1629
1630	/*
1631	 *	Set the page directory entry for this page table.
1632	 */
1633	pdp = pmap_pde(map, vaddr);
1634	pmap_store_pte(pdp, pa_to_pte(pa)
1635				| INTEL_PTE_VALID
1636				| INTEL_PTE_USER
1637				| INTEL_PTE_WRITE);
1638
1639	PMAP_UNLOCK(map);
1640
1641	return KERN_SUCCESS;
1642}
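/*
 * Note that pmap_expand(), pmap_expand_pdpt() and pmap_expand_pml4() form a
 * chain: each level first ensures the level above it is populated, so a
 * single pmap_expand() call can materialize the PML4, PDPT and PD entries
 * needed to map vaddr.
 */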
1643
1644/* On K64 machines with more than 32GB of memory, pmap_steal_memory
1645 * will allocate past the 1GB of pre-expanded virtual kernel area. This
1646 * function allocates all the page tables using memory from the same pool
1647 * that pmap_steal_memory uses, rather than calling vm_page_grab (which
1648 * isn't available yet). */
1649void
1650pmap_pre_expand(pmap_t pmap, vm_map_offset_t vaddr)
1651{
1652	ppnum_t pn;
1653	pt_entry_t		*pte;
1654
1655	PMAP_LOCK(pmap);
1656
1657	if(pmap64_pdpt(pmap, vaddr) == PDPT_ENTRY_NULL) {
1658		if (!pmap_next_page_hi(&pn))
1659			panic("pmap_pre_expand");
1660
1661		pmap_zero_page(pn);
1662
1663		pte = pmap64_pml4(pmap, vaddr);
1664
1665		pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
1666				| INTEL_PTE_VALID
1667				| INTEL_PTE_USER
1668				| INTEL_PTE_WRITE);
1669	}
1670
1671	if(pmap64_pde(pmap, vaddr) == PD_ENTRY_NULL) {
1672		if (!pmap_next_page_hi(&pn))
1673			panic("pmap_pre_expand");
1674
1675		pmap_zero_page(pn);
1676
1677		pte = pmap64_pdpt(pmap, vaddr);
1678
1679		pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
1680				| INTEL_PTE_VALID
1681				| INTEL_PTE_USER
1682				| INTEL_PTE_WRITE);
1683	}
1684
1685	if(pmap_pte(pmap, vaddr) == PT_ENTRY_NULL) {
1686		if (!pmap_next_page_hi(&pn))
1687			panic("pmap_pre_expand");
1688
1689		pmap_zero_page(pn);
1690
1691		pte = pmap64_pde(pmap, vaddr);
1692
1693		pmap_store_pte(pte, pa_to_pte(i386_ptob(pn))
1694				| INTEL_PTE_VALID
1695				| INTEL_PTE_USER
1696				| INTEL_PTE_WRITE);
1697	}
1698
1699	PMAP_UNLOCK(pmap);
1700}
1701
1702/*
1703 * pmap_sync_page_data_phys(ppnum_t pa)
1704 *
1705 * Invalidates all of the instruction cache on a physical page and
1706 * pushes any dirty data from the data cache for the same physical page
1707 * Not required in i386.
1708 */
1709void
1710pmap_sync_page_data_phys(__unused ppnum_t pa)
1711{
1712	return;
1713}
1714
1715/*
1716 * pmap_sync_page_attributes_phys(ppnum_t pa)
1717 *
1718 * Write back and invalidate all cachelines on a physical page.
1719 */
1720void
1721pmap_sync_page_attributes_phys(ppnum_t pa)
1722{
1723	cache_flush_page_phys(pa);
1724}
1725
1726
1727
1728#ifdef CURRENTLY_UNUSED_AND_UNTESTED
1729
1730int	collect_ref;
1731int	collect_unref;
1732
1733/*
1734 *	Routine:	pmap_collect
1735 *	Function:
1736 *		Garbage collects the physical map system for
1737 *		pages which are no longer used.
1738 *		Success need not be guaranteed -- that is, there
1739 *		may well be pages which are not referenced, but
1740 *		others may be collected.
1741 *	Usage:
1742 *		Called by the pageout daemon when pages are scarce.
1743 */
1744void
1745pmap_collect(
1746	pmap_t 		p)
1747{
1748	register pt_entry_t	*pdp, *ptp;
1749	pt_entry_t		*eptp;
1750	int			wired;
1751
1752	if (p == PMAP_NULL)
1753		return;
1754
1755	if (p == kernel_pmap)
1756		return;
1757
1758	/*
1759	 *	Garbage collect map.
1760	 */
1761	PMAP_LOCK(p);
1762
1763	for (pdp = (pt_entry_t *)p->dirbase;
1764	     pdp < (pt_entry_t *)&p->dirbase[(UMAXPTDI+1)];
1765	     pdp++)
1766	{
1767	   if (*pdp & INTEL_PTE_VALID) {
1768	      if(*pdp & INTEL_PTE_REF) {
1769		pmap_store_pte(pdp, *pdp & ~INTEL_PTE_REF);
1770		collect_ref++;
1771	      } else {
1772		collect_unref++;
1773		ptp = pmap_pte(p, pdetova(pdp - (pt_entry_t *)p->dirbase));
1774		eptp = ptp + NPTEPG;
1775
1776		/*
1777		 * If the pte page has any wired mappings, we cannot
1778		 * free it.
1779		 */
1780		wired = 0;
1781		{
1782		    register pt_entry_t *ptep;
1783		    for (ptep = ptp; ptep < eptp; ptep++) {
1784			if (iswired(*ptep)) {
1785			    wired = 1;
1786			    break;
1787			}
1788		    }
1789		}
1790		if (!wired) {
1791		    /*
1792		     * Remove the virtual addresses mapped by this pte page.
1793		     */
1794		    pmap_remove_range(p,
1795				pdetova(pdp - (pt_entry_t *)p->dirbase),
1796				ptp,
1797				eptp);
1798
1799		    /*
1800		     * Invalidate the page directory pointer.
1801		     */
1802		    pmap_store_pte(pdp, 0x0);
1803
1804		    PMAP_UNLOCK(p);
1805
1806		    /*
1807		     * And free the pte page itself.
1808		     */
1809		    {
1810			register vm_page_t m;
1811
1812			vm_object_lock(p->pm_obj);
1813
1814			m = vm_page_lookup(p->pm_obj,(vm_object_offset_t)(pdp - (pt_entry_t *)&p->dirbase[0]));
1815			if (m == VM_PAGE_NULL)
1816			    panic("pmap_collect: pte page not in object");
1817
1818			vm_object_unlock(p->pm_obj);
1819
1820			VM_PAGE_FREE(m);
1821
1822			OSAddAtomic(-1,  &inuse_ptepages_count);
1823			PMAP_ZINFO_PFREE(p, PAGE_SIZE);
1824		    }
1825
1826		    PMAP_LOCK(p);
1827		}
1828	      }
1829	   }
1830	}
1831
1832	PMAP_UPDATE_TLBS(p, 0x0, 0xFFFFFFFFFFFFF000ULL);
1833	PMAP_UNLOCK(p);
1834	return;
1835
1836}
1837#endif
1838
1839
1840void
1841pmap_copy_page(ppnum_t src, ppnum_t dst)
1842{
1843	bcopy_phys((addr64_t)i386_ptob(src),
1844		   (addr64_t)i386_ptob(dst),
1845		   PAGE_SIZE);
1846}
1847
1848
1849/*
1850 *	Routine:	pmap_pageable
1851 *	Function:
1852 *		Make the specified pages (by pmap, offset)
1853 *		pageable (or not) as requested.
1854 *
1855 *		A page which is not pageable may not take
1856 *		a fault; therefore, its page table entry
1857 *		must remain valid for the duration.
1858 *
1859 *		This routine is merely advisory; pmap_enter
1860 *		will specify that these pages are to be wired
1861 *		down (or not) as appropriate.
1862 */
1863void
1864pmap_pageable(
1865	__unused pmap_t			pmap,
1866	__unused vm_map_offset_t	start_addr,
1867	__unused vm_map_offset_t	end_addr,
1868	__unused boolean_t		pageable)
1869{
1870#ifdef	lint
1871	pmap++; start_addr++; end_addr++; pageable++;
1872#endif	/* lint */
1873}
1874
1875void
1876invalidate_icache(__unused vm_offset_t	addr,
1877		  __unused unsigned	cnt,
1878		  __unused int		phys)
1879{
1880	return;
1881}
1882
1883void
1884flush_dcache(__unused vm_offset_t	addr,
1885	     __unused unsigned		count,
1886	     __unused int		phys)
1887{
1888	return;
1889}
1890
1891#if CONFIG_DTRACE
1892/*
1893 * Constrain DTrace copyin/copyout actions
1894 */
1895extern kern_return_t dtrace_copyio_preflight(addr64_t);
1896extern kern_return_t dtrace_copyio_postflight(addr64_t);
1897
1898kern_return_t dtrace_copyio_preflight(__unused addr64_t va)
1899{
1900	thread_t thread = current_thread();
1901	uint64_t ccr3;
1902
1903	if (current_map() == kernel_map)
1904		return KERN_FAILURE;
1905	else if (((ccr3 = get_cr3_base()) != thread->map->pmap->pm_cr3) && (no_shared_cr3 == FALSE))
1906		return KERN_FAILURE;
1907	else if (no_shared_cr3 && (ccr3 != kernel_pmap->pm_cr3))
1908		return KERN_FAILURE;
1909	else if (thread->machine.specFlags & CopyIOActive)
1910		return KERN_FAILURE;
1911	else
1912		return KERN_SUCCESS;
1913}
1914
1915kern_return_t dtrace_copyio_postflight(__unused addr64_t va)
1916{
1917	return KERN_SUCCESS;
1918}
#endif /* CONFIG_DTRACE */

#include <mach_vm_debug.h>
#if	MACH_VM_DEBUG
#include <vm/vm_debug.h>

int
pmap_list_resident_pages(
	__unused pmap_t		pmap,
	__unused vm_offset_t	*listp,
	__unused int		space)
{
	return 0;
}
#endif	/* MACH_VM_DEBUG */



/* temporary workaround */
boolean_t
coredumpok(__unused vm_map_t map, __unused vm_offset_t va)
{
#if 0
	pt_entry_t	*ptep;

	ptep = pmap_pte(map->pmap, va);
	if (0 == ptep)
		return FALSE;
	return ((*ptep & (INTEL_PTE_NCACHE | INTEL_PTE_WIRED)) != (INTEL_PTE_NCACHE | INTEL_PTE_WIRED));
#else
	return TRUE;
#endif
}


boolean_t
phys_page_exists(ppnum_t pn)
{
	assert(pn != vm_page_fictitious_addr);

	if (!pmap_initialized)
		return TRUE;

	if (pn == vm_page_guard_addr)
		return FALSE;

	if (!IS_MANAGED_PAGE(ppn_to_pai(pn)))
		return FALSE;

	return TRUE;
}



void
pmap_switch(pmap_t tpmap)
{
	spl_t	s;

	s = splhigh();		/* Make sure interruptions are disabled */
	set_dirbase(tpmap, current_thread());
	splx(s);
}
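
/*
 * Usage sketch, kept under #if 0 so it is not compiled: pmap_switch()
 * raises spl itself, so a caller simply passes the pmap of the address
 * space the current CPU should run in; "new_task" is a hypothetical
 * task pointer.
 */
#if 0
static void
pmap_switch_example(task_t new_task)
{
	pmap_switch(new_task->map->pmap);	/* load the task's page tables */
}
#endif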


/*
 * Disable the no-execute capability on
 * the specified pmap.
 */
void
pmap_disable_NX(pmap_t pmap)
{
	pmap->nx_enabled = 0;
}

void
pt_fake_zone_init(int zone_index)
{
	pt_fake_zone_index = zone_index;
}

void
pt_fake_zone_info(
	int		*count,
	vm_size_t	*cur_size,
	vm_size_t	*max_size,
	vm_size_t	*elem_size,
	vm_size_t	*alloc_size,
	uint64_t	*sum_size,
	int		*collectable,
	int		*exhaustable,
	int		*caller_acct)
{
	*count      = inuse_ptepages_count;
	*cur_size   = PAGE_SIZE * inuse_ptepages_count;
	*max_size   = PAGE_SIZE * (inuse_ptepages_count +
				   vm_page_inactive_count +
				   vm_page_active_count +
				   vm_page_free_count);
	*elem_size  = PAGE_SIZE;
	*alloc_size = PAGE_SIZE;
	*sum_size   = alloc_ptepages_count * PAGE_SIZE;

	*collectable = 1;
	*exhaustable = 0;
	*caller_acct = 1;
}

static inline void
pmap_cpuset_NMIPI(cpu_set cpu_mask) {
	unsigned int cpu, cpu_bit;
	uint64_t deadline;

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (cpu_mask & cpu_bit)
			cpu_NMI_interrupt(cpu);
	}
	deadline = mach_absolute_time() + (LockTimeOut);
	while (mach_absolute_time() < deadline)
		cpu_pause();
}

/*
 * Called with the pmap locked, we:
 *  - scan the per-cpu data to see which other cpus need to be flushed
 *  - send an IPI to each non-idle cpu that needs to be flushed
 *  - wait for each signalled cpu to acknowledge, or observe that it is
 *    inactive / at a safe point (idle) and needs no signal
 *  - flush the local tlb if it is active for this pmap
 *  - return ... the caller will unlock the pmap
 */

void
pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv)
{
	unsigned int	cpu;
	unsigned int	cpu_bit;
	cpu_set		cpus_to_signal;
	unsigned int	my_cpu = cpu_number();
	pmap_paddr_t	pmap_cr3 = pmap->pm_cr3;
	boolean_t	flush_self = FALSE;
	uint64_t	deadline;
	boolean_t	pmap_is_shared = (pmap->pm_shared || (pmap == kernel_pmap));

	assert((processor_avail_count < 2) ||
	       (ml_get_interrupts_enabled() && get_preemption_level() != 0));

	/*
	 * Scan other cpus for matching active or task CR3.
	 * For idle cpus (with no active map) we mark them invalid but
	 * don't signal -- they'll check as they go busy.
	 */
	cpus_to_signal = 0;

	if (pmap_pcid_ncpus) {
		pmap_pcid_invalidate_all_cpus(pmap);
		__asm__ volatile("mfence":::"memory");
	}

	for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
		if (!cpu_datap(cpu)->cpu_running)
			continue;
		uint64_t	cpu_active_cr3 = CPU_GET_ACTIVE_CR3(cpu);
		uint64_t	cpu_task_cr3 = CPU_GET_TASK_CR3(cpu);

		if ((pmap_cr3 == cpu_task_cr3) ||
		    (pmap_cr3 == cpu_active_cr3) ||
		    (pmap_is_shared)) {
			if (cpu == my_cpu) {
				flush_self = TRUE;
				continue;
			}
			if (pmap_pcid_ncpus && pmap_is_shared)
				cpu_datap(cpu)->cpu_tlb_invalid_global = TRUE;
			else
				cpu_datap(cpu)->cpu_tlb_invalid_local = TRUE;
			__asm__ volatile("mfence":::"memory");

			/*
			 * We don't need to signal processors which will flush
			 * lazily at the idle state or kernel boundary.
			 * For example, if we're invalidating the kernel pmap,
			 * processors currently in userspace don't need to flush
			 * their TLBs until the next time they enter the kernel.
			 * Alterations to the address space of a task active
			 * on a remote processor result in a signal, to
			 * account for copy operations. (There may be room
			 * for optimization in such cases).
			 * The order of the loads below with respect
			 * to the store to the "cpu_tlb_invalid" field above
			 * is important--hence the barrier.
			 */
			if (CPU_CR3_IS_ACTIVE(cpu) &&
			    (pmap_cr3 == CPU_GET_ACTIVE_CR3(cpu) ||
			    pmap->pm_shared ||
			    (pmap_cr3 == CPU_GET_TASK_CR3(cpu)))) {
				cpus_to_signal |= cpu_bit;
				i386_signal_cpu(cpu, MP_TLB_FLUSH, ASYNC);
			}
		}
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_START,
		   pmap, cpus_to_signal, flush_self, startv, endv);

	/*
	 * Flush local tlb if required.
	 * Do this now to overlap with other processors responding.
	 */
	if (flush_self) {
		if (pmap_pcid_ncpus) {
			pmap_pcid_validate_cpu(pmap, my_cpu);
			if (pmap_is_shared)
				tlb_flush_global();
			else
				flush_tlb_raw();
		}
		else
			flush_tlb_raw();
	}

	if (cpus_to_signal) {
		cpu_set	cpus_to_respond = cpus_to_signal;

		deadline = mach_absolute_time() + LockTimeOut;
		/*
		 * Wait for those other cpus to acknowledge
		 */
		while (cpus_to_respond != 0) {
			long orig_acks = 0;

			for (cpu = 0, cpu_bit = 1; cpu < real_ncpus; cpu++, cpu_bit <<= 1) {
				/* Consider checking local/global invalidity
				 * as appropriate in the PCID case.
				 */
				if ((cpus_to_respond & cpu_bit) != 0) {
					if (!cpu_datap(cpu)->cpu_running ||
					    cpu_datap(cpu)->cpu_tlb_invalid == FALSE ||
					    !CPU_CR3_IS_ACTIVE(cpu)) {
						cpus_to_respond &= ~cpu_bit;
					}
					cpu_pause();
				}
				if (cpus_to_respond == 0)
					break;
			}
			if (cpus_to_respond && (mach_absolute_time() > deadline)) {
				if (machine_timeout_suspended())
					continue;
				pmap_tlb_flush_timeout = TRUE;
				orig_acks = NMIPI_acks;
				pmap_cpuset_NMIPI(cpus_to_respond);

				panic("TLB invalidation IPI timeout: "
				    "CPU(s) failed to respond to interrupts, unresponsive CPU bitmap: 0x%lx, NMIPI acks: orig: 0x%lx, now: 0x%lx",
				    cpus_to_respond, orig_acks, NMIPI_acks);
			}
		}
	}

	if (__improbable((pmap == kernel_pmap) && (flush_self != TRUE))) {
		panic("pmap_flush_tlbs: pmap == kernel_pmap && flush_self != TRUE; kernel CR3: 0x%llX, CPU active CR3: 0x%llX, CPU Task Map: %d", kernel_pmap->pm_cr3, current_cpu_datap()->cpu_active_cr3, current_cpu_datap()->cpu_task_map);
	}

	PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__FLUSH_TLBS) | DBG_FUNC_END,
	    pmap, cpus_to_signal, startv, endv, 0);
}
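
/*
 * Usage sketch, kept under #if 0 so it is not compiled: it mirrors the
 * contract documented above pmap_flush_tlbs(): modify PTEs with the
 * pmap locked, request the shootdown for the affected VA range, then
 * unlock.  clear_pte_range() is a hypothetical PTE-update helper.
 */
#if 0
static void
pmap_flush_tlbs_example(pmap_t map, vm_map_offset_t sva, vm_map_offset_t eva)
{
	PMAP_LOCK(map);
	clear_pte_range(map, sva, eva);	/* hypothetical PTE modification */
	pmap_flush_tlbs(map, sva, eva);	/* shoot down stale TLB entries */
	PMAP_UNLOCK(map);		/* caller unlocks, per the contract above */
}
#endif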

void
process_pmap_updates(void)
{
	int ccpu = cpu_number();
	pmap_assert(ml_get_interrupts_enabled() == 0 || get_preemption_level() != 0);
	if (pmap_pcid_ncpus) {
		pmap_pcid_validate_current();
		if (cpu_datap(ccpu)->cpu_tlb_invalid_global) {
			cpu_datap(ccpu)->cpu_tlb_invalid = FALSE;
			tlb_flush_global();
		}
		else {
			cpu_datap(ccpu)->cpu_tlb_invalid_local = FALSE;
			flush_tlb_raw();
		}
	}
	else {
		current_cpu_datap()->cpu_tlb_invalid = FALSE;
		flush_tlb_raw();
	}

	__asm__ volatile("mfence");
}

void
pmap_update_interrupt(void)
{
	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_START,
		   0, 0, 0, 0, 0);

	process_pmap_updates();

	PMAP_TRACE(PMAP_CODE(PMAP__UPDATE_INTERRUPT) | DBG_FUNC_END,
		   0, 0, 0, 0, 0);
}

#include <mach/mach_vm.h>	/* mach_vm_region_recurse() */
/* Scan kernel pmap for W+X PTEs, scan kernel VM map for W+X map entries
 * and identify ranges with mismatched VM permissions and PTE permissions.
 */
kern_return_t
pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset_t ev)
{
	vm_offset_t cv = sv;
	kern_return_t rv = KERN_SUCCESS;
	uint64_t skip4 = 0, skip2 = 0;

	sv &= ~PAGE_MASK_64;
	ev &= ~PAGE_MASK_64;
	while (cv < ev) {
		if (__improbable((cv > 0x00007FFFFFFFFFFFULL) &&
			(cv < 0xFFFF800000000000ULL))) {
			cv = 0xFFFF800000000000ULL;
		}
		/* Potential inconsistencies, from not holding the pmap lock,
		 * are harmless for the moment.
		 */
		if (((cv & PML4MASK) == 0) && (pmap64_pml4(ipmap, cv) == 0)) {
			if ((cv + NBPML4) > cv)
				cv += NBPML4;
			else
				break;
			skip4++;
			continue;
		}
		if (((cv & PDMASK) == 0) && (pmap_pde(ipmap, cv) == 0)) {
			if ((cv + NBPD) > cv)
				cv += NBPD;
			else
				break;
			skip2++;
			continue;
		}

		pt_entry_t *ptep = pmap_pte(ipmap, cv);
		if (ptep && (*ptep & INTEL_PTE_VALID)) {
			if (*ptep & INTEL_PTE_WRITE) {
				if (!(*ptep & INTEL_PTE_NX)) {
					kprintf("W+X PTE at 0x%lx, P4: 0x%llx, P3: 0x%llx, P2: 0x%llx, PT: 0x%llx, VP: %u\n", cv, *pmap64_pml4(ipmap, cv), *pmap64_pdpt(ipmap, cv), *pmap64_pde(ipmap, cv), *ptep, pmap_valid_page((ppnum_t)(i386_btop(pte_to_pa(*ptep)))));
					rv = KERN_FAILURE;
				}
			}
		}
		cv += PAGE_SIZE;
	}
	kprintf("Completed pmap scan\n");
	cv = sv;

	struct vm_region_submap_info_64 vbr;
	mach_msg_type_number_t vbrcount = 0;
	mach_vm_size_t	vmsize;
	vm_prot_t	prot;
	uint32_t nesting_depth = 0;
	kern_return_t kret;

	while (cv < ev) {

		for (;;) {
			vbrcount = VM_REGION_SUBMAP_INFO_COUNT_64;
			if ((kret = mach_vm_region_recurse(ivmmap,
				    (mach_vm_address_t *) &cv, &vmsize, &nesting_depth,
					(vm_region_recurse_info_t)&vbr,
					&vbrcount)) != KERN_SUCCESS) {
				break;
			}

			if (vbr.is_submap) {
				nesting_depth++;
				continue;
			} else {
				break;
			}
		}

		if (kret != KERN_SUCCESS)
			break;

		prot = vbr.protection;

		if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
			kprintf("W+X map entry at address 0x%lx\n", cv);
			rv = KERN_FAILURE;
		}

		if (prot) {
			vm_offset_t pcv;
			for (pcv = cv; pcv < cv + vmsize; pcv += PAGE_SIZE) {
				pt_entry_t *ptep = pmap_pte(ipmap, pcv);
				vm_prot_t tprot;

				if ((ptep == NULL) || !(*ptep & INTEL_PTE_VALID))
					continue;
				tprot = VM_PROT_READ;
				if (*ptep & INTEL_PTE_WRITE)
					tprot |= VM_PROT_WRITE;
				if ((*ptep & INTEL_PTE_NX) == 0)
					tprot |= VM_PROT_EXECUTE;
				if (tprot != prot) {
					kprintf("PTE/map entry permissions mismatch at address 0x%lx, pte: 0x%llx, protection: 0x%x\n", pcv, *ptep, prot);
					rv = KERN_FAILURE;
				}
			}
		}
		cv += vmsize;
	}
	return rv;
}

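/*
 * Usage sketch, kept under #if 0 so it is not compiled: a debug caller
 * might sweep the kernel's own mappings like this; the kernel address
 * bounds below are illustrative choices, not a requirement of the
 * interface.
 */
#if 0
static void
pmap_permissions_verify_example(void)
{
	if (pmap_permissions_verify(kernel_pmap, kernel_map,
	    VM_MIN_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS) != KERN_SUCCESS)
		kprintf("pmap_permissions_verify: W+X or mismatched mappings found\n");
}
#endif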