1/*-
2 * Copyright (c) 1991, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 *    may be used to endorse or promote products derived from this software
18 *    without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
33 *
34 *
35 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
36 * All rights reserved.
37 *
38 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
39 *
40 * Permission to use, copy, modify and distribute this software and
41 * its documentation is hereby granted, provided that both the copyright
42 * notice and this permission notice appear in all copies of the
43 * software, derivative works or modified versions, and any portions
44 * thereof, and that both notices appear in supporting documentation.
45 *
46 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
47 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
48 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
49 *
50 * Carnegie Mellon requests users of this software to return to
51 *
52 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
53 *  School of Computer Science
54 *  Carnegie Mellon University
55 *  Pittsburgh PA 15213-3890
56 *
57 * any improvements or extensions that they make and grant Carnegie the
58 * rights to redistribute these changes.
59 */
60
61/*
62 *	Virtual memory mapping module.
63 */
64
65#include <sys/cdefs.h>
66__FBSDID("$FreeBSD: stable/10/sys/vm/vm_map.c 326523 2017-12-04 10:05:59Z kib $");
67
68#include <sys/param.h>
69#include <sys/systm.h>
70#include <sys/kernel.h>
71#include <sys/ktr.h>
72#include <sys/lock.h>
73#include <sys/mutex.h>
74#include <sys/proc.h>
75#include <sys/vmmeter.h>
76#include <sys/mman.h>
77#include <sys/vnode.h>
78#include <sys/racct.h>
79#include <sys/resourcevar.h>
80#include <sys/rwlock.h>
81#include <sys/file.h>
82#include <sys/sysctl.h>
83#include <sys/sysent.h>
84#include <sys/shm.h>
85
86#include <vm/vm.h>
87#include <vm/vm_param.h>
88#include <vm/pmap.h>
89#include <vm/vm_map.h>
90#include <vm/vm_page.h>
91#include <vm/vm_object.h>
92#include <vm/vm_pager.h>
93#include <vm/vm_kern.h>
94#include <vm/vm_extern.h>
95#include <vm/vnode_pager.h>
96#include <vm/swap_pager.h>
97#include <vm/uma.h>
98
99/*
100 *	Virtual memory maps provide for the mapping, protection,
101 *	and sharing of virtual memory objects.  In addition,
102 *	this module provides for an efficient virtual copy of
103 *	memory from one map to another.
104 *
105 *	Synchronization is required prior to most operations.
106 *
107 *	Maps consist of an ordered doubly-linked list of simple
108 *	entries; a self-adjusting binary search tree of these
109 *	entries is used to speed up lookups.
110 *
111 *	Since portions of maps are specified by start/end addresses,
112 *	which may not align with existing map entries, all
113 *	routines merely "clip" entries to these start/end values.
114 *	[That is, an entry is split into two, bordering at a
115 *	start or end value.]  Note that these clippings may not
116 *	always be necessary (as the two resulting entries are then
117 *	not changed); however, the clipping is done for convenience.
118 *
119 *	As mentioned above, virtual copy operations are performed
120 *	by copying VM object references from one map to
121 *	another, and then marking both regions as copy-on-write.
122 */
123
124static struct mtx map_sleep_mtx;
125static uma_zone_t mapentzone;
126static uma_zone_t kmapentzone;
127static uma_zone_t mapzone;
128static uma_zone_t vmspace_zone;
129static int vmspace_zinit(void *mem, int size, int flags);
static int vm_map_zinit(void *mem, int size, int flags);
131static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
132    vm_offset_t max);
133static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
134static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
135static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
136static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
137    vm_map_entry_t gap_entry);
138#ifdef INVARIANTS
139static void vm_map_zdtor(void *mem, int size, void *arg);
140static void vmspace_zdtor(void *mem, int size, void *arg);
141#endif
142static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
143    vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
144    int cow);
145static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
146    vm_offset_t failed_addr);
147
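/*
 * An entry is "charged" for swap accounting if it holds a credential
 * itself, or if its backing object is charged and the entry does not
 * still need its own copy of that object.
 */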
148#define	ENTRY_CHARGED(e) ((e)->cred != NULL || \
149    ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
150     !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
151
152/*
153 * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
154 * stable.
155 */
156#define PROC_VMSPACE_LOCK(p) do { } while (0)
157#define PROC_VMSPACE_UNLOCK(p) do { } while (0)
158
159/*
160 *	VM_MAP_RANGE_CHECK:	[ internal use only ]
161 *
162 *	Asserts that the starting and ending region
163 *	addresses fall within the valid range of the map.
164 */
165#define	VM_MAP_RANGE_CHECK(map, start, end)		\
166		{					\
167		if (start < vm_map_min(map))		\
168			start = vm_map_min(map);	\
169		if (end > vm_map_max(map))		\
170			end = vm_map_max(map);		\
171		if (start > end)			\
172			start = end;			\
173		}
174
175/*
176 *	vm_map_startup:
177 *
178 *	Initialize the vm_map module.  Must be called before
179 *	any other vm_map routines.
180 *
181 *	Map and entry structures are allocated from the general
182 *	purpose memory pool with some exceptions:
183 *
184 *	- The kernel map and kmem submap are allocated statically.
185 *	- Kernel map entries are allocated out of a static pool.
186 *
187 *	These restrictions are necessary since malloc() uses the
188 *	maps and requires map entries.
189 */
190
191void
192vm_map_startup(void)
193{
194	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
195	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
196#ifdef INVARIANTS
197	    vm_map_zdtor,
198#else
199	    NULL,
200#endif
201	    vm_map_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
202	uma_prealloc(mapzone, MAX_KMAP);
203	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
204	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
205	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
206	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
207	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
208	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
209#ifdef INVARIANTS
210	    vmspace_zdtor,
211#else
212	    NULL,
213#endif
214	    vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
215}
216
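/*
 * UMA initializers for the vmspace and vm_map zones.  Both zones are
 * created with UMA_ZONE_NOFREE, so these run only when an item's backing
 * memory is first allocated: the map is zeroed and its embedded locks are
 * initialized.  Per-use setup is done later by _vm_map_init().
 */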
217static int
218vmspace_zinit(void *mem, int size, int flags)
219{
220	struct vmspace *vm;
221
222	vm = (struct vmspace *)mem;
223
224	vm->vm_map.pmap = NULL;
225	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
226	PMAP_LOCK_INIT(vmspace_pmap(vm));
227	return (0);
228}
229
230static int
231vm_map_zinit(void *mem, int size, int flags)
232{
233	vm_map_t map;
234
235	map = (vm_map_t)mem;
236	memset(map, 0, sizeof(*map));
237	mtx_init(&map->system_mtx, "vm map (system)", NULL, MTX_DEF | MTX_DUPOK);
238	sx_init(&map->lock, "vm map (user)");
239	return (0);
240}
241
242#ifdef INVARIANTS
243static void
244vmspace_zdtor(void *mem, int size, void *arg)
245{
246	struct vmspace *vm;
247
248	vm = (struct vmspace *)mem;
249
250	vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg);
251}
252static void
253vm_map_zdtor(void *mem, int size, void *arg)
254{
255	vm_map_t map;
256
257	map = (vm_map_t)mem;
258	KASSERT(map->nentries == 0,
259	    ("map %p nentries == %d on free.",
260	    map, map->nentries));
261	KASSERT(map->size == 0,
262	    ("map %p size == %lu on free.",
263	    map, (unsigned long)map->size));
264}
265#endif	/* INVARIANTS */
266
267/*
268 * Allocate a vmspace structure, including a vm_map and pmap,
269 * and initialize those structures.  The refcnt is set to 1.
270 *
271 * If 'pinit' is NULL then the embedded pmap is initialized via pmap_pinit().
272 */
273struct vmspace *
274vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
275{
276	struct vmspace *vm;
277
278	vm = uma_zalloc(vmspace_zone, M_WAITOK);
279
280	KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
281
282	if (pinit == NULL)
283		pinit = &pmap_pinit;
284
285	if (!pinit(vmspace_pmap(vm))) {
286		uma_zfree(vmspace_zone, vm);
287		return (NULL);
288	}
289	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
290	_vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
291	vm->vm_refcnt = 1;
292	vm->vm_shm = NULL;
293	vm->vm_swrss = 0;
294	vm->vm_tsize = 0;
295	vm->vm_dsize = 0;
296	vm->vm_ssize = 0;
297	vm->vm_taddr = 0;
298	vm->vm_daddr = 0;
299	vm->vm_maxsaddr = 0;
300	return (vm);
301}
302
303#ifdef RACCT
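/*
 * Reset the racct resource accounting for a process whose address space
 * has just been released.
 */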
304static void
305vmspace_container_reset(struct proc *p)
306{
307
308	PROC_LOCK(p);
309	racct_set(p, RACCT_DATA, 0);
310	racct_set(p, RACCT_STACK, 0);
311	racct_set(p, RACCT_RSS, 0);
312	racct_set(p, RACCT_MEMLOCK, 0);
313	racct_set(p, RACCT_VMEM, 0);
314	PROC_UNLOCK(p);
315}
316#endif
317
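/*
 * Release the resources held by a vmspace once its last reference has
 * been dropped: detach any remaining SysV shared memory, delete all of
 * the mappings, release the pmap, and return the structure to its zone.
 */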
318static inline void
319vmspace_dofree(struct vmspace *vm)
320{
321
322	CTR1(KTR_VM, "vmspace_free: %p", vm);
323
324	/*
325	 * Make sure any SysV shm is freed, it might not have been in
326	 * exit1().
327	 */
328	shmexit(vm);
329
330	/*
331	 * Lock the map, to wait out all other references to it.
332	 * Delete all of the mappings and pages they hold, then call
333	 * the pmap module to reclaim anything left.
334	 */
335	(void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
336	    vm->vm_map.max_offset);
337
338	pmap_release(vmspace_pmap(vm));
339	vm->vm_map.pmap = NULL;
340	uma_zfree(vmspace_zone, vm);
341}
342
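/*
 * Drop a reference on a vmspace; when the last reference is released the
 * vmspace is torn down by vmspace_dofree().  This may sleep, so the
 * caller must not hold a non-sleepable lock.
 */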
343void
344vmspace_free(struct vmspace *vm)
345{
346
347	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
348	    "vmspace_free() called with non-sleepable lock held");
349
350	if (vm->vm_refcnt == 0)
351		panic("vmspace_free: attempt to free already freed vmspace");
352
353	if (atomic_fetchadd_int(&vm->vm_refcnt, -1) == 1)
354		vmspace_dofree(vm);
355}
356
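/*
 * Release the vmspace reference remaining in p_vmspace after the process
 * has exited.  By this point vmspace_exit() has switched the process to
 * vmspace0, which the assertion below verifies.
 */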
357void
358vmspace_exitfree(struct proc *p)
359{
360	struct vmspace *vm;
361
362	PROC_VMSPACE_LOCK(p);
363	vm = p->p_vmspace;
364	p->p_vmspace = NULL;
365	PROC_VMSPACE_UNLOCK(p);
366	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
367	vmspace_free(vm);
368}
369
370void
371vmspace_exit(struct thread *td)
372{
373	int refcnt;
374	struct vmspace *vm;
375	struct proc *p;
376
377	/*
378	 * Release user portion of address space.
379	 * This releases references to vnodes,
380	 * which could cause I/O if the file has been unlinked.
381	 * Need to do this early enough that we can still sleep.
382	 *
383	 * The last exiting process to reach this point releases as
384	 * much of the environment as it can. vmspace_dofree() is the
385	 * slower fallback in case another process had a temporary
386	 * reference to the vmspace.
387	 */
388
389	p = td->td_proc;
390	vm = p->p_vmspace;
391	atomic_add_int(&vmspace0.vm_refcnt, 1);
392	do {
393		refcnt = vm->vm_refcnt;
394		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
395			/* Switch now since other proc might free vmspace */
396			PROC_VMSPACE_LOCK(p);
397			p->p_vmspace = &vmspace0;
398			PROC_VMSPACE_UNLOCK(p);
399			pmap_activate(td);
400		}
401	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
402	if (refcnt == 1) {
403		if (p->p_vmspace != vm) {
404			/* vmspace not yet freed, switch back */
405			PROC_VMSPACE_LOCK(p);
406			p->p_vmspace = vm;
407			PROC_VMSPACE_UNLOCK(p);
408			pmap_activate(td);
409		}
410		pmap_remove_pages(vmspace_pmap(vm));
411		/* Switch now since this proc will free vmspace */
412		PROC_VMSPACE_LOCK(p);
413		p->p_vmspace = &vmspace0;
414		PROC_VMSPACE_UNLOCK(p);
415		pmap_activate(td);
416		vmspace_dofree(vm);
417	}
418#ifdef RACCT
419	if (racct_enable)
420		vmspace_container_reset(p);
421#endif
422}
423
424/* Acquire reference to vmspace owned by another process. */
425
426struct vmspace *
427vmspace_acquire_ref(struct proc *p)
428{
429	struct vmspace *vm;
430	int refcnt;
431
432	PROC_VMSPACE_LOCK(p);
433	vm = p->p_vmspace;
434	if (vm == NULL) {
435		PROC_VMSPACE_UNLOCK(p);
436		return (NULL);
437	}
438	do {
439		refcnt = vm->vm_refcnt;
440		if (refcnt <= 0) { 	/* Avoid 0->1 transition */
441			PROC_VMSPACE_UNLOCK(p);
442			return (NULL);
443		}
444	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1));
445	if (vm != p->p_vmspace) {
446		PROC_VMSPACE_UNLOCK(p);
447		vmspace_free(vm);
448		return (NULL);
449	}
450	PROC_VMSPACE_UNLOCK(p);
451	return (vm);
452}
453
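/*
 * Map locking primitives.  A system map is protected by a mutex, while a
 * user map is protected by an sx lock.  Acquiring the lock exclusively
 * advances the map's timestamp, which callers use to detect that the map
 * changed while it was unlocked.
 */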
454void
455_vm_map_lock(vm_map_t map, const char *file, int line)
456{
457
458	if (map->system_map)
459		mtx_lock_flags_(&map->system_mtx, 0, file, line);
460	else
461		sx_xlock_(&map->lock, file, line);
462	map->timestamp++;
463}
464
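/*
 * Free the map entries queued on the current thread for deferred
 * deallocation.  Entries carrying a vnode write count
 * (MAP_ENTRY_VN_WRITECNT) release that reference first.  This runs only
 * after the map lock has been dropped; see _vm_map_unlock() and
 * _vm_map_unlock_read().
 */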
465static void
466vm_map_process_deferred(void)
467{
468	struct thread *td;
469	vm_map_entry_t entry, next;
470	vm_object_t object;
471
472	td = curthread;
473	entry = td->td_map_def_user;
474	td->td_map_def_user = NULL;
475	while (entry != NULL) {
476		next = entry->next;
477		if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
478			/*
479			 * Decrement the object's writemappings and
480			 * possibly the vnode's v_writecount.
481			 */
482			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
483			    ("Submap with writecount"));
484			object = entry->object.vm_object;
485			KASSERT(object != NULL, ("No object for writecount"));
486			vnode_pager_release_writecount(object, entry->start,
487			    entry->end);
488		}
489		vm_map_entry_deallocate(entry, FALSE);
490		entry = next;
491	}
492}
493
494void
495_vm_map_unlock(vm_map_t map, const char *file, int line)
496{
497
498	if (map->system_map)
499		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
500	else {
501		sx_xunlock_(&map->lock, file, line);
502		vm_map_process_deferred();
503	}
504}
505
506void
507_vm_map_lock_read(vm_map_t map, const char *file, int line)
508{
509
510	if (map->system_map)
511		mtx_lock_flags_(&map->system_mtx, 0, file, line);
512	else
513		sx_slock_(&map->lock, file, line);
514}
515
516void
517_vm_map_unlock_read(vm_map_t map, const char *file, int line)
518{
519
520	if (map->system_map)
521		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
522	else {
523		sx_sunlock_(&map->lock, file, line);
524		vm_map_process_deferred();
525	}
526}
527
528int
529_vm_map_trylock(vm_map_t map, const char *file, int line)
530{
531	int error;
532
533	error = map->system_map ?
534	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
535	    !sx_try_xlock_(&map->lock, file, line);
536	if (error == 0)
537		map->timestamp++;
538	return (error == 0);
539}
540
541int
542_vm_map_trylock_read(vm_map_t map, const char *file, int line)
543{
544	int error;
545
546	error = map->system_map ?
547	    !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
548	    !sx_try_slock_(&map->lock, file, line);
549	return (error == 0);
550}
551
552/*
553 *	_vm_map_lock_upgrade:	[ internal use only ]
554 *
555 *	Tries to upgrade a read (shared) lock on the specified map to a write
556 *	(exclusive) lock.  Returns the value "0" if the upgrade succeeds and a
557 *	non-zero value if the upgrade fails.  If the upgrade fails, the map is
558 *	returned without a read or write lock held.
559 *
560 *	Requires that the map be read locked.
561 */
562int
563_vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
564{
565	unsigned int last_timestamp;
566
567	if (map->system_map) {
568		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
569	} else {
570		if (!sx_try_upgrade_(&map->lock, file, line)) {
571			last_timestamp = map->timestamp;
572			sx_sunlock_(&map->lock, file, line);
573			vm_map_process_deferred();
574			/*
575			 * If the map's timestamp does not change while the
576			 * map is unlocked, then the upgrade succeeds.
577			 */
578			sx_xlock_(&map->lock, file, line);
579			if (last_timestamp != map->timestamp) {
580				sx_xunlock_(&map->lock, file, line);
581				return (1);
582			}
583		}
584	}
585	map->timestamp++;
586	return (0);
587}
588
589void
590_vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
591{
592
593	if (map->system_map) {
594		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
595	} else
596		sx_downgrade_(&map->lock, file, line);
597}
598
599/*
600 *	vm_map_locked:
601 *
602 *	Returns a non-zero value if the caller holds a write (exclusive) lock
603 *	on the specified map and the value "0" otherwise.
604 */
605int
606vm_map_locked(vm_map_t map)
607{
608
609	if (map->system_map)
610		return (mtx_owned(&map->system_mtx));
611	else
612		return (sx_xlocked(&map->lock));
613}
614
615#ifdef INVARIANTS
616static void
617_vm_map_assert_locked(vm_map_t map, const char *file, int line)
618{
619
620	if (map->system_map)
621		mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
622	else
623		sx_assert_(&map->lock, SA_XLOCKED, file, line);
624}
625
626#define	VM_MAP_ASSERT_LOCKED(map) \
627    _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
628#else
629#define	VM_MAP_ASSERT_LOCKED(map)
630#endif
631
632/*
633 *	_vm_map_unlock_and_wait:
634 *
635 *	Atomically releases the lock on the specified map and puts the calling
636 *	thread to sleep.  The calling thread will remain asleep until either
637 *	vm_map_wakeup() is performed on the map or the specified timeout is
638 *	exceeded.
639 *
640 *	WARNING!  This function does not perform deferred deallocations of
641 *	objects and map	entries.  Therefore, the calling thread is expected to
642 *	reacquire the map lock after reawakening and later perform an ordinary
643 *	unlock operation, such as vm_map_unlock(), before completing its
644 *	operation on the map.
645 */
646int
647_vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
648{
649
650	mtx_lock(&map_sleep_mtx);
651	if (map->system_map)
652		mtx_unlock_flags_(&map->system_mtx, 0, file, line);
653	else
654		sx_xunlock_(&map->lock, file, line);
655	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
656	    timo));
657}
658
659/*
660 *	vm_map_wakeup:
661 *
662 *	Awaken any threads that have slept on the map using
663 *	vm_map_unlock_and_wait().
664 */
665void
666vm_map_wakeup(vm_map_t map)
667{
668
669	/*
670	 * Acquire and release map_sleep_mtx to prevent a wakeup()
671	 * from being performed (and lost) between the map unlock
672	 * and the msleep() in _vm_map_unlock_and_wait().
673	 */
674	mtx_lock(&map_sleep_mtx);
675	mtx_unlock(&map_sleep_mtx);
676	wakeup(&map->root);
677}
678
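/*
 * Map busy accounting.  vm_map_busy() marks the map busy and
 * vm_map_unbusy() clears that state, waking any thread sleeping in
 * vm_map_wait_busy().  vm_map_wait_busy() sleeps, releasing the map lock
 * while asleep, until the busy count drains, and then increments the map
 * timestamp.
 */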
679void
680vm_map_busy(vm_map_t map)
681{
682
683	VM_MAP_ASSERT_LOCKED(map);
684	map->busy++;
685}
686
687void
688vm_map_unbusy(vm_map_t map)
689{
690
691	VM_MAP_ASSERT_LOCKED(map);
692	KASSERT(map->busy, ("vm_map_unbusy: not busy"));
693	if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
694		vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
695		wakeup(&map->busy);
696	}
697}
698
699void
700vm_map_wait_busy(vm_map_t map)
701{
702
703	VM_MAP_ASSERT_LOCKED(map);
704	while (map->busy) {
705		vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
706		if (map->system_map)
707			msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
708		else
709			sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
710	}
711	map->timestamp++;
712}
713
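/*
 * Return the number of resident pages in the vmspace, as accounted by
 * its pmap.
 */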
714long
715vmspace_resident_count(struct vmspace *vmspace)
716{
	return (pmap_resident_count(vmspace_pmap(vmspace)));
718}
719
720/*
721 *	vm_map_create:
722 *
723 *	Creates and returns a new empty VM map with
724 *	the given physical map structure, and having
725 *	the given lower and upper address bounds.
726 */
727vm_map_t
728vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
729{
730	vm_map_t result;
731
732	result = uma_zalloc(mapzone, M_WAITOK);
733	CTR1(KTR_VM, "vm_map_create: %p", result);
734	_vm_map_init(result, pmap, min, max);
735	return (result);
736}
737
738/*
739 * Initialize an existing vm_map structure
740 * such as that in the vmspace structure.
741 */
742static void
743_vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
744{
745
746	map->header.next = map->header.prev = &map->header;
747	map->needs_wakeup = FALSE;
748	map->system_map = 0;
749	map->pmap = pmap;
750	map->min_offset = min;
751	map->max_offset = max;
752	map->flags = 0;
753	map->root = NULL;
754	map->timestamp = 0;
755	map->busy = 0;
756}
757
758void
759vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
760{
761
762	_vm_map_init(map, pmap, min, max);
763	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
764	sx_init(&map->lock, "user map");
765}
766
767/*
768 *	vm_map_entry_dispose:	[ internal use only ]
769 *
770 *	Inverse of vm_map_entry_create.
771 */
772static void
773vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
774{
775	uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
776}
777
778/*
779 *	vm_map_entry_create:	[ internal use only ]
780 *
781 *	Allocates a VM map entry for insertion.
782 *	No entry fields are filled in.
783 */
784static vm_map_entry_t
785vm_map_entry_create(vm_map_t map)
786{
787	vm_map_entry_t new_entry;
788
789	if (map->system_map)
790		new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
791	else
792		new_entry = uma_zalloc(mapentzone, M_WAITOK);
793	if (new_entry == NULL)
794		panic("vm_map_entry_create: kernel resources exhausted");
795	return (new_entry);
796}
797
798/*
799 *	vm_map_entry_set_behavior:
800 *
801 *	Set the expected access behavior, either normal, random, or
802 *	sequential.
803 */
804static inline void
805vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
806{
807	entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
808	    (behavior & MAP_ENTRY_BEHAV_MASK);
809}
810
811/*
812 *	vm_map_entry_set_max_free:
813 *
814 *	Set the max_free field in a vm_map_entry.
815 */
816static inline void
817vm_map_entry_set_max_free(vm_map_entry_t entry)
818{
819
820	entry->max_free = entry->adj_free;
821	if (entry->left != NULL && entry->left->max_free > entry->max_free)
822		entry->max_free = entry->left->max_free;
823	if (entry->right != NULL && entry->right->max_free > entry->max_free)
824		entry->max_free = entry->right->max_free;
825}
826
827/*
828 *	vm_map_entry_splay:
829 *
830 *	The Sleator and Tarjan top-down splay algorithm with the
831 *	following variation.  Max_free must be computed bottom-up, so
832 *	on the downward pass, maintain the left and right spines in
833 *	reverse order.  Then, make a second pass up each side to fix
834 *	the pointers and compute max_free.  The time bound is O(log n)
835 *	amortized.
836 *
837 *	The new root is the vm_map_entry containing "addr", or else an
838 *	adjacent entry (lower or higher) if addr is not in the tree.
839 *
840 *	The map must be locked, and leaves it so.
841 *
842 *	Returns: the new root.
843 */
844static vm_map_entry_t
845vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
846{
847	vm_map_entry_t llist, rlist;
848	vm_map_entry_t ltree, rtree;
849	vm_map_entry_t y;
850
851	/* Special case of empty tree. */
852	if (root == NULL)
853		return (root);
854
855	/*
856	 * Pass One: Splay down the tree until we find addr or a NULL
857	 * pointer where addr would go.  llist and rlist are the two
858	 * sides in reverse order (bottom-up), with llist linked by
859	 * the right pointer and rlist linked by the left pointer in
860	 * the vm_map_entry.  Wait until Pass Two to set max_free on
861	 * the two spines.
862	 */
863	llist = NULL;
864	rlist = NULL;
865	for (;;) {
866		/* root is never NULL in here. */
867		if (addr < root->start) {
868			y = root->left;
869			if (y == NULL)
870				break;
871			if (addr < y->start && y->left != NULL) {
872				/* Rotate right and put y on rlist. */
873				root->left = y->right;
874				y->right = root;
875				vm_map_entry_set_max_free(root);
876				root = y->left;
877				y->left = rlist;
878				rlist = y;
879			} else {
880				/* Put root on rlist. */
881				root->left = rlist;
882				rlist = root;
883				root = y;
884			}
885		} else if (addr >= root->end) {
886			y = root->right;
887			if (y == NULL)
888				break;
889			if (addr >= y->end && y->right != NULL) {
890				/* Rotate left and put y on llist. */
891				root->right = y->left;
892				y->left = root;
893				vm_map_entry_set_max_free(root);
894				root = y->right;
895				y->right = llist;
896				llist = y;
897			} else {
898				/* Put root on llist. */
899				root->right = llist;
900				llist = root;
901				root = y;
902			}
903		} else
904			break;
905	}
906
907	/*
908	 * Pass Two: Walk back up the two spines, flip the pointers
909	 * and set max_free.  The subtrees of the root go at the
910	 * bottom of llist and rlist.
911	 */
912	ltree = root->left;
913	while (llist != NULL) {
914		y = llist->right;
915		llist->right = ltree;
916		vm_map_entry_set_max_free(llist);
917		ltree = llist;
918		llist = y;
919	}
920	rtree = root->right;
921	while (rlist != NULL) {
922		y = rlist->left;
923		rlist->left = rtree;
924		vm_map_entry_set_max_free(rlist);
925		rtree = rlist;
926		rlist = y;
927	}
928
929	/*
930	 * Final assembly: add ltree and rtree as subtrees of root.
931	 */
932	root->left = ltree;
933	root->right = rtree;
934	vm_map_entry_set_max_free(root);
935
936	return (root);
937}
938
939/*
940 *	vm_map_entry_{un,}link:
941 *
942 *	Insert/remove entries from maps.
943 */
944static void
945vm_map_entry_link(vm_map_t map,
946		  vm_map_entry_t after_where,
947		  vm_map_entry_t entry)
948{
949
950	CTR4(KTR_VM,
951	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
952	    map->nentries, entry, after_where);
953	VM_MAP_ASSERT_LOCKED(map);
954	KASSERT(after_where == &map->header ||
955	    after_where->end <= entry->start,
956	    ("vm_map_entry_link: prev end %jx new start %jx overlap",
957	    (uintmax_t)after_where->end, (uintmax_t)entry->start));
958	KASSERT(after_where->next == &map->header ||
959	    entry->end <= after_where->next->start,
960	    ("vm_map_entry_link: new end %jx next start %jx overlap",
961	    (uintmax_t)entry->end, (uintmax_t)after_where->next->start));
962
963	map->nentries++;
964	entry->prev = after_where;
965	entry->next = after_where->next;
966	entry->next->prev = entry;
967	after_where->next = entry;
968
969	if (after_where != &map->header) {
970		if (after_where != map->root)
971			vm_map_entry_splay(after_where->start, map->root);
972		entry->right = after_where->right;
973		entry->left = after_where;
974		after_where->right = NULL;
975		after_where->adj_free = entry->start - after_where->end;
976		vm_map_entry_set_max_free(after_where);
977	} else {
978		entry->right = map->root;
979		entry->left = NULL;
980	}
981	entry->adj_free = (entry->next == &map->header ? map->max_offset :
982	    entry->next->start) - entry->end;
983	vm_map_entry_set_max_free(entry);
984	map->root = entry;
985}
986
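/*
 * Remove the entry from both the ordered list and the splay tree.  After
 * splaying the entry to the root, its in-order predecessor (if the entry
 * has a left subtree) becomes the new root, the two subtrees are joined
 * beneath it, and its adj_free and max_free are updated to cover the
 * vacated range.
 */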
987static void
988vm_map_entry_unlink(vm_map_t map,
989		    vm_map_entry_t entry)
990{
991	vm_map_entry_t next, prev, root;
992
993	VM_MAP_ASSERT_LOCKED(map);
994	if (entry != map->root)
995		vm_map_entry_splay(entry->start, map->root);
996	if (entry->left == NULL)
997		root = entry->right;
998	else {
999		root = vm_map_entry_splay(entry->start, entry->left);
1000		root->right = entry->right;
1001		root->adj_free = (entry->next == &map->header ? map->max_offset :
1002		    entry->next->start) - root->end;
1003		vm_map_entry_set_max_free(root);
1004	}
1005	map->root = root;
1006
1007	prev = entry->prev;
1008	next = entry->next;
1009	next->prev = prev;
1010	prev->next = next;
1011	map->nentries--;
1012	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1013	    map->nentries, entry);
1014}
1015
1016/*
1017 *	vm_map_entry_resize_free:
1018 *
1019 *	Recompute the amount of free space following a vm_map_entry
1020 *	and propagate that value up the tree.  Call this function after
1021 *	resizing a map entry in-place, that is, without a call to
1022 *	vm_map_entry_link() or _unlink().
1023 *
1024 *	The map must be locked, and leaves it so.
1025 */
1026static void
1027vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
1028{
1029
1030	/*
1031	 * Using splay trees without parent pointers, propagating
1032	 * max_free up the tree is done by moving the entry to the
1033	 * root and making the change there.
1034	 */
1035	if (entry != map->root)
1036		map->root = vm_map_entry_splay(entry->start, map->root);
1037
1038	entry->adj_free = (entry->next == &map->header ? map->max_offset :
1039	    entry->next->start) - entry->end;
1040	vm_map_entry_set_max_free(entry);
1041}
1042
1043/*
1044 *	vm_map_lookup_entry:	[ internal use only ]
1045 *
1046 *	Finds the map entry containing (or
1047 *	immediately preceding) the specified address
1048 *	in the given map; the entry is returned
1049 *	in the "entry" parameter.  The boolean
1050 *	result indicates whether the address is
1051 *	actually contained in the map.
1052 */
1053boolean_t
1054vm_map_lookup_entry(
1055	vm_map_t map,
1056	vm_offset_t address,
1057	vm_map_entry_t *entry)	/* OUT */
1058{
1059	vm_map_entry_t cur;
1060	boolean_t locked;
1061
1062	/*
1063	 * If the map is empty, then the map entry immediately preceding
1064	 * "address" is the map's header.
1065	 */
1066	cur = map->root;
1067	if (cur == NULL)
1068		*entry = &map->header;
1069	else if (address >= cur->start && cur->end > address) {
1070		*entry = cur;
1071		return (TRUE);
1072	} else if ((locked = vm_map_locked(map)) ||
1073	    sx_try_upgrade(&map->lock)) {
1074		/*
1075		 * Splay requires a write lock on the map.  However, it only
1076		 * restructures the binary search tree; it does not otherwise
1077		 * change the map.  Thus, the map's timestamp need not change
1078		 * on a temporary upgrade.
1079		 */
1080		map->root = cur = vm_map_entry_splay(address, cur);
1081		if (!locked)
1082			sx_downgrade(&map->lock);
1083
1084		/*
1085		 * If "address" is contained within a map entry, the new root
1086		 * is that map entry.  Otherwise, the new root is a map entry
1087		 * immediately before or after "address".
1088		 */
1089		if (address >= cur->start) {
1090			*entry = cur;
1091			if (cur->end > address)
1092				return (TRUE);
1093		} else
1094			*entry = cur->prev;
1095	} else
1096		/*
1097		 * Since the map is only locked for read access, perform a
1098		 * standard binary search tree lookup for "address".
1099		 */
1100		for (;;) {
1101			if (address < cur->start) {
1102				if (cur->left == NULL) {
1103					*entry = cur->prev;
1104					break;
1105				}
1106				cur = cur->left;
1107			} else if (cur->end > address) {
1108				*entry = cur;
1109				return (TRUE);
1110			} else {
1111				if (cur->right == NULL) {
1112					*entry = cur;
1113					break;
1114				}
1115				cur = cur->right;
1116			}
1117		}
1118	return (FALSE);
1119}
1120
1121/*
1122 *	vm_map_insert:
1123 *
1124 *	Inserts the given whole VM object into the target
1125 *	map at the specified address range.  The object's
1126 *	size should match that of the address range.
1127 *
1128 *	Requires that the map be locked, and leaves it so.
1129 *
1130 *	If object is non-NULL, ref count must be bumped by caller
1131 *	prior to making call to account for the new entry.
1132 */
1133int
1134vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1135    vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1136{
1137	vm_map_entry_t new_entry, prev_entry, temp_entry;
1138	struct ucred *cred;
1139	vm_eflags_t protoeflags;
1140	vm_inherit_t inheritance;
1141
1142	VM_MAP_ASSERT_LOCKED(map);
1143	KASSERT((object != kmem_object && object != kernel_object) ||
1144	    (cow & MAP_COPY_ON_WRITE) == 0,
1145	    ("vm_map_insert: kmem or kernel object and COW"));
1146	KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0,
1147	    ("vm_map_insert: paradoxical MAP_NOFAULT request"));
1148	KASSERT((prot & ~max) == 0,
1149	    ("prot %#x is not subset of max_prot %#x", prot, max));
1150
1151	/*
1152	 * Check that the start and end points are not bogus.
1153	 */
1154	if (start < map->min_offset || end > map->max_offset || start >= end)
1155		return (KERN_INVALID_ADDRESS);
1156
1157	/*
1158	 * Find the entry prior to the proposed starting address; if it's part
1159	 * of an existing entry, this range is bogus.
1160	 */
1161	if (vm_map_lookup_entry(map, start, &temp_entry))
1162		return (KERN_NO_SPACE);
1163
1164	prev_entry = temp_entry;
1165
1166	/*
1167	 * Assert that the next entry doesn't overlap the end point.
1168	 */
1169	if (prev_entry->next != &map->header && prev_entry->next->start < end)
1170		return (KERN_NO_SPACE);
1171
1172	if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1173	    max != VM_PROT_NONE))
1174		return (KERN_INVALID_ARGUMENT);
1175
1176	protoeflags = 0;
1177	if (cow & MAP_COPY_ON_WRITE)
1178		protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1179	if (cow & MAP_NOFAULT)
1180		protoeflags |= MAP_ENTRY_NOFAULT;
1181	if (cow & MAP_DISABLE_SYNCER)
1182		protoeflags |= MAP_ENTRY_NOSYNC;
1183	if (cow & MAP_DISABLE_COREDUMP)
1184		protoeflags |= MAP_ENTRY_NOCOREDUMP;
1185	if (cow & MAP_STACK_GROWS_DOWN)
1186		protoeflags |= MAP_ENTRY_GROWS_DOWN;
1187	if (cow & MAP_STACK_GROWS_UP)
1188		protoeflags |= MAP_ENTRY_GROWS_UP;
1189	if (cow & MAP_VN_WRITECOUNT)
1190		protoeflags |= MAP_ENTRY_VN_WRITECNT;
1191	if ((cow & MAP_CREATE_GUARD) != 0)
1192		protoeflags |= MAP_ENTRY_GUARD;
1193	if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
1194		protoeflags |= MAP_ENTRY_STACK_GAP_DN;
1195	if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
1196		protoeflags |= MAP_ENTRY_STACK_GAP_UP;
1197	if (cow & MAP_INHERIT_SHARE)
1198		inheritance = VM_INHERIT_SHARE;
1199	else
1200		inheritance = VM_INHERIT_DEFAULT;
1201
1202	cred = NULL;
1203	if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1204		goto charged;
1205	if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1206	    ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1207		if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1208			return (KERN_RESOURCE_SHORTAGE);
1209		KASSERT(object == NULL ||
1210		    (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1211		    object->cred == NULL,
1212		    ("overcommit: vm_map_insert o %p", object));
1213		cred = curthread->td_ucred;
1214	}
1215
1216charged:
1217	/* Expand the kernel pmap, if necessary. */
1218	if (map == kernel_map && end > kernel_vm_end)
1219		pmap_growkernel(end);
1220	if (object != NULL) {
1221		/*
1222		 * OBJ_ONEMAPPING must be cleared unless this mapping
1223		 * is trivially proven to be the only mapping for any
1224		 * of the object's pages.  (Object granularity
1225		 * reference counting is insufficient to recognize
1226		 * aliases with precision.)
1227		 */
1228		VM_OBJECT_WLOCK(object);
1229		if (object->ref_count > 1 || object->shadow_count != 0)
1230			vm_object_clear_flag(object, OBJ_ONEMAPPING);
1231		VM_OBJECT_WUNLOCK(object);
1232	} else if (prev_entry != &map->header &&
1233	    prev_entry->eflags == protoeflags &&
1234	    (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 &&
1235	    prev_entry->end == start && prev_entry->wired_count == 0 &&
1236	    (prev_entry->cred == cred ||
1237	    (prev_entry->object.vm_object != NULL &&
1238	    prev_entry->object.vm_object->cred == cred)) &&
1239	    vm_object_coalesce(prev_entry->object.vm_object,
1240	    prev_entry->offset,
1241	    (vm_size_t)(prev_entry->end - prev_entry->start),
1242	    (vm_size_t)(end - prev_entry->end), cred != NULL &&
1243	    (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1244		/*
1245		 * We were able to extend the object.  Determine if we
1246		 * can extend the previous map entry to include the
1247		 * new range as well.
1248		 */
1249		if (prev_entry->inheritance == inheritance &&
1250		    prev_entry->protection == prot &&
1251		    prev_entry->max_protection == max) {
1252			if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1253				map->size += end - prev_entry->end;
1254			prev_entry->end = end;
1255			vm_map_entry_resize_free(map, prev_entry);
1256			vm_map_simplify_entry(map, prev_entry);
1257			return (KERN_SUCCESS);
1258		}
1259
1260		/*
1261		 * If we can extend the object but cannot extend the
1262		 * map entry, we have to create a new map entry.  We
1263		 * must bump the ref count on the extended object to
1264		 * account for it.  object may be NULL.
1265		 */
1266		object = prev_entry->object.vm_object;
1267		offset = prev_entry->offset +
1268		    (prev_entry->end - prev_entry->start);
1269		vm_object_reference(object);
1270		if (cred != NULL && object != NULL && object->cred != NULL &&
1271		    !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1272			/* Object already accounts for this uid. */
1273			cred = NULL;
1274		}
1275	}
1276	if (cred != NULL)
1277		crhold(cred);
1278
1279	/*
1280	 * Create a new entry
1281	 */
1282	new_entry = vm_map_entry_create(map);
1283	new_entry->start = start;
1284	new_entry->end = end;
1285	new_entry->cred = NULL;
1286
1287	new_entry->eflags = protoeflags;
1288	new_entry->object.vm_object = object;
1289	new_entry->offset = offset;
1290
1291	new_entry->inheritance = inheritance;
1292	new_entry->protection = prot;
1293	new_entry->max_protection = max;
1294	new_entry->wired_count = 0;
1295	new_entry->wiring_thread = NULL;
1296	new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1297	new_entry->next_read = OFF_TO_IDX(offset);
1298
1299	KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1300	    ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1301	new_entry->cred = cred;
1302
1303	/*
1304	 * Insert the new entry into the list
1305	 */
1306	vm_map_entry_link(map, prev_entry, new_entry);
1307	if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1308		map->size += new_entry->end - new_entry->start;
1309
1310	/*
1311	 * Try to coalesce the new entry with both the previous and next
1312	 * entries in the list.  Previously, we only attempted to coalesce
1313	 * with the previous entry when object is NULL.  Here, we handle the
1314	 * other cases, which are less common.
1315	 */
1316	vm_map_simplify_entry(map, new_entry);
1317
1318	if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1319		vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1320		    end - start, cow & MAP_PREFAULT_PARTIAL);
1321	}
1322
1323	return (KERN_SUCCESS);
1324}
1325
1326/*
1327 *	vm_map_findspace:
1328 *
1329 *	Find the first fit (lowest VM address) for "length" free bytes
1330 *	beginning at address >= start in the given map.
1331 *
1332 *	In a vm_map_entry, "adj_free" is the amount of free space
1333 *	adjacent (higher address) to this entry, and "max_free" is the
1334 *	maximum amount of contiguous free space in its subtree.  This
1335 *	allows finding a free region in one path down the tree, so
1336 *	O(log n) amortized with splay trees.
1337 *
1338 *	The map must be locked, and leaves it so.
1339 *
1340 *	Returns: 0 on success, and starting address in *addr,
1341 *		 1 if insufficient space.
1342 */
1343int
1344vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
1345    vm_offset_t *addr)	/* OUT */
1346{
1347	vm_map_entry_t entry;
1348	vm_offset_t st;
1349
1350	/*
1351	 * Request must fit within min/max VM address and must avoid
1352	 * address wrap.
1353	 */
1354	if (start < map->min_offset)
1355		start = map->min_offset;
1356	if (start + length > map->max_offset || start + length < start)
1357		return (1);
1358
1359	/* Empty tree means wide open address space. */
1360	if (map->root == NULL) {
1361		*addr = start;
1362		return (0);
1363	}
1364
1365	/*
1366	 * After splay, if start comes before root node, then there
1367	 * must be a gap from start to the root.
1368	 */
1369	map->root = vm_map_entry_splay(start, map->root);
1370	if (start + length <= map->root->start) {
1371		*addr = start;
1372		return (0);
1373	}
1374
1375	/*
1376	 * Root is the last node that might begin its gap before
1377	 * start, and this is the last comparison where address
1378	 * wrap might be a problem.
1379	 */
1380	st = (start > map->root->end) ? start : map->root->end;
1381	if (length <= map->root->end + map->root->adj_free - st) {
1382		*addr = st;
1383		return (0);
1384	}
1385
1386	/* With max_free, can immediately tell if no solution. */
1387	entry = map->root->right;
1388	if (entry == NULL || length > entry->max_free)
1389		return (1);
1390
1391	/*
1392	 * Search the right subtree in the order: left subtree, root,
1393	 * right subtree (first fit).  The previous splay implies that
1394	 * all regions in the right subtree have addresses > start.
1395	 */
1396	while (entry != NULL) {
1397		if (entry->left != NULL && entry->left->max_free >= length)
1398			entry = entry->left;
1399		else if (entry->adj_free >= length) {
1400			*addr = entry->end;
1401			return (0);
1402		} else
1403			entry = entry->right;
1404	}
1405
1406	/* Can't get here, so panic if we do. */
1407	panic("vm_map_findspace: max_free corrupt");
1408}
1409
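/*
 *	vm_map_fixed:
 *
 *	Map an object (or an anonymous or stack region) at the fixed address
 *	range [start, start + length).  Unless MAP_CHECK_EXCL is specified,
 *	any existing mappings within that range are removed first.
 */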
1410int
1411vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1412    vm_offset_t start, vm_size_t length, vm_prot_t prot,
1413    vm_prot_t max, int cow)
1414{
1415	vm_offset_t end;
1416	int result;
1417
1418	end = start + length;
1419	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1420	    object == NULL,
1421	    ("vm_map_fixed: non-NULL backing object for stack"));
1422	vm_map_lock(map);
1423	VM_MAP_RANGE_CHECK(map, start, end);
1424	if ((cow & MAP_CHECK_EXCL) == 0)
1425		vm_map_delete(map, start, end);
1426	if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1427		result = vm_map_stack_locked(map, start, length, sgrowsiz,
1428		    prot, max, cow);
1429	} else {
1430		result = vm_map_insert(map, object, offset, start, end,
1431		    prot, max, cow);
1432	}
1433	vm_map_unlock(map);
1434	return (result);
1435}
1436
1437/*
1438 *	vm_map_find finds an unallocated region in the target address
1439 *	map with the given length.  The search is defined to be
1440 *	first-fit from the specified address; the region found is
1441 *	returned in the same parameter.
1442 *
1443 *	If object is non-NULL, ref count must be bumped by caller
1444 *	prior to making call to account for the new entry.
1445 */
1446int
1447vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1448	    vm_offset_t *addr,	/* IN/OUT */
1449	    vm_size_t length, vm_offset_t max_addr, int find_space,
1450	    vm_prot_t prot, vm_prot_t max, int cow)
1451{
1452	vm_offset_t alignment, initial_addr, start;
1453	int result;
1454
1455	KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1456	    object == NULL,
1457	    ("vm_map_find: non-NULL backing object for stack"));
1458	if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
1459	    (object->flags & OBJ_COLORED) == 0))
1460		find_space = VMFS_ANY_SPACE;
1461	if (find_space >> 8 != 0) {
1462		KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
1463		alignment = (vm_offset_t)1 << (find_space >> 8);
1464	} else
1465		alignment = 0;
1466	initial_addr = *addr;
1467again:
1468	start = initial_addr;
1469	vm_map_lock(map);
1470	do {
1471		if (find_space != VMFS_NO_SPACE) {
1472			if (vm_map_findspace(map, start, length, addr) ||
1473			    (max_addr != 0 && *addr + length > max_addr)) {
1474				vm_map_unlock(map);
1475				if (find_space == VMFS_OPTIMAL_SPACE) {
1476					find_space = VMFS_ANY_SPACE;
1477					goto again;
1478				}
1479				return (KERN_NO_SPACE);
1480			}
1481			switch (find_space) {
1482			case VMFS_SUPER_SPACE:
1483			case VMFS_OPTIMAL_SPACE:
1484				pmap_align_superpage(object, offset, addr,
1485				    length);
1486				break;
1487			case VMFS_ANY_SPACE:
1488				break;
1489			default:
1490				if ((*addr & (alignment - 1)) != 0) {
1491					*addr &= ~(alignment - 1);
1492					*addr += alignment;
1493				}
1494				break;
1495			}
1496
1497			start = *addr;
1498		}
1499		if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1500			result = vm_map_stack_locked(map, start, length,
1501			    sgrowsiz, prot, max, cow);
1502		} else {
1503			result = vm_map_insert(map, object, offset, start,
1504			    start + length, prot, max, cow);
1505		}
1506	} while (result == KERN_NO_SPACE && find_space != VMFS_NO_SPACE &&
1507	    find_space != VMFS_ANY_SPACE);
1508	vm_map_unlock(map);
1509	return (result);
1510}
1511
1512/*
1513 *	vm_map_find_min() is a variant of vm_map_find() that takes an
1514 *	additional parameter (min_addr) and treats the given address
1515 *	(*addr) differently.  Specifically, it treats *addr as a hint
1516 *	and not as the minimum address where the mapping is created.
1517 *
1518 *	This function works in two phases.  First, it tries to
1519 *	allocate above the hint.  If that fails and the hint is
1520 *	greater than min_addr, it performs a second pass, replacing
1521 *	the hint with min_addr as the minimum address for the
1522 *	allocation.
1523 */
1524int
1525vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1526    vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
1527    vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
1528    int cow)
1529{
1530	vm_offset_t hint;
1531	int rv;
1532
1533	hint = *addr;
1534	for (;;) {
1535		rv = vm_map_find(map, object, offset, addr, length, max_addr,
1536		    find_space, prot, max, cow);
1537		if (rv == KERN_SUCCESS || min_addr >= hint)
1538			return (rv);
1539		*addr = hint = min_addr;
1540	}
1541}
1542
1543/*
1544 *	vm_map_simplify_entry:
1545 *
1546 *	Simplify the given map entry by merging with either neighbor.  This
1547 *	routine also has the ability to merge with both neighbors.
1548 *
1549 *	The map must be locked.
1550 *
 *	This routine guarantees that the passed entry remains valid (though
1552 *	possibly extended).  When merging, this routine may delete one or
1553 *	both neighbors.
1554 */
1555void
1556vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
1557{
1558	vm_map_entry_t next, prev;
1559	vm_size_t prevsize, esize;
1560
1561	if ((entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP |
1562	    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0)
1563		return;
1564
1565	prev = entry->prev;
1566	if (prev != &map->header) {
1567		prevsize = prev->end - prev->start;
1568		if ( (prev->end == entry->start) &&
1569		     (prev->object.vm_object == entry->object.vm_object) &&
1570		     (!prev->object.vm_object ||
1571			(prev->offset + prevsize == entry->offset)) &&
1572		     (prev->eflags == entry->eflags) &&
1573		     (prev->protection == entry->protection) &&
1574		     (prev->max_protection == entry->max_protection) &&
1575		     (prev->inheritance == entry->inheritance) &&
1576		     (prev->wired_count == entry->wired_count) &&
1577		     (prev->cred == entry->cred)) {
1578			vm_map_entry_unlink(map, prev);
1579			entry->start = prev->start;
1580			entry->offset = prev->offset;
1581			if (entry->prev != &map->header)
1582				vm_map_entry_resize_free(map, entry->prev);
1583
1584			/*
1585			 * If the backing object is a vnode object,
1586			 * vm_object_deallocate() calls vrele().
1587			 * However, vrele() does not lock the vnode
1588			 * because the vnode has additional
1589			 * references.  Thus, the map lock can be kept
1590			 * without causing a lock-order reversal with
1591			 * the vnode lock.
1592			 *
1593			 * Since we count the number of virtual page
1594			 * mappings in object->un_pager.vnp.writemappings,
1595			 * the writemappings value should not be adjusted
1596			 * when the entry is disposed of.
1597			 */
1598			if (prev->object.vm_object)
1599				vm_object_deallocate(prev->object.vm_object);
1600			if (prev->cred != NULL)
1601				crfree(prev->cred);
1602			vm_map_entry_dispose(map, prev);
1603		}
1604	}
1605
1606	next = entry->next;
1607	if (next != &map->header) {
1608		esize = entry->end - entry->start;
1609		if ((entry->end == next->start) &&
1610		    (next->object.vm_object == entry->object.vm_object) &&
1611		     (!entry->object.vm_object ||
1612			(entry->offset + esize == next->offset)) &&
1613		    (next->eflags == entry->eflags) &&
1614		    (next->protection == entry->protection) &&
1615		    (next->max_protection == entry->max_protection) &&
1616		    (next->inheritance == entry->inheritance) &&
1617		    (next->wired_count == entry->wired_count) &&
1618		    (next->cred == entry->cred)) {
1619			vm_map_entry_unlink(map, next);
1620			entry->end = next->end;
1621			vm_map_entry_resize_free(map, entry);
1622
1623			/*
1624			 * See comment above.
1625			 */
1626			if (next->object.vm_object)
1627				vm_object_deallocate(next->object.vm_object);
1628			if (next->cred != NULL)
1629				crfree(next->cred);
1630			vm_map_entry_dispose(map, next);
1631		}
1632	}
1633}
1634/*
1635 *	vm_map_clip_start:	[ internal use only ]
1636 *
1637 *	Asserts that the given entry begins at or after
1638 *	the specified address; if necessary,
1639 *	it splits the entry into two.
1640 */
1641#define vm_map_clip_start(map, entry, startaddr) \
1642{ \
1643	if (startaddr > entry->start) \
1644		_vm_map_clip_start(map, entry, startaddr); \
1645}
1646
1647/*
1648 *	This routine is called only when it is known that
1649 *	the entry must be split.
1650 */
1651static void
1652_vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
1653{
1654	vm_map_entry_t new_entry;
1655
1656	VM_MAP_ASSERT_LOCKED(map);
1657
1658	/*
1659	 * Split off the front portion -- note that we must insert the new
1660	 * entry BEFORE this one, so that this entry has the specified
1661	 * starting address.
1662	 */
1663	vm_map_simplify_entry(map, entry);
1664
1665	/*
1666	 * If there is no object backing this entry, we might as well create
1667	 * one now.  If we defer it, an object can get created after the map
1668	 * is clipped, and individual objects will be created for the split-up
1669	 * map.  This is a bit of a hack, but is also about the best place to
1670	 * put this improvement.
1671	 */
1672	if (entry->object.vm_object == NULL && !map->system_map &&
1673	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
1674		vm_object_t object;
1675		object = vm_object_allocate(OBJT_DEFAULT,
1676				atop(entry->end - entry->start));
1677		entry->object.vm_object = object;
1678		entry->offset = 0;
1679		if (entry->cred != NULL) {
1680			object->cred = entry->cred;
1681			object->charge = entry->end - entry->start;
1682			entry->cred = NULL;
1683		}
1684	} else if (entry->object.vm_object != NULL &&
1685		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1686		   entry->cred != NULL) {
1687		VM_OBJECT_WLOCK(entry->object.vm_object);
1688		KASSERT(entry->object.vm_object->cred == NULL,
1689		    ("OVERCOMMIT: vm_entry_clip_start: both cred e %p", entry));
1690		entry->object.vm_object->cred = entry->cred;
1691		entry->object.vm_object->charge = entry->end - entry->start;
1692		VM_OBJECT_WUNLOCK(entry->object.vm_object);
1693		entry->cred = NULL;
1694	}
1695
1696	new_entry = vm_map_entry_create(map);
1697	*new_entry = *entry;
1698
1699	new_entry->end = start;
1700	entry->offset += (start - entry->start);
1701	entry->start = start;
1702	if (new_entry->cred != NULL)
1703		crhold(entry->cred);
1704
1705	vm_map_entry_link(map, entry->prev, new_entry);
1706
1707	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1708		vm_object_reference(new_entry->object.vm_object);
1709		/*
1710		 * The object->un_pager.vnp.writemappings for the
1711		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
1712		 * kept as is here.  The virtual pages are
1713		 * re-distributed among the clipped entries, so the sum is
1714		 * left the same.
1715		 */
1716	}
1717}
1718
1719/*
1720 *	vm_map_clip_end:	[ internal use only ]
1721 *
1722 *	Asserts that the given entry ends at or before
1723 *	the specified address; if necessary,
1724 *	it splits the entry into two.
1725 */
1726#define vm_map_clip_end(map, entry, endaddr) \
1727{ \
1728	if ((endaddr) < (entry->end)) \
1729		_vm_map_clip_end((map), (entry), (endaddr)); \
1730}
1731
1732/*
1733 *	This routine is called only when it is known that
1734 *	the entry must be split.
1735 */
1736static void
1737_vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
1738{
1739	vm_map_entry_t new_entry;
1740
1741	VM_MAP_ASSERT_LOCKED(map);
1742
1743	/*
1744	 * If there is no object backing this entry, we might as well create
1745	 * one now.  If we defer it, an object can get created after the map
1746	 * is clipped, and individual objects will be created for the split-up
1747	 * map.  This is a bit of a hack, but is also about the best place to
1748	 * put this improvement.
1749	 */
1750	if (entry->object.vm_object == NULL && !map->system_map &&
1751	    (entry->eflags & MAP_ENTRY_GUARD) == 0) {
1752		vm_object_t object;
1753		object = vm_object_allocate(OBJT_DEFAULT,
1754				atop(entry->end - entry->start));
1755		entry->object.vm_object = object;
1756		entry->offset = 0;
1757		if (entry->cred != NULL) {
1758			object->cred = entry->cred;
1759			object->charge = entry->end - entry->start;
1760			entry->cred = NULL;
1761		}
1762	} else if (entry->object.vm_object != NULL &&
1763		   ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
1764		   entry->cred != NULL) {
1765		VM_OBJECT_WLOCK(entry->object.vm_object);
1766		KASSERT(entry->object.vm_object->cred == NULL,
1767		    ("OVERCOMMIT: vm_entry_clip_end: both cred e %p", entry));
1768		entry->object.vm_object->cred = entry->cred;
1769		entry->object.vm_object->charge = entry->end - entry->start;
1770		VM_OBJECT_WUNLOCK(entry->object.vm_object);
1771		entry->cred = NULL;
1772	}
1773
1774	/*
1775	 * Create a new entry and insert it AFTER the specified entry
1776	 */
1777	new_entry = vm_map_entry_create(map);
1778	*new_entry = *entry;
1779
1780	new_entry->start = entry->end = end;
1781	new_entry->offset += (end - entry->start);
1782	if (new_entry->cred != NULL)
1783		crhold(entry->cred);
1784
1785	vm_map_entry_link(map, entry, new_entry);
1786
1787	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
1788		vm_object_reference(new_entry->object.vm_object);
1789	}
1790}
1791
1792/*
1793 *	vm_map_submap:		[ kernel use only ]
1794 *
1795 *	Mark the given range as handled by a subordinate map.
1796 *
1797 *	This range must have been created with vm_map_find,
1798 *	and no other operations may have been performed on this
1799 *	range prior to calling vm_map_submap.
1800 *
1801 *	Only a limited number of operations can be performed
 *	within this range after calling vm_map_submap:
1803 *		vm_fault
1804 *	[Don't try vm_map_copy!]
1805 *
1806 *	To remove a submapping, one must first remove the
1807 *	range from the superior map, and then destroy the
1808 *	submap (if desired).  [Better yet, don't try it.]
1809 */
1810int
1811vm_map_submap(
1812	vm_map_t map,
1813	vm_offset_t start,
1814	vm_offset_t end,
1815	vm_map_t submap)
1816{
1817	vm_map_entry_t entry;
1818	int result = KERN_INVALID_ARGUMENT;
1819
1820	vm_map_lock(map);
1821
1822	VM_MAP_RANGE_CHECK(map, start, end);
1823
1824	if (vm_map_lookup_entry(map, start, &entry)) {
1825		vm_map_clip_start(map, entry, start);
1826	} else
1827		entry = entry->next;
1828
1829	vm_map_clip_end(map, entry, end);
1830
1831	if ((entry->start == start) && (entry->end == end) &&
1832	    ((entry->eflags & MAP_ENTRY_COW) == 0) &&
1833	    (entry->object.vm_object == NULL)) {
1834		entry->object.sub_map = submap;
1835		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
1836		result = KERN_SUCCESS;
1837	}
1838	vm_map_unlock(map);
1839
1840	return (result);
1841}
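
/*
 * A minimal usage sketch with hypothetical names: a range that was
 * reserved with vm_map_find() is handed over to a subordinate map.
 *
 *	static void
 *	example_install_submap(vm_map_t parent, vm_map_t sub,
 *	    vm_offset_t start, vm_offset_t end)
 *	{
 *
 *		if (vm_map_submap(parent, start, end, sub) != KERN_SUCCESS)
 *			panic("example_install_submap: vm_map_submap failed");
 *	}
 */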
1842
1843/*
1844 * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
1845 */
1846#define	MAX_INIT_PT	96
1847
1848/*
1849 *	vm_map_pmap_enter:
1850 *
1851 *	Preload the specified map's pmap with mappings to the specified
1852 *	object's memory-resident pages.  No further physical pages are
1853 *	allocated, and no further virtual pages are retrieved from secondary
1854 *	storage.  If the specified flags include MAP_PREFAULT_PARTIAL, then a
1855 *	limited number of page mappings are created at the low end of the
1856 *	specified address range.  (For this purpose, a superpage mapping
1857 *	counts as one page mapping.)  Otherwise, all resident pages within
1858 *	the specified address range are mapped.  Because these mappings are
1859 *	being created speculatively, cached pages are not reactivated and
1860 *	mapped.
1861 */
1862void
1863vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
1864    vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
1865{
1866	vm_offset_t start;
1867	vm_page_t p, p_start;
1868	vm_pindex_t mask, psize, threshold, tmpidx;
1869
1870	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
1871		return;
1872	VM_OBJECT_RLOCK(object);
1873	if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
1874		VM_OBJECT_RUNLOCK(object);
1875		VM_OBJECT_WLOCK(object);
1876		if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
1877			pmap_object_init_pt(map->pmap, addr, object, pindex,
1878			    size);
1879			VM_OBJECT_WUNLOCK(object);
1880			return;
1881		}
1882		VM_OBJECT_LOCK_DOWNGRADE(object);
1883	}
1884
1885	psize = atop(size);
1886	if (psize + pindex > object->size) {
1887		if (object->size < pindex) {
1888			VM_OBJECT_RUNLOCK(object);
1889			return;
1890		}
1891		psize = object->size - pindex;
1892	}
1893
1894	start = 0;
1895	p_start = NULL;
1896	threshold = MAX_INIT_PT;
1897
1898	p = vm_page_find_least(object, pindex);
1899	/*
1900	 * Assert: the variable p is either (1) the page with the
1901	 * least pindex greater than or equal to the parameter pindex
1902	 * or (2) NULL.
1903	 */
1904	for (;
1905	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
1906	     p = TAILQ_NEXT(p, listq)) {
1907		/*
1908		 * Don't allow a madvise to blow away our really
1909		 * free pages by allocating pv entries.
1910		 */
1911		if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
1912		    cnt.v_free_count < cnt.v_free_reserved) ||
1913		    ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
1914		    tmpidx >= threshold)) {
1915			psize = tmpidx;
1916			break;
1917		}
1918		if (p->valid == VM_PAGE_BITS_ALL) {
1919			if (p_start == NULL) {
1920				start = addr + ptoa(tmpidx);
1921				p_start = p;
1922			}
1923			/* Jump ahead if a superpage mapping is possible. */
1924			if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
1925			    (pagesizes[p->psind] - 1)) == 0) {
1926				mask = atop(pagesizes[p->psind]) - 1;
1927				if (tmpidx + mask < psize &&
1928				    vm_page_ps_is_valid(p)) {
1929					p += mask;
1930					threshold += mask;
1931				}
1932			}
1933		} else if (p_start != NULL) {
1934			pmap_enter_object(map->pmap, start, addr +
1935			    ptoa(tmpidx), p_start, prot);
1936			p_start = NULL;
1937		}
1938	}
1939	if (p_start != NULL)
1940		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
1941		    p_start, prot);
1942	VM_OBJECT_RUNLOCK(object);
1943}
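
/*
 * A minimal usage sketch with hypothetical arguments: preload at most
 * MAX_INIT_PT page mappings for the object's resident pages starting at
 * map address addr, as vm_map_madvise() does below for MADV_WILLNEED
 * (which passes MAP_PREFAULT_MADVISE instead):
 *
 *	vm_map_pmap_enter(map, addr, VM_PROT_READ, object, pindex, size,
 *	    MAP_PREFAULT_PARTIAL);
 */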
1944
1945/*
1946 *	vm_map_protect:
1947 *
1948 *	Sets the protection of the specified address
1949 *	region in the target map.  If "set_max" is
1950 *	specified, the maximum protection is to be set;
1951 *	otherwise, only the current protection is affected.
1952 */
1953int
1954vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
1955	       vm_prot_t new_prot, boolean_t set_max)
1956{
1957	vm_map_entry_t current, entry;
1958	vm_object_t obj;
1959	struct ucred *cred;
1960	vm_prot_t old_prot;
1961
1962	if (start == end)
1963		return (KERN_SUCCESS);
1964
1965	vm_map_lock(map);
1966
1967	VM_MAP_RANGE_CHECK(map, start, end);
1968
1969	if (vm_map_lookup_entry(map, start, &entry)) {
1970		vm_map_clip_start(map, entry, start);
1971	} else {
1972		entry = entry->next;
1973	}
1974
1975	/*
1976	 * Make a first pass to check for protection violations.
1977	 */
1978	for (current = entry; current != &map->header && current->start < end;
1979	    current = current->next) {
1980		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
1981			continue;
1982		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
1983			vm_map_unlock(map);
1984			return (KERN_INVALID_ARGUMENT);
1985		}
1986		if ((new_prot & current->max_protection) != new_prot) {
1987			vm_map_unlock(map);
1988			return (KERN_PROTECTION_FAILURE);
1989		}
1990	}
1991
1992	/*
1993	 * Do an accounting pass for private read-only mappings that
1994	 * will now do copy-on-write due to the newly allowed write
1995	 * access (e.g., a debugger sets a breakpoint on the text segment).
1996	 */
1997	for (current = entry; current != &map->header && current->start < end;
1998	    current = current->next) {
1999
2000		vm_map_clip_end(map, current, end);
2001
2002		if (set_max ||
2003		    ((new_prot & ~(current->protection)) & VM_PROT_WRITE) == 0 ||
2004		    ENTRY_CHARGED(current) ||
2005		    (current->eflags & MAP_ENTRY_GUARD) != 0) {
2006			continue;
2007		}
2008
2009		cred = curthread->td_ucred;
2010		obj = current->object.vm_object;
2011
2012		if (obj == NULL || (current->eflags & MAP_ENTRY_NEEDS_COPY)) {
2013			if (!swap_reserve(current->end - current->start)) {
2014				vm_map_unlock(map);
2015				return (KERN_RESOURCE_SHORTAGE);
2016			}
2017			crhold(cred);
2018			current->cred = cred;
2019			continue;
2020		}
2021
2022		VM_OBJECT_WLOCK(obj);
2023		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
2024			VM_OBJECT_WUNLOCK(obj);
2025			continue;
2026		}
2027
2028		/*
2029		 * Charge for the whole object allocation now, since
2030		 * we cannot distinguish between non-charged and
2031		 * charged clipped mappings of the same object later.
2032		 */
2033		KASSERT(obj->charge == 0,
2034		    ("vm_map_protect: object %p overcharged (entry %p)",
2035		    obj, current));
2036		if (!swap_reserve(ptoa(obj->size))) {
2037			VM_OBJECT_WUNLOCK(obj);
2038			vm_map_unlock(map);
2039			return (KERN_RESOURCE_SHORTAGE);
2040		}
2041
2042		crhold(cred);
2043		obj->cred = cred;
2044		obj->charge = ptoa(obj->size);
2045		VM_OBJECT_WUNLOCK(obj);
2046	}
2047
2048	/*
2049	 * Go back and fix up protections. [Note that clipping is not
2050	 * necessary the second time.]
2051	 */
2052	for (current = entry; current != &map->header && current->start < end;
2053	    current = current->next) {
2054		if ((current->eflags & MAP_ENTRY_GUARD) != 0)
2055			continue;
2056
2057		old_prot = current->protection;
2058
2059		if (set_max)
2060			current->protection =
2061			    (current->max_protection = new_prot) &
2062			    old_prot;
2063		else
2064			current->protection = new_prot;
2065
2066		/*
2067		 * For user wired map entries, the normal lazy evaluation of
2068		 * write access upgrades through soft page faults is
2069		 * undesirable.  Instead, immediately copy any pages that are
2070		 * copy-on-write and enable write access in the physical map.
2071		 */
2072		if ((current->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2073		    (current->protection & VM_PROT_WRITE) != 0 &&
2074		    (old_prot & VM_PROT_WRITE) == 0)
2075			vm_fault_copy_entry(map, map, current, current, NULL);
2076
2077		/*
2078		 * When restricting access, update the physical map.  Worry
2079		 * about copy-on-write here.
2080		 */
2081		if ((old_prot & ~current->protection) != 0) {
2082#define MASK(entry)	(((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2083							VM_PROT_ALL)
2084			pmap_protect(map->pmap, current->start,
2085			    current->end,
2086			    current->protection & MASK(current));
2087#undef	MASK
2088		}
2089		vm_map_simplify_entry(map, current);
2090	}
2091	vm_map_unlock(map);
2092	return (KERN_SUCCESS);
2093}
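
/*
 * A minimal usage sketch with hypothetical values: temporarily revoke
 * write access on a range without touching the maximum protection, and
 * restore it later.  Both calls operate on the current protection because
 * set_max is FALSE.
 *
 *	rv = vm_map_protect(map, start, end, VM_PROT_READ, FALSE);
 *	if (rv == KERN_SUCCESS)
 *		rv = vm_map_protect(map, start, end,
 *		    VM_PROT_READ | VM_PROT_WRITE, FALSE);
 */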
2094
2095/*
2096 *	vm_map_madvise:
2097 *
2098 *	This routine traverses a process's map handling the madvise
2099 *	system call.  Advisories are classified as either those affecting
2100 *	the vm_map_entry structure, or those affecting the underlying
2101 *	objects.
2102 */
2103int
2104vm_map_madvise(
2105	vm_map_t map,
2106	vm_offset_t start,
2107	vm_offset_t end,
2108	int behav)
2109{
2110	vm_map_entry_t current, entry;
2111	int modify_map = 0;
2112
2113	/*
2114	 * Some madvise calls directly modify the vm_map_entry, in which case
2115	 * we need to use an exclusive lock on the map and we need to perform
2116	 * various clipping operations.  Otherwise we only need a read-lock
2117	 * on the map.
2118	 */
2119	switch(behav) {
2120	case MADV_NORMAL:
2121	case MADV_SEQUENTIAL:
2122	case MADV_RANDOM:
2123	case MADV_NOSYNC:
2124	case MADV_AUTOSYNC:
2125	case MADV_NOCORE:
2126	case MADV_CORE:
2127		if (start == end)
2128			return (KERN_SUCCESS);
2129		modify_map = 1;
2130		vm_map_lock(map);
2131		break;
2132	case MADV_WILLNEED:
2133	case MADV_DONTNEED:
2134	case MADV_FREE:
2135		if (start == end)
2136			return (KERN_SUCCESS);
2137		vm_map_lock_read(map);
2138		break;
2139	default:
2140		return (KERN_INVALID_ARGUMENT);
2141	}
2142
2143	/*
2144	 * Locate starting entry and clip if necessary.
2145	 */
2146	VM_MAP_RANGE_CHECK(map, start, end);
2147
2148	if (vm_map_lookup_entry(map, start, &entry)) {
2149		if (modify_map)
2150			vm_map_clip_start(map, entry, start);
2151	} else {
2152		entry = entry->next;
2153	}
2154
2155	if (modify_map) {
2156		/*
2157		 * madvise behaviors that are implemented in the vm_map_entry.
2158		 *
2159		 * We clip the vm_map_entry so that behavioral changes are
2160		 * limited to the specified address range.
2161		 */
2162		for (current = entry;
2163		     (current != &map->header) && (current->start < end);
2164		     current = current->next
2165		) {
2166			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2167				continue;
2168
2169			vm_map_clip_end(map, current, end);
2170
2171			switch (behav) {
2172			case MADV_NORMAL:
2173				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
2174				break;
2175			case MADV_SEQUENTIAL:
2176				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
2177				break;
2178			case MADV_RANDOM:
2179				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
2180				break;
2181			case MADV_NOSYNC:
2182				current->eflags |= MAP_ENTRY_NOSYNC;
2183				break;
2184			case MADV_AUTOSYNC:
2185				current->eflags &= ~MAP_ENTRY_NOSYNC;
2186				break;
2187			case MADV_NOCORE:
2188				current->eflags |= MAP_ENTRY_NOCOREDUMP;
2189				break;
2190			case MADV_CORE:
2191				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
2192				break;
2193			default:
2194				break;
2195			}
2196			vm_map_simplify_entry(map, current);
2197		}
2198		vm_map_unlock(map);
2199	} else {
2200		vm_pindex_t pstart, pend;
2201
2202		/*
2203		 * madvise behaviors that are implemented in the underlying
2204		 * vm_object.
2205		 *
2206		 * Since we don't clip the vm_map_entry, we have to clip
2207		 * the vm_object pindex and count.
2208		 */
2209		for (current = entry;
2210		     (current != &map->header) && (current->start < end);
2211		     current = current->next
2212		) {
2213			vm_offset_t useEnd, useStart;
2214
2215			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
2216				continue;
2217
2218			pstart = OFF_TO_IDX(current->offset);
2219			pend = pstart + atop(current->end - current->start);
2220			useStart = current->start;
2221			useEnd = current->end;
2222
2223			if (current->start < start) {
2224				pstart += atop(start - current->start);
2225				useStart = start;
2226			}
2227			if (current->end > end) {
2228				pend -= atop(current->end - end);
2229				useEnd = end;
2230			}
2231
2232			if (pstart >= pend)
2233				continue;
2234
2235			/*
2236			 * Perform the pmap_advise() before clearing
2237			 * PGA_REFERENCED in vm_page_advise().  Otherwise, a
2238			 * concurrent pmap operation, such as pmap_remove(),
2239			 * could clear a reference in the pmap and set
2240			 * PGA_REFERENCED on the page before the pmap_advise()
2241			 * had completed.  Consequently, the page would appear
2242			 * referenced based upon an old reference that
2243			 * occurred before this pmap_advise() ran.
2244			 */
2245			if (behav == MADV_DONTNEED || behav == MADV_FREE)
2246				pmap_advise(map->pmap, useStart, useEnd,
2247				    behav);
2248
2249			vm_object_madvise(current->object.vm_object, pstart,
2250			    pend, behav);
2251
2252			/*
2253			 * Pre-populate paging structures in the
2254			 * WILLNEED case.  For wired entries, the
2255			 * paging structures are already populated.
2256			 */
2257			if (behav == MADV_WILLNEED &&
2258			    current->wired_count == 0) {
2259				vm_map_pmap_enter(map,
2260				    useStart,
2261				    current->protection,
2262				    current->object.vm_object,
2263				    pstart,
2264				    ptoa(pend - pstart),
2265				    MAP_PREFAULT_MADVISE
2266				);
2267			}
2268		}
2269		vm_map_unlock_read(map);
2270	}
2271	return (0);
2272}
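
/*
 * A minimal usage sketch with hypothetical bounds: advise that a range
 * will be needed soon, so its resident pages are premapped via
 * vm_map_pmap_enter(), and later advise that its contents may be lazily
 * freed.
 *
 *	(void)vm_map_madvise(map, start, end, MADV_WILLNEED);
 *	(void)vm_map_madvise(map, start, end, MADV_FREE);
 */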
2273
2274
2275/*
2276 *	vm_map_inherit:
2277 *
2278 *	Sets the inheritance of the specified address
2279 *	range in the target map.  Inheritance
2280 *	affects how the map will be shared with
2281 *	child maps at the time of vmspace_fork.
2282 */
2283int
2284vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
2285	       vm_inherit_t new_inheritance)
2286{
2287	vm_map_entry_t entry;
2288	vm_map_entry_t temp_entry;
2289
2290	switch (new_inheritance) {
2291	case VM_INHERIT_NONE:
2292	case VM_INHERIT_COPY:
2293	case VM_INHERIT_SHARE:
2294	case VM_INHERIT_ZERO:
2295		break;
2296	default:
2297		return (KERN_INVALID_ARGUMENT);
2298	}
2299	if (start == end)
2300		return (KERN_SUCCESS);
2301	vm_map_lock(map);
2302	VM_MAP_RANGE_CHECK(map, start, end);
2303	if (vm_map_lookup_entry(map, start, &temp_entry)) {
2304		entry = temp_entry;
2305		vm_map_clip_start(map, entry, start);
2306	} else
2307		entry = temp_entry->next;
2308	while ((entry != &map->header) && (entry->start < end)) {
2309		vm_map_clip_end(map, entry, end);
2310		if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
2311		    new_inheritance != VM_INHERIT_ZERO)
2312			entry->inheritance = new_inheritance;
2313		vm_map_simplify_entry(map, entry);
2314		entry = entry->next;
2315	}
2316	vm_map_unlock(map);
2317	return (KERN_SUCCESS);
2318}
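
/*
 * A minimal usage sketch with hypothetical ranges: keep a shared-memory
 * window visible to children created by vmspace_fork(), and have a
 * scratch region replaced by zero-filled anonymous memory in the child.
 *
 *	(void)vm_map_inherit(map, shm_start, shm_end, VM_INHERIT_SHARE);
 *	(void)vm_map_inherit(map, tmp_start, tmp_end, VM_INHERIT_ZERO);
 */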
2319
2320/*
2321 *	vm_map_unwire:
2322 *
2323 *	Implements both kernel and user unwiring.
2324 */
2325int
2326vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2327    int flags)
2328{
2329	vm_map_entry_t entry, first_entry, tmp_entry;
2330	vm_offset_t saved_start;
2331	unsigned int last_timestamp;
2332	int rv;
2333	boolean_t need_wakeup, result, user_unwire;
2334
2335	if (start == end)
2336		return (KERN_SUCCESS);
2337	user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2338	vm_map_lock(map);
2339	VM_MAP_RANGE_CHECK(map, start, end);
2340	if (!vm_map_lookup_entry(map, start, &first_entry)) {
2341		if (flags & VM_MAP_WIRE_HOLESOK)
2342			first_entry = first_entry->next;
2343		else {
2344			vm_map_unlock(map);
2345			return (KERN_INVALID_ADDRESS);
2346		}
2347	}
2348	last_timestamp = map->timestamp;
2349	entry = first_entry;
2350	while (entry != &map->header && entry->start < end) {
2351		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2352			/*
2353			 * We have not yet clipped the entry.
2354			 */
2355			saved_start = (start >= entry->start) ? start :
2356			    entry->start;
2357			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2358			if (vm_map_unlock_and_wait(map, 0)) {
2359				/*
2360				 * Allow interruption of user unwiring?
2361				 */
2362			}
2363			vm_map_lock(map);
2364			if (last_timestamp+1 != map->timestamp) {
2365				/*
2366				 * Look again for the entry because the map was
2367				 * modified while it was unlocked.
2368				 * Specifically, the entry may have been
2369				 * clipped, merged, or deleted.
2370				 */
2371				if (!vm_map_lookup_entry(map, saved_start,
2372				    &tmp_entry)) {
2373					if (flags & VM_MAP_WIRE_HOLESOK)
2374						tmp_entry = tmp_entry->next;
2375					else {
2376						if (saved_start == start) {
2377							/*
2378							 * First_entry has been deleted.
2379							 */
2380							vm_map_unlock(map);
2381							return (KERN_INVALID_ADDRESS);
2382						}
2383						end = saved_start;
2384						rv = KERN_INVALID_ADDRESS;
2385						goto done;
2386					}
2387				}
2388				if (entry == first_entry)
2389					first_entry = tmp_entry;
2390				else
2391					first_entry = NULL;
2392				entry = tmp_entry;
2393			}
2394			last_timestamp = map->timestamp;
2395			continue;
2396		}
2397		vm_map_clip_start(map, entry, start);
2398		vm_map_clip_end(map, entry, end);
2399		/*
2400		 * Mark the entry in case the map lock is released.  (See
2401		 * above.)
2402		 */
2403		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2404		    entry->wiring_thread == NULL,
2405		    ("owned map entry %p", entry));
2406		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2407		entry->wiring_thread = curthread;
2408		/*
2409		 * Check the map for holes in the specified region.
2410		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2411		 */
2412		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
2413		    (entry->end < end && (entry->next == &map->header ||
2414		    entry->next->start > entry->end))) {
2415			end = entry->end;
2416			rv = KERN_INVALID_ADDRESS;
2417			goto done;
2418		}
2419		/*
2420		 * If system unwiring, require that the entry is system wired.
2421		 */
2422		if (!user_unwire &&
2423		    vm_map_entry_system_wired_count(entry) == 0) {
2424			end = entry->end;
2425			rv = KERN_INVALID_ARGUMENT;
2426			goto done;
2427		}
2428		entry = entry->next;
2429	}
2430	rv = KERN_SUCCESS;
2431done:
2432	need_wakeup = FALSE;
2433	if (first_entry == NULL) {
2434		result = vm_map_lookup_entry(map, start, &first_entry);
2435		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2436			first_entry = first_entry->next;
2437		else
2438			KASSERT(result, ("vm_map_unwire: lookup failed"));
2439	}
2440	for (entry = first_entry; entry != &map->header && entry->start < end;
2441	    entry = entry->next) {
2442		/*
2443		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
2444		 * space in the unwired region could have been mapped
2445		 * while the map lock was dropped for draining
2446		 * MAP_ENTRY_IN_TRANSITION.  Moreover, another thread
2447		 * could be simultaneously wiring this new mapping
2448		 * entry.  Detect these cases and skip any entries
2449		 * marked as in transition not by us.
2450		 */
2451		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2452		    entry->wiring_thread != curthread) {
2453			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2454			    ("vm_map_unwire: !HOLESOK and new/changed entry"));
2455			continue;
2456		}
2457
2458		if (rv == KERN_SUCCESS && (!user_unwire ||
2459		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
2460			if (user_unwire)
2461				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
2462			if (entry->wired_count == 1)
2463				vm_map_entry_unwire(map, entry);
2464			else
2465				entry->wired_count--;
2466		}
2467		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2468		    ("vm_map_unwire: in-transition flag missing %p", entry));
2469		KASSERT(entry->wiring_thread == curthread,
2470		    ("vm_map_unwire: alien wire %p", entry));
2471		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
2472		entry->wiring_thread = NULL;
2473		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2474			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2475			need_wakeup = TRUE;
2476		}
2477		vm_map_simplify_entry(map, entry);
2478	}
2479	vm_map_unlock(map);
2480	if (need_wakeup)
2481		vm_map_wakeup(map);
2482	return (rv);
2483}
2484
2485/*
2486 *	vm_map_wire_entry_failure:
2487 *
2488 *	Handle a wiring failure on the given entry.
2489 *
2490 *	The map should be locked.
2491 */
2492static void
2493vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
2494    vm_offset_t failed_addr)
2495{
2496
2497	VM_MAP_ASSERT_LOCKED(map);
2498	KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
2499	    entry->wired_count == 1,
2500	    ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
2501	KASSERT(failed_addr < entry->end,
2502	    ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
2503
2504	/*
2505	 * If any pages at the start of this entry were successfully wired,
2506	 * then unwire them.
2507	 */
2508	if (failed_addr > entry->start) {
2509		pmap_unwire(map->pmap, entry->start, failed_addr);
2510		vm_object_unwire(entry->object.vm_object, entry->offset,
2511		    failed_addr - entry->start, PQ_ACTIVE);
2512	}
2513
2514	/*
2515	 * Assign an out-of-range value to represent the failure to wire this
2516	 * entry.
2517	 */
2518	entry->wired_count = -1;
2519}
2520
2521/*
2522 *	vm_map_wire:
2523 *
2524 *	Implements both kernel and user wiring.
2525 */
2526int
2527vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
2528    int flags)
2529{
2530	vm_map_entry_t entry, first_entry, tmp_entry;
2531	vm_offset_t faddr, saved_end, saved_start;
2532	unsigned int last_timestamp;
2533	int rv;
2534	boolean_t need_wakeup, result, user_wire;
2535	vm_prot_t prot;
2536
2537	if (start == end)
2538		return (KERN_SUCCESS);
2539	prot = 0;
2540	if (flags & VM_MAP_WIRE_WRITE)
2541		prot |= VM_PROT_WRITE;
2542	user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
2543	vm_map_lock(map);
2544	VM_MAP_RANGE_CHECK(map, start, end);
2545	if (!vm_map_lookup_entry(map, start, &first_entry)) {
2546		if (flags & VM_MAP_WIRE_HOLESOK)
2547			first_entry = first_entry->next;
2548		else {
2549			vm_map_unlock(map);
2550			return (KERN_INVALID_ADDRESS);
2551		}
2552	}
2553	last_timestamp = map->timestamp;
2554	entry = first_entry;
2555	while (entry != &map->header && entry->start < end) {
2556		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
2557			/*
2558			 * We have not yet clipped the entry.
2559			 */
2560			saved_start = (start >= entry->start) ? start :
2561			    entry->start;
2562			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2563			if (vm_map_unlock_and_wait(map, 0)) {
2564				/*
2565				 * Allow interruption of user wiring?
2566				 */
2567			}
2568			vm_map_lock(map);
2569			if (last_timestamp + 1 != map->timestamp) {
2570				/*
2571				 * Look again for the entry because the map was
2572				 * modified while it was unlocked.
2573				 * Specifically, the entry may have been
2574				 * clipped, merged, or deleted.
2575				 */
2576				if (!vm_map_lookup_entry(map, saved_start,
2577				    &tmp_entry)) {
2578					if (flags & VM_MAP_WIRE_HOLESOK)
2579						tmp_entry = tmp_entry->next;
2580					else {
2581						if (saved_start == start) {
2582							/*
2583							 * first_entry has been deleted.
2584							 */
2585							vm_map_unlock(map);
2586							return (KERN_INVALID_ADDRESS);
2587						}
2588						end = saved_start;
2589						rv = KERN_INVALID_ADDRESS;
2590						goto done;
2591					}
2592				}
2593				if (entry == first_entry)
2594					first_entry = tmp_entry;
2595				else
2596					first_entry = NULL;
2597				entry = tmp_entry;
2598			}
2599			last_timestamp = map->timestamp;
2600			continue;
2601		}
2602		vm_map_clip_start(map, entry, start);
2603		vm_map_clip_end(map, entry, end);
2604		/*
2605		 * Mark the entry in case the map lock is released.  (See
2606		 * above.)
2607		 */
2608		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
2609		    entry->wiring_thread == NULL,
2610		    ("owned map entry %p", entry));
2611		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
2612		entry->wiring_thread = curthread;
2613		if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
2614		    || (entry->protection & prot) != prot) {
2615			entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
2616			if ((flags & VM_MAP_WIRE_HOLESOK) == 0) {
2617				end = entry->end;
2618				rv = KERN_INVALID_ADDRESS;
2619				goto done;
2620			}
2621			goto next_entry;
2622		}
2623		if (entry->wired_count == 0) {
2624			entry->wired_count++;
2625			saved_start = entry->start;
2626			saved_end = entry->end;
2627
2628			/*
2629			 * Release the map lock, relying on the in-transition
2630			 * mark.  Mark the map busy for fork.
2631			 */
2632			vm_map_busy(map);
2633			vm_map_unlock(map);
2634
2635			faddr = saved_start;
2636			do {
2637				/*
2638				 * Simulate a fault to get the page and enter
2639				 * it into the physical map.
2640				 */
2641				if ((rv = vm_fault(map, faddr, VM_PROT_NONE,
2642				    VM_FAULT_WIRE)) != KERN_SUCCESS)
2643					break;
2644			} while ((faddr += PAGE_SIZE) < saved_end);
2645			vm_map_lock(map);
2646			vm_map_unbusy(map);
2647			if (last_timestamp + 1 != map->timestamp) {
2648				/*
2649				 * Look again for the entry because the map was
2650				 * modified while it was unlocked.  The entry
2651				 * may have been clipped, but NOT merged or
2652				 * deleted.
2653				 */
2654				result = vm_map_lookup_entry(map, saved_start,
2655				    &tmp_entry);
2656				KASSERT(result, ("vm_map_wire: lookup failed"));
2657				if (entry == first_entry)
2658					first_entry = tmp_entry;
2659				else
2660					first_entry = NULL;
2661				entry = tmp_entry;
2662				while (entry->end < saved_end) {
2663					/*
2664					 * In case of failure, handle entries
2665					 * that were not fully wired here;
2666					 * fully wired entries are handled
2667					 * later.
2668					 */
2669					if (rv != KERN_SUCCESS &&
2670					    faddr < entry->end)
2671						vm_map_wire_entry_failure(map,
2672						    entry, faddr);
2673					entry = entry->next;
2674				}
2675			}
2676			last_timestamp = map->timestamp;
2677			if (rv != KERN_SUCCESS) {
2678				vm_map_wire_entry_failure(map, entry, faddr);
2679				end = entry->end;
2680				goto done;
2681			}
2682		} else if (!user_wire ||
2683			   (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2684			entry->wired_count++;
2685		}
2686		/*
2687		 * Check the map for holes in the specified region.
2688		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
2689		 */
2690	next_entry:
2691		if ((flags & VM_MAP_WIRE_HOLESOK) == 0 &&
2692		    entry->end < end && (entry->next == &map->header ||
2693		    entry->next->start > entry->end)) {
2694			end = entry->end;
2695			rv = KERN_INVALID_ADDRESS;
2696			goto done;
2697		}
2698		entry = entry->next;
2699	}
2700	rv = KERN_SUCCESS;
2701done:
2702	need_wakeup = FALSE;
2703	if (first_entry == NULL) {
2704		result = vm_map_lookup_entry(map, start, &first_entry);
2705		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
2706			first_entry = first_entry->next;
2707		else
2708			KASSERT(result, ("vm_map_wire: lookup failed"));
2709	}
2710	for (entry = first_entry; entry != &map->header && entry->start < end;
2711	    entry = entry->next) {
2712		/*
2713		 * If VM_MAP_WIRE_HOLESOK was specified, an empty
2714		 * space in the unwired region could have been mapped
2715		 * while the map lock was dropped for faulting in the
2716		 * pages or draining MAP_ENTRY_IN_TRANSITION.
2717		 * Moreover, another thread could be simultaneously
2718		 * wiring this new mapping entry.  Detect these cases
2719		 * and skip any entries marked as in transition not by us.
2720		 */
2721		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
2722		    entry->wiring_thread != curthread) {
2723			KASSERT((flags & VM_MAP_WIRE_HOLESOK) != 0,
2724			    ("vm_map_wire: !HOLESOK and new/changed entry"));
2725			continue;
2726		}
2727
2728		if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0)
2729			goto next_entry_done;
2730
2731		if (rv == KERN_SUCCESS) {
2732			if (user_wire)
2733				entry->eflags |= MAP_ENTRY_USER_WIRED;
2734		} else if (entry->wired_count == -1) {
2735			/*
2736			 * Wiring failed on this entry.  Thus, unwiring is
2737			 * unnecessary.
2738			 */
2739			entry->wired_count = 0;
2740		} else if (!user_wire ||
2741		    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
2742			/*
2743			 * Undo the wiring.  Wiring succeeded on this entry
2744			 * but failed on a later entry.
2745			 */
2746			if (entry->wired_count == 1)
2747				vm_map_entry_unwire(map, entry);
2748			else
2749				entry->wired_count--;
2750		}
2751	next_entry_done:
2752		KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
2753		    ("vm_map_wire: in-transition flag missing %p", entry));
2754		KASSERT(entry->wiring_thread == curthread,
2755		    ("vm_map_wire: alien wire %p", entry));
2756		entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
2757		    MAP_ENTRY_WIRE_SKIPPED);
2758		entry->wiring_thread = NULL;
2759		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
2760			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
2761			need_wakeup = TRUE;
2762		}
2763		vm_map_simplify_entry(map, entry);
2764	}
2765	vm_map_unlock(map);
2766	if (need_wakeup)
2767		vm_map_wakeup(map);
2768	return (rv);
2769}
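
/*
 * A minimal usage sketch with hypothetical bounds: user-wire a range,
 * demanding write access and failing if the range contains holes (no
 * VM_MAP_WIRE_HOLESOK), and undo it later with the matching unwire call.
 *
 *	rv = vm_map_wire(map, start, end,
 *	    VM_MAP_WIRE_USER | VM_MAP_WIRE_WRITE);
 *	if (rv == KERN_SUCCESS)
 *		rv = vm_map_unwire(map, start, end, VM_MAP_WIRE_USER);
 */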
2770
2771/*
2772 * vm_map_sync
2773 *
2774 * Push any dirty cached pages in the address range to their pager.
2775 * If syncio is TRUE, dirty pages are written synchronously.
2776 * If invalidate is TRUE, any cached pages are freed as well.
2777 *
2778 * If the size of the region from start to end is zero, we are
2779 * supposed to flush all modified pages within the region containing
2780 * start.  Unfortunately, a region can be split or coalesced with
2781 * neighboring regions, making it difficult to determine what the
2782 * original region was.  Therefore, we approximate this requirement by
2783 * flushing the current region containing start.
2784 *
2785 * Returns an error if any part of the specified range is not mapped.
2786 */
2787int
2788vm_map_sync(
2789	vm_map_t map,
2790	vm_offset_t start,
2791	vm_offset_t end,
2792	boolean_t syncio,
2793	boolean_t invalidate)
2794{
2795	vm_map_entry_t current;
2796	vm_map_entry_t entry;
2797	vm_size_t size;
2798	vm_object_t object;
2799	vm_ooffset_t offset;
2800	unsigned int last_timestamp;
2801	boolean_t failed;
2802
2803	vm_map_lock_read(map);
2804	VM_MAP_RANGE_CHECK(map, start, end);
2805	if (!vm_map_lookup_entry(map, start, &entry)) {
2806		vm_map_unlock_read(map);
2807		return (KERN_INVALID_ADDRESS);
2808	} else if (start == end) {
2809		start = entry->start;
2810		end = entry->end;
2811	}
2812	/*
2813	 * Make a first pass to check for user-wired memory and holes.
2814	 */
2815	for (current = entry; current != &map->header && current->start < end;
2816	    current = current->next) {
2817		if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) {
2818			vm_map_unlock_read(map);
2819			return (KERN_INVALID_ARGUMENT);
2820		}
2821		if (end > current->end &&
2822		    (current->next == &map->header ||
2823			current->end != current->next->start)) {
2824			vm_map_unlock_read(map);
2825			return (KERN_INVALID_ADDRESS);
2826		}
2827	}
2828
2829	if (invalidate)
2830		pmap_remove(map->pmap, start, end);
2831	failed = FALSE;
2832
2833	/*
2834	 * Make a second pass, cleaning/uncaching pages from the indicated
2835	 * objects as we go.
2836	 */
2837	for (current = entry; current != &map->header && current->start < end;) {
2838		offset = current->offset + (start - current->start);
2839		size = (end <= current->end ? end : current->end) - start;
2840		if (current->eflags & MAP_ENTRY_IS_SUB_MAP) {
2841			vm_map_t smap;
2842			vm_map_entry_t tentry;
2843			vm_size_t tsize;
2844
2845			smap = current->object.sub_map;
2846			vm_map_lock_read(smap);
2847			(void) vm_map_lookup_entry(smap, offset, &tentry);
2848			tsize = tentry->end - offset;
2849			if (tsize < size)
2850				size = tsize;
2851			object = tentry->object.vm_object;
2852			offset = tentry->offset + (offset - tentry->start);
2853			vm_map_unlock_read(smap);
2854		} else {
2855			object = current->object.vm_object;
2856		}
2857		vm_object_reference(object);
2858		last_timestamp = map->timestamp;
2859		vm_map_unlock_read(map);
2860		if (!vm_object_sync(object, offset, size, syncio, invalidate))
2861			failed = TRUE;
2862		start += size;
2863		vm_object_deallocate(object);
2864		vm_map_lock_read(map);
2865		if (last_timestamp == map->timestamp ||
2866		    !vm_map_lookup_entry(map, start, &current))
2867			current = current->next;
2868	}
2869
2870	vm_map_unlock_read(map);
2871	return (failed ? KERN_FAILURE : KERN_SUCCESS);
2872}
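
/*
 * A minimal usage sketch with hypothetical bounds: synchronously push
 * dirty pages in a range to their pager and invalidate the cached copies,
 * as an msync(2)-style request might.  Passing invalidate as TRUE fails
 * with KERN_INVALID_ARGUMENT on user-wired entries.
 *
 *	rv = vm_map_sync(map, start, end, TRUE, TRUE);
 */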
2873
2874/*
2875 *	vm_map_entry_unwire:	[ internal use only ]
2876 *
2877 *	Make the region specified by this entry pageable.
2878 *
2879 *	The map in question should be locked.
2880 *	[This is the reason for this routine's existence.]
2881 */
2882static void
2883vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
2884{
2885
2886	VM_MAP_ASSERT_LOCKED(map);
2887	KASSERT(entry->wired_count > 0,
2888	    ("vm_map_entry_unwire: entry %p isn't wired", entry));
2889	pmap_unwire(map->pmap, entry->start, entry->end);
2890	vm_object_unwire(entry->object.vm_object, entry->offset, entry->end -
2891	    entry->start, PQ_ACTIVE);
2892	entry->wired_count = 0;
2893}
2894
2895static void
2896vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
2897{
2898
2899	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
2900		vm_object_deallocate(entry->object.vm_object);
2901	uma_zfree(system_map ? kmapentzone : mapentzone, entry);
2902}
2903
2904/*
2905 *	vm_map_entry_delete:	[ internal use only ]
2906 *
2907 *	Deallocate the given entry from the target map.
2908 */
2909static void
2910vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
2911{
2912	vm_object_t object;
2913	vm_pindex_t offidxstart, offidxend, count, size1;
2914	vm_ooffset_t size;
2915
2916	vm_map_entry_unlink(map, entry);
2917	object = entry->object.vm_object;
2918
2919	if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
2920		MPASS(entry->cred == NULL);
2921		MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
2922		MPASS(object == NULL);
2923		vm_map_entry_deallocate(entry, map->system_map);
2924		return;
2925	}
2926
2927	size = entry->end - entry->start;
2928	map->size -= size;
2929
2930	if (entry->cred != NULL) {
2931		swap_release_by_cred(size, entry->cred);
2932		crfree(entry->cred);
2933	}
2934
2935	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
2936	    (object != NULL)) {
2937		KASSERT(entry->cred == NULL || object->cred == NULL ||
2938		    (entry->eflags & MAP_ENTRY_NEEDS_COPY),
2939		    ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
2940		count = OFF_TO_IDX(size);
2941		offidxstart = OFF_TO_IDX(entry->offset);
2942		offidxend = offidxstart + count;
2943		VM_OBJECT_WLOCK(object);
2944		if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT |
2945		    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING ||
2946		    object == kernel_object || object == kmem_object)) {
2947			vm_object_collapse(object);
2948
2949			/*
2950			 * The option OBJPR_NOTMAPPED can be passed here
2951			 * because vm_map_delete() already performed
2952			 * pmap_remove() on the only mapping to this range
2953			 * of pages.
2954			 */
2955			vm_object_page_remove(object, offidxstart, offidxend,
2956			    OBJPR_NOTMAPPED);
2957			if (object->type == OBJT_SWAP)
2958				swap_pager_freespace(object, offidxstart,
2959				    count);
2960			if (offidxend >= object->size &&
2961			    offidxstart < object->size) {
2962				size1 = object->size;
2963				object->size = offidxstart;
2964				if (object->cred != NULL) {
2965					size1 -= object->size;
2966					KASSERT(object->charge >= ptoa(size1),
2967					    ("object %p charge < 0", object));
2968					swap_release_by_cred(ptoa(size1),
2969					    object->cred);
2970					object->charge -= ptoa(size1);
2971				}
2972			}
2973		}
2974		VM_OBJECT_WUNLOCK(object);
2975	} else
2976		entry->object.vm_object = NULL;
2977	if (map->system_map)
2978		vm_map_entry_deallocate(entry, TRUE);
2979	else {
2980		entry->next = curthread->td_map_def_user;
2981		curthread->td_map_def_user = entry;
2982	}
2983}
2984
2985/*
2986 *	vm_map_delete:	[ internal use only ]
2987 *
2988 *	Deallocates the given address range from the target
2989 *	map.
2990 */
2991int
2992vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
2993{
2994	vm_map_entry_t entry;
2995	vm_map_entry_t first_entry;
2996
2997	VM_MAP_ASSERT_LOCKED(map);
2998	if (start == end)
2999		return (KERN_SUCCESS);
3000
3001	/*
3002	 * Find the start of the region, and clip it
3003	 */
3004	if (!vm_map_lookup_entry(map, start, &first_entry))
3005		entry = first_entry->next;
3006	else {
3007		entry = first_entry;
3008		vm_map_clip_start(map, entry, start);
3009	}
3010
3011	/*
3012	 * Step through all entries in this region
3013	 */
3014	while ((entry != &map->header) && (entry->start < end)) {
3015		vm_map_entry_t next;
3016
3017		/*
3018		 * Wait for wiring or unwiring of an entry to complete.
3019		 * Also wait for any system wirings to disappear on
3020		 * user maps.
3021		 */
3022		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
3023		    (vm_map_pmap(map) != kernel_pmap &&
3024		    vm_map_entry_system_wired_count(entry) != 0)) {
3025			unsigned int last_timestamp;
3026			vm_offset_t saved_start;
3027			vm_map_entry_t tmp_entry;
3028
3029			saved_start = entry->start;
3030			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3031			last_timestamp = map->timestamp;
3032			(void) vm_map_unlock_and_wait(map, 0);
3033			vm_map_lock(map);
3034			if (last_timestamp + 1 != map->timestamp) {
3035				/*
3036				 * Look again for the entry because the map was
3037				 * modified while it was unlocked.
3038				 * Specifically, the entry may have been
3039				 * clipped, merged, or deleted.
3040				 */
3041				if (!vm_map_lookup_entry(map, saved_start,
3042							 &tmp_entry))
3043					entry = tmp_entry->next;
3044				else {
3045					entry = tmp_entry;
3046					vm_map_clip_start(map, entry,
3047							  saved_start);
3048				}
3049			}
3050			continue;
3051		}
3052		vm_map_clip_end(map, entry, end);
3053
3054		next = entry->next;
3055
3056		/*
3057		 * Unwire before removing addresses from the pmap; otherwise,
3058		 * unwiring will put the entries back in the pmap.
3059		 */
3060		if (entry->wired_count != 0) {
3061			vm_map_entry_unwire(map, entry);
3062		}
3063
3064		pmap_remove(map->pmap, entry->start, entry->end);
3065
3066		/*
3067		 * Delete the entry only after removing all pmap
3068		 * entries pointing to its pages.  (Otherwise, its
3069		 * page frames may be reallocated, and any modify bits
3070		 * will be set in the wrong object!)
3071		 */
3072		vm_map_entry_delete(map, entry);
3073		entry = next;
3074	}
3075	return (KERN_SUCCESS);
3076}
3077
3078/*
3079 *	vm_map_remove:
3080 *
3081 *	Remove the given address range from the target map.
3082 *	This is the exported form of vm_map_delete.
3083 */
3084int
3085vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
3086{
3087	int result;
3088
3089	vm_map_lock(map);
3090	VM_MAP_RANGE_CHECK(map, start, end);
3091	result = vm_map_delete(map, start, end);
3092	vm_map_unlock(map);
3093	return (result);
3094}
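
/*
 * A minimal usage sketch with hypothetical bounds: tear down a mapping
 * created earlier, for example in the error path after a successful
 * vm_map_find().  Locking and the range check are handled internally.
 *
 *	(void)vm_map_remove(map, start, start + size);
 */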
3095
3096/*
3097 *	vm_map_check_protection:
3098 *
3099 *	Assert that the target map allows the specified privilege on the
3100 *	entire address region given.  The entire region must be allocated.
3101 *
3102 *	WARNING!  This code does not and should not check whether the
3103 *	contents of the region are accessible.  For example, a smaller file
3104 *	might be mapped into a larger address space.
3105 *
3106 *	NOTE!  This code is also called by munmap().
3107 *
3108 *	The map must be locked.  A read lock is sufficient.
3109 */
3110boolean_t
3111vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
3112			vm_prot_t protection)
3113{
3114	vm_map_entry_t entry;
3115	vm_map_entry_t tmp_entry;
3116
3117	if (!vm_map_lookup_entry(map, start, &tmp_entry))
3118		return (FALSE);
3119	entry = tmp_entry;
3120
3121	while (start < end) {
3122		if (entry == &map->header)
3123			return (FALSE);
3124		/*
3125		 * No holes allowed!
3126		 */
3127		if (start < entry->start)
3128			return (FALSE);
3129		/*
3130		 * Check protection associated with entry.
3131		 */
3132		if ((entry->protection & protection) != protection)
3133			return (FALSE);
3134		/* go to next entry */
3135		start = entry->end;
3136		entry = entry->next;
3137	}
3138	return (TRUE);
3139}
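
/*
 * A minimal usage sketch with hypothetical bounds: verify under a read
 * lock that an entire range is mapped with at least read permission
 * before operating on it.
 *
 *	vm_map_lock_read(map);
 *	ok = vm_map_check_protection(map, start, end, VM_PROT_READ);
 *	vm_map_unlock_read(map);
 */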
3140
3141/*
3142 *	vm_map_copy_entry:
3143 *
3144 *	Copies the contents of the source entry to the destination
3145 *	entry.  The entries *must* be aligned properly.
3146 */
3147static void
3148vm_map_copy_entry(
3149	vm_map_t src_map,
3150	vm_map_t dst_map,
3151	vm_map_entry_t src_entry,
3152	vm_map_entry_t dst_entry,
3153	vm_ooffset_t *fork_charge)
3154{
3155	vm_object_t src_object;
3156	vm_map_entry_t fake_entry;
3157	vm_offset_t size;
3158	struct ucred *cred;
3159	int charged;
3160
3161	VM_MAP_ASSERT_LOCKED(dst_map);
3162
3163	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
3164		return;
3165
3166	if (src_entry->wired_count == 0 ||
3167	    (src_entry->protection & VM_PROT_WRITE) == 0) {
3168		/*
3169		 * If the source entry is marked needs_copy, it is already
3170		 * write-protected.
3171		 */
3172		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
3173		    (src_entry->protection & VM_PROT_WRITE) != 0) {
3174			pmap_protect(src_map->pmap,
3175			    src_entry->start,
3176			    src_entry->end,
3177			    src_entry->protection & ~VM_PROT_WRITE);
3178		}
3179
3180		/*
3181		 * Make a copy of the object.
3182		 */
3183		size = src_entry->end - src_entry->start;
3184		if ((src_object = src_entry->object.vm_object) != NULL) {
3185			VM_OBJECT_WLOCK(src_object);
3186			charged = ENTRY_CHARGED(src_entry);
3187			if (src_object->handle == NULL &&
3188			    (src_object->type == OBJT_DEFAULT ||
3189			    src_object->type == OBJT_SWAP)) {
3190				vm_object_collapse(src_object);
3191				if ((src_object->flags & (OBJ_NOSPLIT |
3192				    OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
3193					vm_object_split(src_entry);
3194					src_object =
3195					    src_entry->object.vm_object;
3196				}
3197			}
3198			vm_object_reference_locked(src_object);
3199			vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
3200			if (src_entry->cred != NULL &&
3201			    !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
3202				KASSERT(src_object->cred == NULL,
3203				    ("OVERCOMMIT: vm_map_copy_entry: cred %p",
3204				     src_object));
3205				src_object->cred = src_entry->cred;
3206				src_object->charge = size;
3207			}
3208			VM_OBJECT_WUNLOCK(src_object);
3209			dst_entry->object.vm_object = src_object;
3210			if (charged) {
3211				cred = curthread->td_ucred;
3212				crhold(cred);
3213				dst_entry->cred = cred;
3214				*fork_charge += size;
3215				if (!(src_entry->eflags &
3216				      MAP_ENTRY_NEEDS_COPY)) {
3217					crhold(cred);
3218					src_entry->cred = cred;
3219					*fork_charge += size;
3220				}
3221			}
3222			src_entry->eflags |= MAP_ENTRY_COW |
3223			    MAP_ENTRY_NEEDS_COPY;
3224			dst_entry->eflags |= MAP_ENTRY_COW |
3225			    MAP_ENTRY_NEEDS_COPY;
3226			dst_entry->offset = src_entry->offset;
3227			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3228				/*
3229				 * MAP_ENTRY_VN_WRITECNT cannot
3230				 * indicate write reference from
3231				 * src_entry, since the entry is
3232				 * marked as needs copy.  Allocate a
3233				 * fake entry that is used to
3234				 * decrement object->un_pager.vnp.writecount
3235				 * at the appropriate time.  Attach
3236				 * fake_entry to the deferred list.
3237				 */
3238				fake_entry = vm_map_entry_create(dst_map);
3239				fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
3240				src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
3241				vm_object_reference(src_object);
3242				fake_entry->object.vm_object = src_object;
3243				fake_entry->start = src_entry->start;
3244				fake_entry->end = src_entry->end;
3245				fake_entry->next = curthread->td_map_def_user;
3246				curthread->td_map_def_user = fake_entry;
3247			}
3248
3249			pmap_copy(dst_map->pmap, src_map->pmap,
3250			    dst_entry->start, dst_entry->end - dst_entry->start,
3251			    src_entry->start);
3252		} else {
3253			dst_entry->object.vm_object = NULL;
3254			dst_entry->offset = 0;
3255			if (src_entry->cred != NULL) {
3256				dst_entry->cred = curthread->td_ucred;
3257				crhold(dst_entry->cred);
3258				*fork_charge += size;
3259			}
3260		}
3261	} else {
3262		/*
3263		 * We don't want to make writeable wired pages copy-on-write.
3264		 * Immediately copy these pages into the new map by simulating
3265		 * page faults.  The new pages are pageable.
3266		 */
3267		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
3268		    fork_charge);
3269	}
3270}
3271
3272/*
3273 * vmspace_map_entry_forked:
3274 * Update the newly-forked vmspace each time a map entry is inherited
3275 * or copied.  The values for vm_dsize and vm_tsize are approximate
3276 * (and mostly-obsolete ideas in the face of mmap(2) et al.)
3277 */
3278static void
3279vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
3280    vm_map_entry_t entry)
3281{
3282	vm_size_t entrysize;
3283	vm_offset_t newend;
3284
3285	if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
3286		return;
3287	entrysize = entry->end - entry->start;
3288	vm2->vm_map.size += entrysize;
3289	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
3290		vm2->vm_ssize += btoc(entrysize);
3291	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
3292	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
3293		newend = MIN(entry->end,
3294		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
3295		vm2->vm_dsize += btoc(newend - entry->start);
3296	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
3297	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
3298		newend = MIN(entry->end,
3299		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
3300		vm2->vm_tsize += btoc(newend - entry->start);
3301	}
3302}
3303
3304/*
3305 * vmspace_fork:
3306 * Create a new process vmspace structure and vm_map
3307 * based on those of an existing process.  The new map
3308 * is based on the old map, according to the inheritance
3309 * values on the regions in that map.
3310 *
3311 * XXX It might be worth coalescing the entries added to the new vmspace.
3312 *
3313 * The source map must not be locked.
3314 */
3315struct vmspace *
3316vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
3317{
3318	struct vmspace *vm2;
3319	vm_map_t new_map, old_map;
3320	vm_map_entry_t new_entry, old_entry;
3321	vm_object_t object;
3322	int locked;
3323	vm_inherit_t inh;
3324
3325	old_map = &vm1->vm_map;
3326	/* Copy immutable fields of vm1 to vm2. */
3327	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
3328	if (vm2 == NULL)
3329		return (NULL);
3330	vm2->vm_taddr = vm1->vm_taddr;
3331	vm2->vm_daddr = vm1->vm_daddr;
3332	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
3333	vm_map_lock(old_map);
3334	if (old_map->busy)
3335		vm_map_wait_busy(old_map);
3336	new_map = &vm2->vm_map;
3337	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
3338	KASSERT(locked, ("vmspace_fork: lock failed"));
3339
3340	old_entry = old_map->header.next;
3341
3342	while (old_entry != &old_map->header) {
3343		if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
3344			panic("vm_map_fork: encountered a submap");
3345
3346		inh = old_entry->inheritance;
3347		if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
3348		    inh != VM_INHERIT_NONE)
3349			inh = VM_INHERIT_COPY;
3350
3351		switch (inh) {
3352		case VM_INHERIT_NONE:
3353			break;
3354
3355		case VM_INHERIT_SHARE:
3356			/*
3357			 * Clone the entry, creating the shared object if necessary.
3358			 */
3359			object = old_entry->object.vm_object;
3360			if (object == NULL) {
3361				object = vm_object_allocate(OBJT_DEFAULT,
3362					atop(old_entry->end - old_entry->start));
3363				old_entry->object.vm_object = object;
3364				old_entry->offset = 0;
3365				if (old_entry->cred != NULL) {
3366					object->cred = old_entry->cred;
3367					object->charge = old_entry->end -
3368					    old_entry->start;
3369					old_entry->cred = NULL;
3370				}
3371			}
3372
3373			/*
3374			 * Add the reference before calling vm_object_shadow
3375			 * to ensure that a shadow object is created.
3376			 */
3377			vm_object_reference(object);
3378			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
3379				vm_object_shadow(&old_entry->object.vm_object,
3380				    &old_entry->offset,
3381				    old_entry->end - old_entry->start);
3382				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
3383				/* Transfer the second reference too. */
3384				vm_object_reference(
3385				    old_entry->object.vm_object);
3386
3387				/*
3388				 * As in vm_map_simplify_entry(), the
3389				 * vnode lock will not be acquired in
3390				 * this call to vm_object_deallocate().
3391				 */
3392				vm_object_deallocate(object);
3393				object = old_entry->object.vm_object;
3394			}
3395			VM_OBJECT_WLOCK(object);
3396			vm_object_clear_flag(object, OBJ_ONEMAPPING);
3397			if (old_entry->cred != NULL) {
3398				KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
3399				object->cred = old_entry->cred;
3400				object->charge = old_entry->end - old_entry->start;
3401				old_entry->cred = NULL;
3402			}
3403
3404			/*
3405			 * Assert the correct state of the vnode
3406			 * v_writecount while the object is locked, so
3407			 * that it does not have to be relocked later
3408			 * just for this assertion.
3409			 */
3410			if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT &&
3411			    object->type == OBJT_VNODE) {
3412				KASSERT(((struct vnode *)object->handle)->
3413				    v_writecount > 0,
3414				    ("vmspace_fork: v_writecount %p", object));
3415				KASSERT(object->un_pager.vnp.writemappings > 0,
3416				    ("vmspace_fork: vnp.writecount %p",
3417				    object));
3418			}
3419			VM_OBJECT_WUNLOCK(object);
3420
3421			/*
3422			 * Clone the entry, referencing the shared object.
3423			 */
3424			new_entry = vm_map_entry_create(new_map);
3425			*new_entry = *old_entry;
3426			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3427			    MAP_ENTRY_IN_TRANSITION);
3428			new_entry->wiring_thread = NULL;
3429			new_entry->wired_count = 0;
3430			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
3431				vnode_pager_update_writecount(object,
3432				    new_entry->start, new_entry->end);
3433			}
3434
3435			/*
3436			 * Insert the entry into the new map -- we know we're
3437			 * inserting at the end of the new map.
3438			 */
3439			vm_map_entry_link(new_map, new_map->header.prev,
3440			    new_entry);
3441			vmspace_map_entry_forked(vm1, vm2, new_entry);
3442
3443			/*
3444			 * Update the physical map
3445			 */
3446			pmap_copy(new_map->pmap, old_map->pmap,
3447			    new_entry->start,
3448			    (old_entry->end - old_entry->start),
3449			    old_entry->start);
3450			break;
3451
3452		case VM_INHERIT_COPY:
3453			/*
3454			 * Clone the entry and link into the map.
3455			 */
3456			new_entry = vm_map_entry_create(new_map);
3457			*new_entry = *old_entry;
3458			/*
3459			 * Copied entry is COW over the old object.
3460			 */
3461			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
3462			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
3463			new_entry->wiring_thread = NULL;
3464			new_entry->wired_count = 0;
3465			new_entry->object.vm_object = NULL;
3466			new_entry->cred = NULL;
3467			vm_map_entry_link(new_map, new_map->header.prev,
3468			    new_entry);
3469			vmspace_map_entry_forked(vm1, vm2, new_entry);
3470			vm_map_copy_entry(old_map, new_map, old_entry,
3471			    new_entry, fork_charge);
3472			break;
3473
3474		case VM_INHERIT_ZERO:
3475			/*
3476			 * Create a new anonymous mapping entry modeled on
3477			 * the old one.
3478			 */
3479			new_entry = vm_map_entry_create(new_map);
3480			memset(new_entry, 0, sizeof(*new_entry));
3481
3482			new_entry->start = old_entry->start;
3483			new_entry->end = old_entry->end;
3484			new_entry->eflags = old_entry->eflags &
3485			    ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
3486			    MAP_ENTRY_VN_WRITECNT);
3487			new_entry->protection = old_entry->protection;
3488			new_entry->max_protection = old_entry->max_protection;
3489			new_entry->inheritance = VM_INHERIT_ZERO;
3490
3491			vm_map_entry_link(new_map, new_map->header.prev,
3492			    new_entry);
3493			vmspace_map_entry_forked(vm1, vm2, new_entry);
3494
3495			new_entry->cred = curthread->td_ucred;
3496			crhold(new_entry->cred);
3497			*fork_charge += (new_entry->end - new_entry->start);
3498
3499			break;
3500		}
3501		old_entry = old_entry->next;
3502	}
3503	/*
3504	 * Use inlined vm_map_unlock() to postpone handling the deferred
3505	 * map entries, which cannot be done until both old_map and
3506	 * new_map locks are released.
3507	 */
3508	sx_xunlock(&old_map->lock);
3509	sx_xunlock(&new_map->lock);
3510	vm_map_process_deferred();
3511
3512	return (vm2);
3513}
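
/*
 * A minimal usage sketch with hypothetical names: duplicate a parent
 * process's vmspace during fork and collect the swap charge that the
 * copied entries added for the child.
 *
 *	fork_charge = 0;
 *	vm2 = vmspace_fork(p1->p_vmspace, &fork_charge);
 *	if (vm2 == NULL)
 *		return (ENOMEM);
 */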
3514
3515/*
3516 * Create a process's stack for exec_new_vmspace().  This function is never
3517 * asked to wire the newly created stack.
3518 */
3519int
3520vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3521    vm_prot_t prot, vm_prot_t max, int cow)
3522{
3523	vm_size_t growsize, init_ssize;
3524	rlim_t vmemlim;
3525	int rv;
3526
3527	MPASS((map->flags & MAP_WIREFUTURE) == 0);
3528	growsize = sgrowsiz;
3529	init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
3530	vm_map_lock(map);
3531	PROC_LOCK(curproc);
3532	vmemlim = lim_cur(curproc, RLIMIT_VMEM);
3533	PROC_UNLOCK(curproc);
3534	/* If we would blow our VMEM resource limit, no go */
3535	if (map->size + init_ssize > vmemlim) {
3536		rv = KERN_NO_SPACE;
3537		goto out;
3538	}
3539	rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
3540	    max, cow);
3541out:
3542	vm_map_unlock(map);
3543	return (rv);
3544}
3545
3546static int stack_guard_page = 1;
3547SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
3548    &stack_guard_page, 0,
3549    "Specifies the number of guard pages for a stack that grows");
3550
3551static int
3552vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
3553    vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
3554{
3555	vm_map_entry_t new_entry, prev_entry;
3556	vm_offset_t bot, gap_bot, gap_top, top;
3557	vm_size_t init_ssize, sgp;
3558	int orient, rv;
3559
3560	/*
3561	 * The stack orientation is piggybacked with the cow argument.
3562	 * Extract it into orient and mask the cow argument so that we
3563	 * don't pass it around further.
3564	 */
3565	orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
3566	KASSERT(orient != 0, ("No stack grow direction"));
3567	KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
3568	    ("bi-dir stack"));
3569
3570	if (addrbos < vm_map_min(map) ||
3571	    addrbos + max_ssize > vm_map_max(map) ||
3572	    addrbos + max_ssize <= addrbos)
3573		return (KERN_INVALID_ADDRESS);
3574	sgp = (vm_size_t)stack_guard_page * PAGE_SIZE;
3575	if (sgp >= max_ssize)
3576		return (KERN_INVALID_ARGUMENT);
3577
3578	init_ssize = growsize;
3579	if (max_ssize < init_ssize + sgp)
3580		init_ssize = max_ssize - sgp;
3581
3582	/* If addr is already mapped, no go */
3583	if (vm_map_lookup_entry(map, addrbos, &prev_entry))
3584		return (KERN_NO_SPACE);
3585
3586	/*
3587	 * If we can't accommodate max_ssize in the current mapping, no go.
3588	 */
3589	if ((prev_entry->next != &map->header) &&
3590	    (prev_entry->next->start < addrbos + max_ssize))
3591		return (KERN_NO_SPACE);
3592
3593	/*
3594	 * We initially map a stack of only init_ssize.  We will grow as
3595	 * needed later.  Depending on the orientation of the stack (i.e.
3596	 * the grow direction) we either map at the top of the range, the
3597	 * bottom of the range or in the middle.
3598	 *
3599	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
3600	 * and cow to be 0.  Possibly we should eliminate these as input
3601	 * parameters, and just pass these values here in the insert call.
3602	 */
3603	if (orient == MAP_STACK_GROWS_DOWN) {
3604		bot = addrbos + max_ssize - init_ssize;
3605		top = bot + init_ssize;
3606		gap_bot = addrbos;
3607		gap_top = bot;
3608	} else /* if (orient == MAP_STACK_GROWS_UP) */ {
3609		bot = addrbos;
3610		top = bot + init_ssize;
3611		gap_bot = top;
3612		gap_top = addrbos + max_ssize;
3613	}
3614	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
3615	if (rv != KERN_SUCCESS)
3616		return (rv);
3617	new_entry = prev_entry->next;
3618	KASSERT(new_entry->end == top || new_entry->start == bot,
3619	    ("Bad entry start/end for new stack entry"));
3620	KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
3621	    (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
3622	    ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
3623	KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
3624	    (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
3625	    ("new entry lacks MAP_ENTRY_GROWS_UP"));
3626	rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
3627	    VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
3628	    MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
3629	if (rv != KERN_SUCCESS)
3630		(void)vm_map_delete(map, bot, top);
3631	return (rv);
3632}
3633
3634/*
3635 * Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if we
3636 * successfully grow the stack.
3637 */
3638static int
3639vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
3640{
3641	vm_map_entry_t stack_entry;
3642	struct proc *p;
3643	struct vmspace *vm;
3644	struct ucred *cred;
3645	vm_offset_t gap_end, gap_start, grow_start;
3646	size_t grow_amount, guard, max_grow;
3647	rlim_t lmemlim, stacklim, vmemlim;
3648	int rv, rv1;
3649	bool gap_deleted, grow_down, is_procstack;
3650#ifdef notyet
3651	uint64_t limit;
3652#endif
3653#ifdef RACCT
3654	int error;
3655#endif
3656
3657	p = curproc;
3658	vm = p->p_vmspace;
3659
3660	/*
3661	 * Disallow stack growth when the access is performed by a
3662	 * debugger or AIO daemon.  The reason is that the wrong
3663	 * resource limits are applied.
3664	 */
3665	if (map != &p->p_vmspace->vm_map || p->p_textvp == NULL)
3666		return (KERN_FAILURE);
3667
3668	MPASS(!map->system_map);
3669
3670	guard = stack_guard_page * PAGE_SIZE;
3671	PROC_LOCK(p);
3672	lmemlim = lim_cur(p, RLIMIT_MEMLOCK);
3673	stacklim = lim_cur(p, RLIMIT_STACK);
3674	vmemlim = lim_cur(p, RLIMIT_VMEM);
3675	PROC_UNLOCK(p);
3676retry:
3677	/* If addr is not in a hole for a stack grow area, no need to grow. */
3678	if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
3679		return (KERN_FAILURE);
3680	if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
3681		return (KERN_SUCCESS);
3682	if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
3683		stack_entry = gap_entry->next;
3684		if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
3685		    stack_entry->start != gap_entry->end)
3686			return (KERN_FAILURE);
3687		grow_amount = round_page(stack_entry->start - addr);
3688		grow_down = true;
3689	} else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
3690		stack_entry = gap_entry->prev;
3691		if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
3692		    stack_entry->end != gap_entry->start)
3693			return (KERN_FAILURE);
3694		grow_amount = round_page(addr + 1 - stack_entry->end);
3695		grow_down = false;
3696	} else {
3697		return (KERN_FAILURE);
3698	}
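	/*
	 * The gap entry bounds how far the stack may grow; the guard pages
	 * at the far end of the gap are never part of the growable range.
	 */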
3699	max_grow = gap_entry->end - gap_entry->start;
3700	if (guard > max_grow)
3701		return (KERN_NO_SPACE);
3702	max_grow -= guard;
3703	if (grow_amount > max_grow)
3704		return (KERN_NO_SPACE);
3705
3706	/*
3707	 * If this is the main process stack, see if we're over the stack
3708	 * limit.
3709	 */
3710	is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
3711	    addr < (vm_offset_t)p->p_sysent->sv_usrstack;
3712	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
3713		return (KERN_NO_SPACE);
3714
3715#ifdef RACCT
3716	if (racct_enable) {
3717		PROC_LOCK(p);
3718		if (is_procstack && racct_set(p, RACCT_STACK,
3719		    ctob(vm->vm_ssize) + grow_amount)) {
3720			PROC_UNLOCK(p);
3721			return (KERN_NO_SPACE);
3722		}
3723		PROC_UNLOCK(p);
3724	}
3725#endif
3726
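	/*
	 * Round the requested growth up to the growth increment, then
	 * clamp it to the available gap and to the stack resource limit.
	 */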
3727	grow_amount = roundup(grow_amount, sgrowsiz);
3728	if (grow_amount > max_grow)
3729		grow_amount = max_grow;
3730	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
3731		grow_amount = trunc_page((vm_size_t)stacklim) -
3732		    ctob(vm->vm_ssize);
3733	}
3734
3735#ifdef notyet
3736	PROC_LOCK(p);
3737	limit = racct_get_available(p, RACCT_STACK);
3738	PROC_UNLOCK(p);
3739	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
3740		grow_amount = limit - ctob(vm->vm_ssize);
3741#endif
3742
3743	if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
3744		if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
3745			rv = KERN_NO_SPACE;
3746			goto out;
3747		}
3748#ifdef RACCT
3749		if (racct_enable) {
3750			PROC_LOCK(p);
3751			if (racct_set(p, RACCT_MEMLOCK,
3752			    ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
3753				PROC_UNLOCK(p);
3754				rv = KERN_NO_SPACE;
3755				goto out;
3756			}
3757			PROC_UNLOCK(p);
3758		}
3759#endif
3760	}
3761
3762	/* If we would blow our VMEM resource limit, no go */
3763	if (map->size + grow_amount > vmemlim) {
3764		rv = KERN_NO_SPACE;
3765		goto out;
3766	}
3767#ifdef RACCT
3768	if (racct_enable) {
3769		PROC_LOCK(p);
3770		if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
3771			PROC_UNLOCK(p);
3772			rv = KERN_NO_SPACE;
3773			goto out;
3774		}
3775		PROC_UNLOCK(p);
3776	}
3777#endif
3778
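	/*
	 * Upgrade to an exclusive map lock before modifying the map.  If
	 * the upgrade fails, the map was unlocked in the meantime, so
	 * retake the read lock and revalidate the gap entry from scratch.
	 */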
3779	if (vm_map_lock_upgrade(map)) {
3780		gap_entry = NULL;
3781		vm_map_lock_read(map);
3782		goto retry;
3783	}
3784
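	/*
	 * Growing down: take the new pages from the top of the gap entry,
	 * deleting it when fully consumed, and insert a new stack entry
	 * immediately below the existing one.
	 */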
3785	if (grow_down) {
3786		grow_start = gap_entry->end - grow_amount;
3787		if (gap_entry->start + grow_amount == gap_entry->end) {
3788			gap_start = gap_entry->start;
3789			gap_end = gap_entry->end;
3790			vm_map_entry_delete(map, gap_entry);
3791			gap_deleted = true;
3792		} else {
3793			MPASS(gap_entry->start < gap_entry->end - grow_amount);
3794			gap_entry->end -= grow_amount;
3795			vm_map_entry_resize_free(map, gap_entry);
3796			gap_deleted = false;
3797		}
3798		rv = vm_map_insert(map, NULL, 0, grow_start,
3799		    grow_start + grow_amount,
3800		    stack_entry->protection, stack_entry->max_protection,
3801		    MAP_STACK_GROWS_DOWN);
3802		if (rv != KERN_SUCCESS) {
3803			if (gap_deleted) {
3804				rv1 = vm_map_insert(map, NULL, 0, gap_start,
3805				    gap_end, VM_PROT_NONE, VM_PROT_NONE,
3806				    MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
3807				MPASS(rv1 == KERN_SUCCESS);
3808			} else {
3809				gap_entry->end += grow_amount;
3810				vm_map_entry_resize_free(map, gap_entry);
3811			}
3812		}
3813	} else {
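		/*
		 * Growing up: extend the existing stack entry in place by
		 * coalescing its backing object, consuming the gap entry
		 * from the bottom.
		 */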
3814		grow_start = stack_entry->end;
3815		cred = stack_entry->cred;
3816		if (cred == NULL && stack_entry->object.vm_object != NULL)
3817			cred = stack_entry->object.vm_object->cred;
3818		if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
3819			rv = KERN_NO_SPACE;
3820		/* Grow the underlying object if applicable. */
3821		else if (stack_entry->object.vm_object == NULL ||
3822		    vm_object_coalesce(stack_entry->object.vm_object,
3823		    stack_entry->offset,
3824		    (vm_size_t)(stack_entry->end - stack_entry->start),
3825		    (vm_size_t)grow_amount, cred != NULL)) {
3826			if (gap_entry->start + grow_amount == gap_entry->end)
3827				vm_map_entry_delete(map, gap_entry);
3828			else
3829				gap_entry->start += grow_amount;
3830			stack_entry->end += grow_amount;
3831			map->size += grow_amount;
3832			vm_map_entry_resize_free(map, stack_entry);
3833			rv = KERN_SUCCESS;
3834		} else
3835			rv = KERN_FAILURE;
3836	}
3837	if (rv == KERN_SUCCESS && is_procstack)
3838		vm->vm_ssize += btoc(grow_amount);
3839
3840	/*
3841	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
3842	 */
3843	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
3844		vm_map_unlock(map);
3845		vm_map_wire(map, grow_start, grow_start + grow_amount,
3846		    (p->p_flag & P_SYSTEM)
3847		    ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
3848		    : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
3849		vm_map_lock_read(map);
3850	} else
3851		vm_map_lock_downgrade(map);
3852
3853out:
3854#ifdef RACCT
3855	if (racct_enable && rv != KERN_SUCCESS) {
3856		PROC_LOCK(p);
3857		error = racct_set(p, RACCT_VMEM, map->size);
3858		KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
3859		if (!old_mlock) {
3860			error = racct_set(p, RACCT_MEMLOCK,
3861			    ptoa(pmap_wired_count(map->pmap)));
3862			KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
3863		}
3864		error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
3865		KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
3866		PROC_UNLOCK(p);
3867	}
3868#endif
3869
3870	return (rv);
3871}
3872
3873/*
3874 * Unshare the specified VM space for exec.  If other processes are
3875 * mapped to it, then create a new one.  The new vmspace starts out empty.
3876 */
3877int
3878vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
3879{
3880	struct vmspace *oldvmspace = p->p_vmspace;
3881	struct vmspace *newvmspace;
3882
3883	KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
3884	    ("vmspace_exec recursed"));
3885	newvmspace = vmspace_alloc(minuser, maxuser, NULL);
3886	if (newvmspace == NULL)
3887		return (ENOMEM);
3888	newvmspace->vm_swrss = oldvmspace->vm_swrss;
3889	/*
3890	 * This code is written like this for prototype purposes.  The
3891	 * goal is to avoid running down the vmspace here, but to let the
3892	 * other processes that are still using the vmspace run it down
3893	 * eventually.  Even though there is little or no chance of blocking
3894	 * here, it is a good idea to keep this form for future mods.
3895	 */
3896	PROC_VMSPACE_LOCK(p);
3897	p->p_vmspace = newvmspace;
3898	PROC_VMSPACE_UNLOCK(p);
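	/*
	 * Switching p_vmspace changes the pmap; if this is the current
	 * thread's process, activate the new pmap immediately so that the
	 * hardware context matches the new address space.
	 */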
3899	if (p == curthread->td_proc)
3900		pmap_activate(curthread);
3901	curthread->td_pflags |= TDP_EXECVMSPC;
3902	return (0);
3903}
3904
3905/*
3906 * Unshare the specified VM space for forcing COW.  This
3907 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
3908 */
3909int
3910vmspace_unshare(struct proc *p)
3911{
3912	struct vmspace *oldvmspace = p->p_vmspace;
3913	struct vmspace *newvmspace;
3914	vm_ooffset_t fork_charge;
3915
3916	if (oldvmspace->vm_refcnt == 1)
3917		return (0);
3918	fork_charge = 0;
3919	newvmspace = vmspace_fork(oldvmspace, &fork_charge);
3920	if (newvmspace == NULL)
3921		return (ENOMEM);
3922	if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
3923		vmspace_free(newvmspace);
3924		return (ENOMEM);
3925	}
3926	PROC_VMSPACE_LOCK(p);
3927	p->p_vmspace = newvmspace;
3928	PROC_VMSPACE_UNLOCK(p);
3929	if (p == curthread->td_proc)
3930		pmap_activate(curthread);
3931	vmspace_free(oldvmspace);
3932	return (0);
3933}
3934
3935/*
3936 *	vm_map_lookup:
3937 *
3938 *	Finds the VM object, offset, and
3939 *	protection for a given virtual address in the
3940 *	specified map, assuming a page fault of the
3941 *	type specified.
3942 *
3943 *	Leaves the map in question locked for read; return
3944 *	values are guaranteed until a vm_map_lookup_done
3945 *	call is performed.  Note that the map argument
3946 *	is in/out; the returned map must be used in
3947 *	the call to vm_map_lookup_done.
3948 *
3949 *	A handle (out_entry) is returned for use in
3950 *	vm_map_lookup_done, to make that fast.
3951 *
3952 *	If a lookup is requested with "write protection"
3953 *	specified, the map may be changed to perform virtual
3954 *	copying operations, although the data referenced will
3955 *	remain the same.
3956 */
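/*
 * Illustrative call sequence (a sketch only; vm_fault() is the real
 * consumer and performs considerably more work between the calls):
 *
 *	rv = vm_map_lookup(&map, vaddr, fault_type, &entry, &object,
 *	    &pindex, &prot, &wired);
 *	if (rv == KERN_SUCCESS) {
 *		... resolve the fault against (object, pindex) ...
 *		vm_map_lookup_done(map, entry);
 *	}
 */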
3957int
3958vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
3959	      vm_offset_t vaddr,
3960	      vm_prot_t fault_typea,
3961	      vm_map_entry_t *out_entry,	/* OUT */
3962	      vm_object_t *object,		/* OUT */
3963	      vm_pindex_t *pindex,		/* OUT */
3964	      vm_prot_t *out_prot,		/* OUT */
3965	      boolean_t *wired)			/* OUT */
3966{
3967	vm_map_entry_t entry;
3968	vm_map_t map = *var_map;
3969	vm_prot_t prot;
3970	vm_prot_t fault_type = fault_typea;
3971	vm_object_t eobject;
3972	vm_size_t size;
3973	struct ucred *cred;
3974
3975RetryLookup:
3976
3977	vm_map_lock_read(map);
3978
3979RetryLookupLocked:
3980	/*
3981	 * Lookup the faulting address.
3982	 */
3983	if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
3984		vm_map_unlock_read(map);
3985		return (KERN_INVALID_ADDRESS);
3986	}
3987
3988	entry = *out_entry;
3989
3990	/*
3991	 * Handle submaps.
3992	 */
3993	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
3994		vm_map_t old_map = map;
3995
3996		*var_map = map = entry->object.sub_map;
3997		vm_map_unlock_read(old_map);
3998		goto RetryLookup;
3999	}
4000
4001	/*
4002	 * Check whether this task is allowed to have this page.
4003	 */
4004	prot = entry->protection;
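	/*
	 * A fault on a stack gap guard entry is a request to grow the
	 * stack: attempt the growth and, if it succeeds, redo the lookup
	 * while still holding the read lock.
	 */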
4005	if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
4006		fault_typea &= ~VM_PROT_FAULT_LOOKUP;
4007		if (prot == VM_PROT_NONE && map != kernel_map &&
4008		    (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4009		    (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
4010		    MAP_ENTRY_STACK_GAP_UP)) != 0 &&
4011		    vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
4012			goto RetryLookupLocked;
4013	}
4014	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4015	if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
4016		vm_map_unlock_read(map);
4017		return (KERN_PROTECTION_FAILURE);
4018	}
4019	KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
4020	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
4021	    (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
4022	    ("entry %p flags %x", entry, entry->eflags));
4023	if ((fault_typea & VM_PROT_COPY) != 0 &&
4024	    (entry->max_protection & VM_PROT_WRITE) == 0 &&
4025	    (entry->eflags & MAP_ENTRY_COW) == 0) {
4026		vm_map_unlock_read(map);
4027		return (KERN_PROTECTION_FAILURE);
4028	}
4029
4030	/*
4031	 * If this page is not pageable, we have to get it for all possible
4032	 * accesses.
4033	 */
4034	*wired = (entry->wired_count != 0);
4035	if (*wired)
4036		fault_type = entry->protection;
4037	size = entry->end - entry->start;
4038	/*
4039	 * If the entry was copy-on-write, shadow it or demote the permissions.
4040	 */
4041	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4042		/*
4043		 * If we want to write the page, we may as well handle that
4044		 * now since we've got the map locked.
4045		 *
4046		 * If we don't need to write the page, we just demote the
4047		 * permissions allowed.
4048		 */
4049		if ((fault_type & VM_PROT_WRITE) != 0 ||
4050		    (fault_typea & VM_PROT_COPY) != 0) {
4051			/*
4052			 * Make a new object, and place it in the object
4053			 * chain.  Note that no new references have appeared
4054			 * -- one just moved from the map to the new
4055			 * object.
4056			 */
4057			if (vm_map_lock_upgrade(map))
4058				goto RetryLookup;
4059
4060			if (entry->cred == NULL) {
4061				/*
4062				 * The debugger owner is charged for
4063				 * the memory.
4064				 */
4065				cred = curthread->td_ucred;
4066				crhold(cred);
4067				if (!swap_reserve_by_cred(size, cred)) {
4068					crfree(cred);
4069					vm_map_unlock(map);
4070					return (KERN_RESOURCE_SHORTAGE);
4071				}
4072				entry->cred = cred;
4073			}
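			/*
			 * Shadow the object so that subsequent writes land
			 * in the new top-level object; the original object
			 * becomes its backing store.
			 */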
4074			vm_object_shadow(&entry->object.vm_object,
4075			    &entry->offset, size);
4076			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4077			eobject = entry->object.vm_object;
4078			if (eobject->cred != NULL) {
4079				/*
4080				 * The object was not shadowed.
4081				 */
4082				swap_release_by_cred(size, entry->cred);
4083				crfree(entry->cred);
4084				entry->cred = NULL;
4085			} else if (entry->cred != NULL) {
4086				VM_OBJECT_WLOCK(eobject);
4087				eobject->cred = entry->cred;
4088				eobject->charge = size;
4089				VM_OBJECT_WUNLOCK(eobject);
4090				entry->cred = NULL;
4091			}
4092
4093			vm_map_lock_downgrade(map);
4094		} else {
4095			/*
4096			 * We're attempting to read a copy-on-write page --
4097			 * don't allow writes.
4098			 */
4099			prot &= ~VM_PROT_WRITE;
4100		}
4101	}
4102
4103	/*
4104	 * Create an object if necessary.
4105	 */
4106	if (entry->object.vm_object == NULL &&
4107	    !map->system_map) {
4108		if (vm_map_lock_upgrade(map))
4109			goto RetryLookup;
4110		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
4111		    atop(size));
4112		entry->offset = 0;
4113		if (entry->cred != NULL) {
4114			VM_OBJECT_WLOCK(entry->object.vm_object);
4115			entry->object.vm_object->cred = entry->cred;
4116			entry->object.vm_object->charge = size;
4117			VM_OBJECT_WUNLOCK(entry->object.vm_object);
4118			entry->cred = NULL;
4119		}
4120		vm_map_lock_downgrade(map);
4121	}
4122
4123	/*
4124	 * Return the object/offset from this entry.  If the entry was
4125	 * copy-on-write or empty, it has been fixed up.
4126	 */
4127	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4128	*object = entry->object.vm_object;
4129
4130	*out_prot = prot;
4131	return (KERN_SUCCESS);
4132}
4133
4134/*
4135 *	vm_map_lookup_locked:
4136 *
4137 *	Lookup the faulting address.  A version of vm_map_lookup that returns
4138 *	KERN_FAILURE instead of blocking on map lock or memory allocation.
4139 */
4140int
4141vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
4142		     vm_offset_t vaddr,
4143		     vm_prot_t fault_typea,
4144		     vm_map_entry_t *out_entry,	/* OUT */
4145		     vm_object_t *object,	/* OUT */
4146		     vm_pindex_t *pindex,	/* OUT */
4147		     vm_prot_t *out_prot,	/* OUT */
4148		     boolean_t *wired)		/* OUT */
4149{
4150	vm_map_entry_t entry;
4151	vm_map_t map = *var_map;
4152	vm_prot_t prot;
4153	vm_prot_t fault_type = fault_typea;
4154
4155	/*
4156	 * Lookup the faulting address.
4157	 */
4158	if (!vm_map_lookup_entry(map, vaddr, out_entry))
4159		return (KERN_INVALID_ADDRESS);
4160
4161	entry = *out_entry;
4162
4163	/*
4164	 * Fail if the entry refers to a submap.
4165	 */
4166	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
4167		return (KERN_FAILURE);
4168
4169	/*
4170	 * Check whether this task is allowed to have this page.
4171	 */
4172	prot = entry->protection;
4173	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
4174	if ((fault_type & prot) != fault_type)
4175		return (KERN_PROTECTION_FAILURE);
4176
4177	/*
4178	 * If this page is not pageable, we have to get it for all possible
4179	 * accesses.
4180	 */
4181	*wired = (entry->wired_count != 0);
4182	if (*wired)
4183		fault_type = entry->protection;
4184
4185	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4186		/*
4187		 * Fail if the entry was copy-on-write for a write fault.
4188		 */
4189		if (fault_type & VM_PROT_WRITE)
4190			return (KERN_FAILURE);
4191		/*
4192		 * We're attempting to read a copy-on-write page --
4193		 * don't allow writes.
4194		 */
4195		prot &= ~VM_PROT_WRITE;
4196	}
4197
4198	/*
4199	 * Fail if an object should be created.
4200	 */
4201	if (entry->object.vm_object == NULL && !map->system_map)
4202		return (KERN_FAILURE);
4203
4204	/*
4205	 * Return the object/offset from this entry.  If the entry was
4206	 * copy-on-write or empty, it has been fixed up.
4207	 */
4208	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
4209	*object = entry->object.vm_object;
4210
4211	*out_prot = prot;
4212	return (KERN_SUCCESS);
4213}
4214
4215/*
4216 *	vm_map_lookup_done:
4217 *
4218 *	Releases locks acquired by a vm_map_lookup
4219 *	(according to the handle returned by that lookup).
4220 */
4221void
4222vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
4223{
4224	/*
4225	 * Unlock the main-level map
4226	 */
4227	vm_map_unlock_read(map);
4228}
4229
4230#include "opt_ddb.h"
4231#ifdef DDB
4232#include <sys/kernel.h>
4233
4234#include <ddb/ddb.h>
4235
4236static void
4237vm_map_print(vm_map_t map)
4238{
4239	vm_map_entry_t entry;
4240
4241	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
4242	    (void *)map,
4243	    (void *)map->pmap, map->nentries, map->timestamp);
4244
4245	db_indent += 2;
4246	for (entry = map->header.next; entry != &map->header;
4247	    entry = entry->next) {
4248		db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
4249		    (void *)entry, (void *)entry->start, (void *)entry->end,
4250		    entry->eflags);
4251		{
4252			static char *inheritance_name[4] =
4253			{"share", "copy", "none", "donate_copy"};
4254
4255			db_iprintf(" prot=%x/%x/%s",
4256			    entry->protection,
4257			    entry->max_protection,
4258			    inheritance_name[(int)(unsigned char)entry->inheritance]);
4259			if (entry->wired_count != 0)
4260				db_printf(", wired");
4261		}
4262		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4263			db_printf(", share=%p, offset=0x%jx\n",
4264			    (void *)entry->object.sub_map,
4265			    (uintmax_t)entry->offset);
4266			if ((entry->prev == &map->header) ||
4267			    (entry->prev->object.sub_map !=
4268				entry->object.sub_map)) {
4269				db_indent += 2;
4270				vm_map_print((vm_map_t)entry->object.sub_map);
4271				db_indent -= 2;
4272			}
4273		} else {
4274			if (entry->cred != NULL)
4275				db_printf(", ruid %d", entry->cred->cr_ruid);
4276			db_printf(", object=%p, offset=0x%jx",
4277			    (void *)entry->object.vm_object,
4278			    (uintmax_t)entry->offset);
4279			if (entry->object.vm_object && entry->object.vm_object->cred)
4280				db_printf(", obj ruid %d charge %jx",
4281				    entry->object.vm_object->cred->cr_ruid,
4282				    (uintmax_t)entry->object.vm_object->charge);
4283			if (entry->eflags & MAP_ENTRY_COW)
4284				db_printf(", copy (%s)",
4285				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
4286			db_printf("\n");
4287
4288			if ((entry->prev == &map->header) ||
4289			    (entry->prev->object.vm_object !=
4290				entry->object.vm_object)) {
4291				db_indent += 2;
4292				vm_object_print((db_expr_t)(intptr_t)
4293						entry->object.vm_object,
4294						0, 0, (char *)0);
4295				db_indent -= 2;
4296			}
4297		}
4298	}
4299	db_indent -= 2;
4300}
4301
4302DB_SHOW_COMMAND(map, map)
4303{
4304
4305	if (!have_addr) {
4306		db_printf("usage: show map <addr>\n");
4307		return;
4308	}
4309	vm_map_print((vm_map_t)addr);
4310}
4311
4312DB_SHOW_COMMAND(procvm, procvm)
4313{
4314	struct proc *p;
4315
4316	if (have_addr) {
4317		p = db_lookup_proc(addr);
4318	} else {
4319		p = curproc;
4320	}
4321
4322	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
4323	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
4324	    (void *)vmspace_pmap(p->p_vmspace));
4325
4326	vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
4327}
4328
4329#endif /* DDB */
4330