/* vm_fault.c revision 270628 */
/*-
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_fault.c	8.4 (Berkeley) 1/12/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	Page fault handling module.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/vm/vm_fault.c 270628 2014-08-25 21:16:57Z kib $");

#include "opt_ktrace.h"
#include "opt_vm.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

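/*
 * Default number of pages to consider for prefaulting behind (PFBAK) and
 * ahead of (PFFOR) the faulting address when vm_fault_prefault() is given
 * no pager clustering information.
 */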
#define PFBAK 4
#define PFFOR 4

static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);

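/*
 * Read-ahead tuning.  VM_FAULT_READ_BEHIND is the per-fault growth increment
 * of the sequential read-ahead window, and VM_FAULT_CACHE_BEHIND is roughly
 * the number of pages touched while that window ramps up to its maximum; it
 * is the distance passed to vm_fault_cache_behind() once the window has
 * reached VM_FAULT_READ_AHEAD_MAX.
 */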
#define	VM_FAULT_READ_BEHIND	8
#define	VM_FAULT_READ_MAX	(1 + VM_FAULT_READ_AHEAD_MAX)
#define	VM_FAULT_NINCR		(VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND)
#define	VM_FAULT_SUM		(VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2)
#define	VM_FAULT_CACHE_BEHIND	(VM_FAULT_READ_BEHIND * VM_FAULT_SUM)

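/*
 * State shared by vm_fault_hold() and its helper routines: the current
 * object/page under examination, the first (top-level) object/page, the
 * map lookup results, and any vnode locked on behalf of the fault.
 */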
struct faultstate {
	vm_page_t m;
	vm_object_t object;
	vm_pindex_t pindex;
	vm_page_t first_m;
	vm_object_t	first_object;
	vm_pindex_t first_pindex;
	vm_map_t map;
	vm_map_entry_t entry;
	int lookup_still_valid;
	struct vnode *vp;
};

static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
	    int faultcount, int reqpage);

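/*
 * Unbusy the current page, move it to the inactive queue, and forget it.
 */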
static inline void
release_page(struct faultstate *fs)
{

	vm_page_xunbusy(fs->m);
	vm_page_lock(fs->m);
	vm_page_deactivate(fs->m);
	vm_page_unlock(fs->m);
	fs->m = NULL;
}

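/*
 * Release the map lookup reference obtained by vm_map_lookup(), if it is
 * still held.
 */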
static inline void
unlock_map(struct faultstate *fs)
{

	if (fs->lookup_still_valid) {
		vm_map_lookup_done(fs->map, fs->entry);
		fs->lookup_still_valid = FALSE;
	}
}

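/*
 * Release the resources recorded in the faultstate: the current object's
 * lock and paging-in-progress count, the first object's page and reference
 * when the fault descended past it, the map lookup, and any locked vnode.
 */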
static void
unlock_and_deallocate(struct faultstate *fs)
{

	vm_object_pip_wakeup(fs->object);
	VM_OBJECT_WUNLOCK(fs->object);
	if (fs->object != fs->first_object) {
		VM_OBJECT_WLOCK(fs->first_object);
		vm_page_lock(fs->first_m);
		vm_page_free(fs->first_m);
		vm_page_unlock(fs->first_m);
		vm_object_pip_wakeup(fs->first_object);
		VM_OBJECT_WUNLOCK(fs->first_object);
		fs->first_m = NULL;
	}
	vm_object_deallocate(fs->first_object);
	unlock_map(fs);
	if (fs->vp != NULL) {
		vput(fs->vp);
		fs->vp = NULL;
	}
}

/*
 * TRYPAGER - used by vm_fault to calculate whether the pager for the
 *	      current object *might* contain the page.
 *
 *	      default objects are zero-fill, there is no real pager.
 */
#define TRYPAGER	(fs.object->type != OBJT_DEFAULT && \
			((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 || wired))

/*
 *	vm_fault:
 *
 *	Handle a page fault occurring at the given address,
 *	requiring the given permissions, in the map specified.
 *	If successful, the page is inserted into the
 *	associated physical map.
 *
 *	NOTE: the given address should be truncated to the
 *	proper page address.
 *
 *	KERN_SUCCESS is returned if the page fault is handled; otherwise,
 *	a standard error specifying why the fault is fatal is returned.
 *
 *	The map in question must be referenced, and remains so.
 *	Caller may hold no locks.
 */
int
vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
    int fault_flags)
{
	struct thread *td;
	int result;

	td = curthread;
	if ((td->td_pflags & TDP_NOFAULTING) != 0)
		return (KERN_PROTECTION_FAILURE);
#ifdef KTRACE
	if (map != kernel_map && KTRPOINT(td, KTR_FAULT))
		ktrfault(vaddr, fault_type);
#endif
	result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags,
	    NULL);
#ifdef KTRACE
	if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND))
		ktrfaultend(result);
#endif
	return (result);
}

int
vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
    int fault_flags, vm_page_t *m_hold)
{
	vm_prot_t prot;
	long ahead, behind;
	int alloc_req, era, faultcount, nera, reqpage, result;
	boolean_t growstack, is_first_object_locked, wired;
	int map_generation;
	vm_object_t next_object;
	vm_page_t marray[VM_FAULT_READ_MAX];
	int hardfault;
	struct faultstate fs;
	struct vnode *vp;
	int locked, error;

	hardfault = 0;
	growstack = TRUE;
	PCPU_INC(cnt.v_vm_faults);
	fs.vp = NULL;
	faultcount = reqpage = 0;

RetryFault:;

	/*
	 * Find the backing store object and offset into it to begin the
	 * search.
	 */
	fs.map = map;
	result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry,
	    &fs.first_object, &fs.first_pindex, &prot, &wired);
	if (result != KERN_SUCCESS) {
		if (growstack && result == KERN_INVALID_ADDRESS &&
		    map != kernel_map) {
			result = vm_map_growstack(curproc, vaddr);
			if (result != KERN_SUCCESS)
				return (KERN_FAILURE);
			growstack = FALSE;
			goto RetryFault;
		}
		return (result);
	}

	map_generation = fs.map->timestamp;

	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
		if ((curthread->td_pflags & TDP_DEVMEMIO) != 0) {
			vm_map_unlock_read(fs.map);
			return (KERN_FAILURE);
		}
		panic("vm_fault: fault on nofault entry, addr: %lx",
		    (u_long)vaddr);
	}

	if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION &&
	    fs.entry->wiring_thread != curthread) {
		vm_map_unlock_read(fs.map);
		vm_map_lock(fs.map);
		if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) &&
		    (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
			fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
			vm_map_unlock_and_wait(fs.map, 0);
		} else
			vm_map_unlock(fs.map);
		goto RetryFault;
	}

	/*
	 * Make a reference to this object to prevent its disposal while we
	 * are messing with it.  Once we have the reference, the map is free
	 * to be diddled.  Since objects reference their shadows (and copies),
	 * they will stay around as well.
	 *
	 * Bump the paging-in-progress count to prevent size changes (e.g.
	 * truncation operations) during I/O.  This must be done after
	 * obtaining the vnode lock in order to avoid possible deadlocks.
	 */
	VM_OBJECT_WLOCK(fs.first_object);
	vm_object_reference_locked(fs.first_object);
	vm_object_pip_add(fs.first_object, 1);

	fs.lookup_still_valid = TRUE;

	if (wired)
		fault_type = prot | (fault_type & VM_PROT_COPY);

	fs.first_m = NULL;

	/*
	 * Search for the page at object/offset.
	 */
	fs.object = fs.first_object;
	fs.pindex = fs.first_pindex;
	while (TRUE) {
		/*
		 * If the object is dead, we stop here
		 */
		if (fs.object->flags & OBJ_DEAD) {
			unlock_and_deallocate(&fs);
			return (KERN_PROTECTION_FAILURE);
		}

		/*
		 * See if page is resident
		 */
		fs.m = vm_page_lookup(fs.object, fs.pindex);
		if (fs.m != NULL) {
			/*
			 * Wait/Retry if the page is busy.  We have to do this
			 * if the page is either exclusive or shared busy
			 * because the vm_pager may be using read busy for
			 * pageouts (and even pageins if it is the vnode
			 * pager), and we could end up trying to pagein and
			 * pageout the same page simultaneously.
			 *
			 * We can theoretically allow the busy case on a read
			 * fault if the page is marked valid, but since such
			 * pages are typically already pmap'd, putting that
			 * special case in might be more effort than it is
			 * worth.  We cannot under any circumstances mess
			 * around with a shared busied page except, perhaps,
			 * to pmap it.
			 */
			if (vm_page_busied(fs.m)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_aflag_set(fs.m, PGA_REFERENCED);
				if (fs.object != fs.first_object) {
					if (!VM_OBJECT_TRYWLOCK(
					    fs.first_object)) {
						VM_OBJECT_WUNLOCK(fs.object);
						VM_OBJECT_WLOCK(fs.first_object);
						VM_OBJECT_WLOCK(fs.object);
					}
					vm_page_lock(fs.first_m);
					vm_page_free(fs.first_m);
					vm_page_unlock(fs.first_m);
					vm_object_pip_wakeup(fs.first_object);
					VM_OBJECT_WUNLOCK(fs.first_object);
					fs.first_m = NULL;
				}
				unlock_map(&fs);
				if (fs.m == vm_page_lookup(fs.object,
				    fs.pindex)) {
					vm_page_sleep_if_busy(fs.m, "vmpfw");
				}
				vm_object_pip_wakeup(fs.object);
				VM_OBJECT_WUNLOCK(fs.object);
				PCPU_INC(cnt.v_intrans);
				vm_object_deallocate(fs.first_object);
				goto RetryFault;
			}
			vm_page_lock(fs.m);
			vm_page_remque(fs.m);
			vm_page_unlock(fs.m);

			/*
			 * Mark page busy for other processes, and the
			 * pagedaemon.  If it still isn't completely valid
			 * (readable), jump to readrest, else break-out ( we
			 * found the page ).
			 */
			vm_page_xbusy(fs.m);
			if (fs.m->valid != VM_PAGE_BITS_ALL)
				goto readrest;
			break;
		}

		/*
		 * Page is not resident.  If this is the search termination
		 * point or the pager might contain the page, allocate a new
		 * page.
		 */
		if (TRYPAGER || fs.object == fs.first_object) {
			if (fs.pindex >= fs.object->size) {
				unlock_and_deallocate(&fs);
				return (KERN_PROTECTION_FAILURE);
			}

			/*
			 * Allocate a new page for this object/offset pair.
			 *
			 * Unlocked read of the p_flag is harmless. At
			 * worst, the P_KILLED might be not observed
			 * there, and allocation can fail, causing
			 * restart and new reading of the p_flag.
			 */
			fs.m = NULL;
			if (!vm_page_count_severe() || P_KILLED(curproc)) {
#if VM_NRESERVLEVEL > 0
				if ((fs.object->flags & OBJ_COLORED) == 0) {
					fs.object->flags |= OBJ_COLORED;
					fs.object->pg_color = atop(vaddr) -
					    fs.pindex;
				}
#endif
				alloc_req = P_KILLED(curproc) ?
				    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
				if (fs.object->type != OBJT_VNODE &&
				    fs.object->backing_object == NULL)
					alloc_req |= VM_ALLOC_ZERO;
				fs.m = vm_page_alloc(fs.object, fs.pindex,
				    alloc_req);
			}
			if (fs.m == NULL) {
				unlock_and_deallocate(&fs);
				VM_WAITPFAULT;
				goto RetryFault;
			} else if (fs.m->valid == VM_PAGE_BITS_ALL)
				break;
		}

readrest:
		/*
		 * We have found a valid page or we have allocated a new page.
		 * The page thus may not be valid or may not be entirely
		 * valid.
		 *
		 * Attempt to fault-in the page if there is a chance that the
		 * pager has it, and potentially fault in additional pages
		 * at the same time.
		 */
		if (TRYPAGER) {
			int rv;
			u_char behavior = vm_map_entry_behavior(fs.entry);

			if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
			    P_KILLED(curproc)) {
				behind = 0;
				ahead = 0;
			} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
				behind = 0;
				ahead = atop(fs.entry->end - vaddr) - 1;
				if (ahead > VM_FAULT_READ_AHEAD_MAX)
					ahead = VM_FAULT_READ_AHEAD_MAX;
				if (fs.pindex == fs.entry->next_read)
					vm_fault_cache_behind(&fs,
					    VM_FAULT_READ_MAX);
			} else {
				/*
				 * If this is a sequential page fault, then
				 * arithmetically increase the number of pages
				 * in the read-ahead window.  Otherwise, reset
				 * the read-ahead window to its smallest size.
				 */
				behind = atop(vaddr - fs.entry->start);
				if (behind > VM_FAULT_READ_BEHIND)
					behind = VM_FAULT_READ_BEHIND;
				ahead = atop(fs.entry->end - vaddr) - 1;
				era = fs.entry->read_ahead;
				if (fs.pindex == fs.entry->next_read) {
					nera = era + behind;
					if (nera > VM_FAULT_READ_AHEAD_MAX)
						nera = VM_FAULT_READ_AHEAD_MAX;
					behind = 0;
					if (ahead > nera)
						ahead = nera;
					if (era == VM_FAULT_READ_AHEAD_MAX)
						vm_fault_cache_behind(&fs,
						    VM_FAULT_CACHE_BEHIND);
				} else if (ahead > VM_FAULT_READ_AHEAD_MIN)
					ahead = VM_FAULT_READ_AHEAD_MIN;
				if (era != ahead)
					fs.entry->read_ahead = ahead;
			}
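
			/*
			 * In the adaptive case above, each successive
			 * sequential fault grows the window by up to
			 * VM_FAULT_READ_BEHIND pages (nera = era + behind),
			 * capped at VM_FAULT_READ_AHEAD_MAX; a non-sequential
			 * fault shrinks it back to VM_FAULT_READ_AHEAD_MIN.
			 * Once the window has reached its maximum,
			 * vm_fault_cache_behind() reclaims pages that lie
			 * well behind the faulting address.
			 */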

			/*
			 * Call the pager to retrieve the data, if any, after
			 * releasing the lock on the map.  We hold a ref on
			 * fs.object and the pages are exclusive busied.
			 */
			unlock_map(&fs);

			if (fs.object->type == OBJT_VNODE) {
				vp = fs.object->handle;
				if (vp == fs.vp)
					goto vnode_locked;
				else if (fs.vp != NULL) {
					vput(fs.vp);
					fs.vp = NULL;
				}
				locked = VOP_ISLOCKED(vp);

				if (locked != LK_EXCLUSIVE)
					locked = LK_SHARED;
				/* Do not sleep for vnode lock while fs.m is busy */
				error = vget(vp, locked | LK_CANRECURSE |
				    LK_NOWAIT, curthread);
				if (error != 0) {
					vhold(vp);
					release_page(&fs);
					unlock_and_deallocate(&fs);
					error = vget(vp, locked | LK_RETRY |
					    LK_CANRECURSE, curthread);
					vdrop(vp);
					fs.vp = vp;
					KASSERT(error == 0,
					    ("vm_fault: vget failed"));
					goto RetryFault;
				}
				fs.vp = vp;
			}
vnode_locked:
			KASSERT(fs.vp == NULL || !fs.map->system_map,
			    ("vm_fault: vnode-backed object mapped by system map"));

			/*
			 * Now we find out if any other pages should be paged
			 * in at this time.  This routine checks to see if the
			 * pages surrounding this fault reside in the same
			 * object as the page for this fault.  If they do,
			 * then they are faulted in as well.  The array
			 * "marray" returned contains the vm_page_t structs,
			 * one of which is the vm_page_t passed to this
			 * routine.  The reqpage return value is the index
			 * into marray of that page.
			 *
			 * fs.m plus the additional pages are exclusive busied.
			 */
			faultcount = vm_fault_additional_pages(
			    fs.m, behind, ahead, marray, &reqpage);

			rv = faultcount ?
			    vm_pager_get_pages(fs.object, marray, faultcount,
				reqpage) : VM_PAGER_FAIL;

			if (rv == VM_PAGER_OK) {
				/*
				 * Found the page. Leave it busy while we play
				 * with it.
				 */

				/*
				 * Relookup in case pager changed page. Pager
				 * is responsible for disposition of old page
				 * if moved.
				 */
				fs.m = vm_page_lookup(fs.object, fs.pindex);
				if (!fs.m) {
					unlock_and_deallocate(&fs);
					goto RetryFault;
				}

				hardfault++;
				break; /* break to PAGE HAS BEEN FOUND */
			}
			/*
			 * Remove the bogus page (which does not exist at this
			 * object/offset); before doing so, we must get back
			 * our object lock to preserve our invariant.
			 *
			 * Also wake up any other process that may want to bring
			 * in this page.
			 *
			 * If this is the top-level object, we must leave the
			 * busy page to prevent another process from rushing
			 * past us, and inserting the page in that object at
			 * the same time that we are.
			 */
			if (rv == VM_PAGER_ERROR)
				printf("vm_fault: pager read error, pid %d (%s)\n",
				    curproc->p_pid, curproc->p_comm);
			/*
			 * Data outside the range of the pager or an I/O error
			 */
			/*
			 * XXX - the check for kernel_map is a kludge to work
			 * around having the machine panic on a kernel space
			 * fault w/ I/O error.
			 */
			if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) ||
				(rv == VM_PAGER_BAD)) {
				vm_page_lock(fs.m);
				vm_page_free(fs.m);
				vm_page_unlock(fs.m);
				fs.m = NULL;
				unlock_and_deallocate(&fs);
				return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE);
			}
			if (fs.object != fs.first_object) {
				vm_page_lock(fs.m);
				vm_page_free(fs.m);
				vm_page_unlock(fs.m);
				fs.m = NULL;
				/*
				 * XXX - we cannot just fall out at this
				 * point, m has been freed and is invalid!
				 */
			}
		}

		/*
		 * We get here if the object has default pager (or unwiring)
		 * or the pager doesn't have the page.
		 */
		if (fs.object == fs.first_object)
			fs.first_m = fs.m;

		/*
		 * Move on to the next object.  Lock the next object before
		 * unlocking the current one.
		 */
		fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset);
		next_object = fs.object->backing_object;
		if (next_object == NULL) {
			/*
			 * If there's no object left, fill the page in the top
			 * object with zeros.
			 */
			if (fs.object != fs.first_object) {
				vm_object_pip_wakeup(fs.object);
				VM_OBJECT_WUNLOCK(fs.object);

				fs.object = fs.first_object;
				fs.pindex = fs.first_pindex;
				fs.m = fs.first_m;
				VM_OBJECT_WLOCK(fs.object);
			}
			fs.first_m = NULL;

			/*
			 * Zero the page if necessary and mark it valid.
			 */
			if ((fs.m->flags & PG_ZERO) == 0) {
				pmap_zero_page(fs.m);
			} else {
				PCPU_INC(cnt.v_ozfod);
			}
			PCPU_INC(cnt.v_zfod);
			fs.m->valid = VM_PAGE_BITS_ALL;
			break;	/* break to PAGE HAS BEEN FOUND */
		} else {
			KASSERT(fs.object != next_object,
			    ("object loop %p", next_object));
			VM_OBJECT_WLOCK(next_object);
			vm_object_pip_add(next_object, 1);
			if (fs.object != fs.first_object)
				vm_object_pip_wakeup(fs.object);
			VM_OBJECT_WUNLOCK(fs.object);
			fs.object = next_object;
		}
	}

	vm_page_assert_xbusied(fs.m);

	/*
	 * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
	 * is held.]
	 */

	/*
	 * If the page is being written, but isn't already owned by the
	 * top-level object, we have to copy it into a new page owned by the
	 * top-level object.
	 */
	if (fs.object != fs.first_object) {
		/*
		 * We only really need to copy if we want to write it.
		 */
		if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
			/*
			 * This allows pages to be virtually copied from a
			 * backing_object into the first_object, where the
			 * backing object has no other refs to it, and cannot
			 * gain any more refs.  Instead of a bcopy, we just
			 * move the page from the backing object to the
			 * first object.  Note that we must mark the page
			 * dirty in the first object so that it will go out
			 * to swap when needed.
			 */
			is_first_object_locked = FALSE;
			if (
				/*
				 * Only one shadow object
				 */
				(fs.object->shadow_count == 1) &&
				/*
				 * No COW refs, except us
				 */
				(fs.object->ref_count == 1) &&
				/*
				 * No one else can look this object up
				 */
				(fs.object->handle == NULL) &&
				/*
				 * No other ways to look the object up
				 */
				((fs.object->type == OBJT_DEFAULT) ||
				 (fs.object->type == OBJT_SWAP)) &&
			    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) &&
				/*
				 * We don't chase down the shadow chain
				 */
			    fs.object == fs.first_object->backing_object) {
				/*
				 * get rid of the unnecessary page
				 */
				vm_page_lock(fs.first_m);
				vm_page_free(fs.first_m);
				vm_page_unlock(fs.first_m);
				/*
				 * grab the page and put it into the
				 * process's object.  The page is
				 * automatically made dirty.
				 */
				if (vm_page_rename(fs.m, fs.first_object,
				    fs.first_pindex)) {
					unlock_and_deallocate(&fs);
					goto RetryFault;
				}
				vm_page_xbusy(fs.m);
				fs.first_m = fs.m;
				fs.m = NULL;
				PCPU_INC(cnt.v_cow_optim);
			} else {
				/*
				 * Oh, well, let's copy it.
				 */
				pmap_copy_page(fs.m, fs.first_m);
				fs.first_m->valid = VM_PAGE_BITS_ALL;
				if (wired && (fault_flags &
				    VM_FAULT_CHANGE_WIRING) == 0) {
					vm_page_lock(fs.first_m);
					vm_page_wire(fs.first_m);
					vm_page_unlock(fs.first_m);

					vm_page_lock(fs.m);
					vm_page_unwire(fs.m, FALSE);
					vm_page_unlock(fs.m);
				}
				/*
				 * We no longer need the old page or object.
				 */
				release_page(&fs);
			}
			/*
			 * fs.object != fs.first_object due to above
			 * conditional
			 */
			vm_object_pip_wakeup(fs.object);
			VM_OBJECT_WUNLOCK(fs.object);
			/*
			 * Only use the new page below...
			 */
			fs.object = fs.first_object;
			fs.pindex = fs.first_pindex;
			fs.m = fs.first_m;
			if (!is_first_object_locked)
				VM_OBJECT_WLOCK(fs.object);
			PCPU_INC(cnt.v_cow_faults);
			curthread->td_cow++;
		} else {
			prot &= ~VM_PROT_WRITE;
		}
	}

	/*
	 * We must verify that the maps have not changed since our last
	 * lookup.
	 */
	if (!fs.lookup_still_valid) {
		vm_object_t retry_object;
		vm_pindex_t retry_pindex;
		vm_prot_t retry_prot;

		if (!vm_map_trylock_read(fs.map)) {
			release_page(&fs);
			unlock_and_deallocate(&fs);
			goto RetryFault;
		}
		fs.lookup_still_valid = TRUE;
		if (fs.map->timestamp != map_generation) {
			result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);

			/*
			 * If we don't need the page any longer, put it on the inactive
			 * list (the easiest thing to do here).  If no one needs it,
			 * pageout will grab it eventually.
			 */
			if (result != KERN_SUCCESS) {
				release_page(&fs);
				unlock_and_deallocate(&fs);

				/*
				 * If retry of map lookup would have blocked then
				 * retry fault from start.
				 */
				if (result == KERN_FAILURE)
					goto RetryFault;
				return (result);
			}
			if ((retry_object != fs.first_object) ||
			    (retry_pindex != fs.first_pindex)) {
				release_page(&fs);
				unlock_and_deallocate(&fs);
				goto RetryFault;
			}

			/*
			 * Check whether the protection has changed or the object has
			 * been copied while we left the map unlocked. Changing from
			 * read to write permission is OK - we leave the page
			 * write-protected, and catch the write fault. Changing from
			 * write to read permission means that we can't mark the page
			 * write-enabled after all.
			 */
			prot &= retry_prot;
		}
	}
	/*
	 * If the page was filled by a pager, update the map entry's
	 * last read offset.  Since the pager does not return the
	 * actual set of pages that it read, this update is based on
	 * the requested set.  Typically, the requested and actual
	 * sets are the same.
	 *
	 * XXX The following assignment modifies the map
	 * without holding a write lock on it.
	 */
	if (hardfault)
		fs.entry->next_read = fs.pindex + faultcount - reqpage;

	if (((prot & VM_PROT_WRITE) != 0 ||
	    (fault_flags & VM_FAULT_DIRTY) != 0) &&
	    (fs.m->oflags & VPO_UNMANAGED) == 0) {
		vm_object_set_writeable_dirty(fs.object);

		/*
		 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
		 * if the page is already dirty to prevent data written with
		 * the expectation of being synced from not being synced.
		 * Likewise if this entry does not request NOSYNC then make
		 * sure the page isn't marked NOSYNC.  Applications sharing
		 * data should use the same flags to avoid ping ponging.
		 */
		if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
			if (fs.m->dirty == 0)
				fs.m->oflags |= VPO_NOSYNC;
		} else {
			fs.m->oflags &= ~VPO_NOSYNC;
		}

		/*
		 * If the fault is a write, we know that this page is being
		 * written NOW so dirty it explicitly to save on
		 * pmap_is_modified() calls later.
		 *
		 * Also tell the backing pager, if any, that it should remove
		 * any swap backing since the page is now dirty.
		 */
		if (((fault_type & VM_PROT_WRITE) != 0 &&
		    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
		    (fault_flags & VM_FAULT_DIRTY) != 0) {
			vm_page_dirty(fs.m);
			vm_pager_page_unswapped(fs.m);
		}
	}

	vm_page_assert_xbusied(fs.m);

	/*
	 * Page must be completely valid or it is not fit to
	 * map into user space.  vm_pager_get_pages() ensures this.
	 */
	KASSERT(fs.m->valid == VM_PAGE_BITS_ALL,
	    ("vm_fault: page %p partially invalid", fs.m));
	VM_OBJECT_WUNLOCK(fs.object);

	/*
	 * Put this page into the physical map.  We had to do the unlock above
	 * because pmap_enter() may sleep.  We don't put the page
	 * back on the active queue until later so that the pageout daemon
	 * won't find it (yet).
	 */
	pmap_enter(fs.map->pmap, vaddr, fs.m, prot,
	    fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0);
	if ((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 && wired == 0)
		vm_fault_prefault(&fs, vaddr, faultcount, reqpage);
	VM_OBJECT_WLOCK(fs.object);
	vm_page_lock(fs.m);

	/*
	 * If the page is not wired down, then put it where the pageout daemon
	 * can find it.
	 */
	if (fault_flags & VM_FAULT_CHANGE_WIRING) {
		if (wired)
			vm_page_wire(fs.m);
		else
			vm_page_unwire(fs.m, 1);
	} else
		vm_page_activate(fs.m);
	if (m_hold != NULL) {
		*m_hold = fs.m;
		vm_page_hold(fs.m);
	}
	vm_page_unlock(fs.m);
	vm_page_xunbusy(fs.m);

	/*
	 * Unlock everything, and return
	 */
	unlock_and_deallocate(&fs);
	if (hardfault) {
		PCPU_INC(cnt.v_io_faults);
		curthread->td_ru.ru_majflt++;
	} else
		curthread->td_ru.ru_minflt++;

	return (KERN_SUCCESS);
}

/*
 * Speed up the reclamation of up to "distance" pages that precede the
 * faulting pindex within the first object of the shadow chain.
 */
static void
vm_fault_cache_behind(const struct faultstate *fs, int distance)
{
	vm_object_t first_object, object;
	vm_page_t m, m_prev;
	vm_pindex_t pindex;

	object = fs->object;
	VM_OBJECT_ASSERT_WLOCKED(object);
	first_object = fs->first_object;
	if (first_object != object) {
		if (!VM_OBJECT_TRYWLOCK(first_object)) {
			VM_OBJECT_WUNLOCK(object);
			VM_OBJECT_WLOCK(first_object);
			VM_OBJECT_WLOCK(object);
		}
	}
	/* Neither fictitious nor unmanaged pages can be cached. */
	if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
		if (fs->first_pindex < distance)
			pindex = 0;
		else
			pindex = fs->first_pindex - distance;
		if (pindex < OFF_TO_IDX(fs->entry->offset))
			pindex = OFF_TO_IDX(fs->entry->offset);
		m = first_object != object ? fs->first_m : fs->m;
		vm_page_assert_xbusied(m);
		m_prev = vm_page_prev(m);
		while ((m = m_prev) != NULL && m->pindex >= pindex &&
		    m->valid == VM_PAGE_BITS_ALL) {
			m_prev = vm_page_prev(m);
			if (vm_page_busied(m))
				continue;
			vm_page_lock(m);
			if (m->hold_count == 0 && m->wire_count == 0) {
				pmap_remove_all(m);
				vm_page_aflag_clear(m, PGA_REFERENCED);
				if (m->dirty != 0)
					vm_page_deactivate(m);
				else
					vm_page_cache(m);
			}
			vm_page_unlock(m);
		}
	}
	if (first_object != object)
		VM_OBJECT_WUNLOCK(first_object);
}

/*
 * vm_fault_prefault provides a quick way of clustering
 * pagefaults into a process's address space.  It is a "cousin"
 * of vm_map_pmap_enter, except it runs at page fault time instead
 * of mmap time.
 */
static void
vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
    int faultcount, int reqpage)
{
	pmap_t pmap;
	vm_map_entry_t entry;
	vm_object_t backing_object, lobject;
	vm_offset_t addr, starta;
	vm_pindex_t pindex;
	vm_page_t m;
	int backward, forward, i;

	pmap = fs->map->pmap;
	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
		return;

	if (faultcount > 0) {
		backward = reqpage;
		forward = faultcount - reqpage - 1;
	} else {
		backward = PFBAK;
		forward = PFFOR;
	}
	entry = fs->entry;

	starta = addra - backward * PAGE_SIZE;
	if (starta < entry->start) {
		starta = entry->start;
	} else if (starta > addra) {
		starta = 0;
	}

	/*
	 * Generate the sequence of virtual addresses that are candidates for
	 * prefaulting in an outward spiral from the faulting virtual address,
	 * "addra".  Specifically, the sequence is "addra - PAGE_SIZE", "addra
	 * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
	 * If the candidate address doesn't have a backing physical page, then
	 * the loop immediately terminates.
	 */
	for (i = 0; i < 2 * imax(backward, forward); i++) {
		addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
		    PAGE_SIZE);
		if (addr > addra + forward * PAGE_SIZE)
			addr = 0;

		if (addr < starta || addr >= entry->end)
			continue;

		if (!pmap_is_prefaultable(pmap, addr))
			continue;

		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
		lobject = entry->object.vm_object;
		VM_OBJECT_RLOCK(lobject);
		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
		    lobject->type == OBJT_DEFAULT &&
		    (backing_object = lobject->backing_object) != NULL) {
			KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
			    0, ("vm_fault_prefault: unaligned object offset"));
			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
			VM_OBJECT_RLOCK(backing_object);
			VM_OBJECT_RUNLOCK(lobject);
			lobject = backing_object;
		}
		if (m == NULL) {
			VM_OBJECT_RUNLOCK(lobject);
			break;
		}
		if (m->valid == VM_PAGE_BITS_ALL &&
		    (m->flags & PG_FICTITIOUS) == 0)
			pmap_enter_quick(pmap, addr, m, entry->protection);
		VM_OBJECT_RUNLOCK(lobject);
	}
}

/*
 * Hold each of the physical pages that are mapped by the specified range of
 * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
 * and allow the specified types of access, "prot".  If all of the implied
 * pages are successfully held, then the number of held pages is returned
 * together with pointers to those pages in the array "ma".  However, if any
 * of the pages cannot be held, -1 is returned.
 */
int
vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
    vm_prot_t prot, vm_page_t *ma, int max_count)
{
	vm_offset_t end, va;
	vm_page_t *mp;
	int count;
	boolean_t pmap_failed;

	if (len == 0)
		return (0);
	end = round_page(addr + len);
	addr = trunc_page(addr);

	/*
	 * Check for illegal addresses.
	 */
	if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map))
		return (-1);

	if (atop(end - addr) > max_count)
		panic("vm_fault_quick_hold_pages: count > max_count");
	count = atop(end - addr);

	/*
	 * Most likely, the physical pages are resident in the pmap, so it is
	 * faster to try pmap_extract_and_hold() first.
	 */
	pmap_failed = FALSE;
	for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) {
		*mp = pmap_extract_and_hold(map->pmap, va, prot);
		if (*mp == NULL)
			pmap_failed = TRUE;
		else if ((prot & VM_PROT_WRITE) != 0 &&
		    (*mp)->dirty != VM_PAGE_BITS_ALL) {
			/*
			 * Explicitly dirty the physical page.  Otherwise, the
			 * caller's changes may go unnoticed because they are
			 * performed through an unmanaged mapping or by a DMA
			 * operation.
			 *
			 * The object lock is not held here.
			 * See vm_page_clear_dirty_mask().
			 */
			vm_page_dirty(*mp);
		}
	}
	if (pmap_failed) {
		/*
		 * One or more pages could not be held by the pmap.  Either no
		 * page was mapped at the specified virtual address or that
		 * mapping had insufficient permissions.  Attempt to fault in
		 * and hold these pages.
		 */
		for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
			if (*mp == NULL && vm_fault_hold(map, va, prot,
			    VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
				goto error;
	}
	return (count);
error:
	for (mp = ma; mp < ma + count; mp++)
		if (*mp != NULL) {
			vm_page_lock(*mp);
			vm_page_unhold(*mp);
			vm_page_unlock(*mp);
		}
	return (-1);
}

/*
 *	vm_fault_wire:
 *
 *	Wire down a range of virtual addresses in a map.
 */
int
vm_fault_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
    boolean_t fictitious)
{
	vm_offset_t va;
	int rv;

	/*
	 * We simulate a fault to get the page and enter it in the physical
	 * map.  For user wiring, we only ask for read access on currently
	 * read-only sections.
	 */
	for (va = start; va < end; va += PAGE_SIZE) {
		rv = vm_fault(map, va, VM_PROT_NONE, VM_FAULT_CHANGE_WIRING);
		if (rv) {
			if (va != start)
				vm_fault_unwire(map, start, va, fictitious);
			return (rv);
		}
	}
	return (KERN_SUCCESS);
}

/*
 *	vm_fault_unwire:
 *
 *	Unwire a range of virtual addresses in a map.
 */
void
vm_fault_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
    boolean_t fictitious)
{
	vm_paddr_t pa;
	vm_offset_t va;
	vm_page_t m;
	pmap_t pmap;

	pmap = vm_map_pmap(map);

	/*
	 * Since the pages are wired down, we must be able to get their
	 * mappings from the physical map system.
	 */
	for (va = start; va < end; va += PAGE_SIZE) {
		pa = pmap_extract(pmap, va);
		if (pa != 0) {
			pmap_change_wiring(pmap, va, FALSE);
			if (!fictitious) {
				m = PHYS_TO_VM_PAGE(pa);
				vm_page_lock(m);
				vm_page_unwire(m, TRUE);
				vm_page_unlock(m);
			}
		}
	}
}

/*
 *	Routine:
 *		vm_fault_copy_entry
 *	Function:
 *		Create new shadow object backing dst_entry with private copy of
 *		all underlying pages. When src_entry is equal to dst_entry,
 *		function implements COW for wired-down map entry. Otherwise,
 *		it forks wired entry into dst_map.
 *
 *	In/out conditions:
 *		The source and destination maps must be locked for write.
 *		The source map entry must be wired down (or be a sharing map
 *		entry corresponding to a main map entry that is wired down).
 */
void
vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
    vm_map_entry_t dst_entry, vm_map_entry_t src_entry,
    vm_ooffset_t *fork_charge)
{
	vm_object_t backing_object, dst_object, object, src_object;
	vm_pindex_t dst_pindex, pindex, src_pindex;
	vm_prot_t access, prot;
	vm_offset_t vaddr;
	vm_page_t dst_m;
	vm_page_t src_m;
	boolean_t upgrade;

#ifdef	lint
	src_map++;
#endif	/* lint */

	upgrade = src_entry == dst_entry;
	access = prot = dst_entry->protection;

	src_object = src_entry->object.vm_object;
	src_pindex = OFF_TO_IDX(src_entry->offset);

	if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
		dst_object = src_object;
		vm_object_reference(dst_object);
	} else {
		/*
		 * Create the top-level object for the destination entry. (Doesn't
		 * actually shadow anything - we copy the pages directly.)
		 */
		dst_object = vm_object_allocate(OBJT_DEFAULT,
		    OFF_TO_IDX(dst_entry->end - dst_entry->start));
#if VM_NRESERVLEVEL > 0
		dst_object->flags |= OBJ_COLORED;
		dst_object->pg_color = atop(dst_entry->start);
#endif
	}

	VM_OBJECT_WLOCK(dst_object);
	KASSERT(upgrade || dst_entry->object.vm_object == NULL,
	    ("vm_fault_copy_entry: vm_object not NULL"));
	if (src_object != dst_object) {
		dst_entry->object.vm_object = dst_object;
		dst_entry->offset = 0;
		dst_object->charge = dst_entry->end - dst_entry->start;
	}
	if (fork_charge != NULL) {
		KASSERT(dst_entry->cred == NULL,
		    ("vm_fault_copy_entry: leaked swp charge"));
		dst_object->cred = curthread->td_ucred;
		crhold(dst_object->cred);
		*fork_charge += dst_object->charge;
	} else if (dst_object->cred == NULL) {
		KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
		    dst_entry));
		dst_object->cred = dst_entry->cred;
		dst_entry->cred = NULL;
	}

	/*
	 * If not an upgrade, then enter the mappings in the pmap as
	 * read and/or execute accesses.  Otherwise, enter them as
	 * write accesses.
	 *
	 * A writeable large page mapping is only created if all of
	 * the constituent small page mappings are modified. Marking
	 * PTEs as modified on inception allows promotion to happen
	 * without taking potentially large number of soft faults.
	 */
	if (!upgrade)
		access &= ~VM_PROT_WRITE;

	/*
	 * Loop through all of the virtual pages within the entry's
	 * range, copying each page from the source object to the
	 * destination object.  Since the source is wired, those pages
	 * must exist.  In contrast, the destination is pageable.
	 * Since the destination object does not share any backing storage
	 * with the source object, all of its pages must be dirtied,
	 * regardless of whether they can be written.
	 */
	for (vaddr = dst_entry->start, dst_pindex = 0;
	    vaddr < dst_entry->end;
	    vaddr += PAGE_SIZE, dst_pindex++) {
again:
		/*
		 * Find the page in the source object, and copy it in.
		 * Because the source is wired down, the page will be
		 * in memory.
		 */
		if (src_object != dst_object)
			VM_OBJECT_RLOCK(src_object);
		object = src_object;
		pindex = src_pindex + dst_pindex;
		while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
		    (backing_object = object->backing_object) != NULL) {
			/*
			 * Unless the source mapping is read-only or
			 * it is presently being upgraded from
			 * read-only, the first object in the shadow
			 * chain should provide all of the pages.  In
			 * other words, this loop body should never be
			 * executed when the source mapping is already
			 * read/write.
			 */
			KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
			    upgrade,
			    ("vm_fault_copy_entry: main object missing page"));

			VM_OBJECT_RLOCK(backing_object);
			pindex += OFF_TO_IDX(object->backing_object_offset);
			if (object != dst_object)
				VM_OBJECT_RUNLOCK(object);
			object = backing_object;
		}
		KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));

		if (object != dst_object) {
			/*
			 * Allocate a page in the destination object.
			 */
			dst_m = vm_page_alloc(dst_object, (src_object ==
			    dst_object ? src_pindex : 0) + dst_pindex,
			    VM_ALLOC_NORMAL);
			if (dst_m == NULL) {
				VM_OBJECT_WUNLOCK(dst_object);
				VM_OBJECT_RUNLOCK(object);
				VM_WAIT;
				VM_OBJECT_WLOCK(dst_object);
				goto again;
			}
			pmap_copy_page(src_m, dst_m);
			VM_OBJECT_RUNLOCK(object);
			dst_m->valid = VM_PAGE_BITS_ALL;
			dst_m->dirty = VM_PAGE_BITS_ALL;
		} else {
			dst_m = src_m;
			if (vm_page_sleep_if_busy(dst_m, "fltupg"))
				goto again;
			vm_page_xbusy(dst_m);
			KASSERT(dst_m->valid == VM_PAGE_BITS_ALL,
			    ("invalid dst page %p", dst_m));
		}
		VM_OBJECT_WUNLOCK(dst_object);

		/*
		 * Enter it in the pmap. If a wired, copy-on-write
		 * mapping is being replaced by a write-enabled
		 * mapping, then wire that new mapping.
		 */
		pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
		    access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);

		/*
		 * Mark it no longer busy, and put it on the active list.
		 */
		VM_OBJECT_WLOCK(dst_object);

		if (upgrade) {
			if (src_m != dst_m) {
				vm_page_lock(src_m);
				vm_page_unwire(src_m, 0);
				vm_page_unlock(src_m);
				vm_page_lock(dst_m);
				vm_page_wire(dst_m);
				vm_page_unlock(dst_m);
			} else {
				KASSERT(dst_m->wire_count > 0,
				    ("dst_m %p is not wired", dst_m));
			}
		} else {
			vm_page_lock(dst_m);
			vm_page_activate(dst_m);
			vm_page_unlock(dst_m);
		}
		vm_page_xunbusy(dst_m);
	}
	VM_OBJECT_WUNLOCK(dst_object);
	if (upgrade) {
		dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
		vm_object_deallocate(src_object);
	}
}


/*
 * This routine checks around the requested page for other pages that
 * might be able to be faulted in.  This routine brackets the viable
 * pages for the pages to be paged in.
 *
 * Inputs:
 *	m, rbehind, rahead
 *
 * Outputs:
 *  marray (array of vm_page_t), reqpage (index of requested page)
 *
 * Return value:
 *  number of pages in marray
 */
static int
vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead,
    vm_page_t *marray, int *reqpage)
{
	int i,j;
	vm_object_t object;
	vm_pindex_t pindex, startpindex, endpindex, tpindex;
	vm_page_t rtm;
	int cbehind, cahead;

	VM_OBJECT_ASSERT_WLOCKED(m->object);

	object = m->object;
	pindex = m->pindex;
	cbehind = cahead = 0;

	/*
	 * if the requested page is not available, then give up now
	 */
	if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) {
		return 0;
	}

	if ((cbehind == 0) && (cahead == 0)) {
		*reqpage = 0;
		marray[0] = m;
		return 1;
	}

	if (rahead > cahead) {
		rahead = cahead;
	}

	if (rbehind > cbehind) {
		rbehind = cbehind;
	}

	/*
	 * scan backward for the read behind pages -- in memory
	 */
	if (pindex > 0) {
		if (rbehind > pindex) {
			rbehind = pindex;
			startpindex = 0;
		} else {
			startpindex = pindex - rbehind;
		}

		if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL &&
		    rtm->pindex >= startpindex)
			startpindex = rtm->pindex + 1;

		/* tpindex is unsigned; beware of numeric underflow. */
		for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
		    tpindex < pindex; i++, tpindex--) {

			rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
			    VM_ALLOC_IFNOTCACHED);
			if (rtm == NULL) {
				/*
				 * Shift the allocated pages to the
				 * beginning of the array.
				 */
				for (j = 0; j < i; j++) {
					marray[j] = marray[j + tpindex + 1 -
					    startpindex];
				}
				break;
			}

			marray[tpindex - startpindex] = rtm;
		}
	} else {
		startpindex = 0;
		i = 0;
	}

	marray[i] = m;
	/* page offset of the required page */
	*reqpage = i;

	tpindex = pindex + 1;
	i++;

	/*
	 * scan forward for the read ahead pages
	 */
	endpindex = tpindex + rahead;
	if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex)
		endpindex = rtm->pindex;
	if (endpindex > object->size)
		endpindex = object->size;

	for (; tpindex < endpindex; i++, tpindex++) {

		rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
		    VM_ALLOC_IFNOTCACHED);
		if (rtm == NULL) {
			break;
		}

		marray[i] = rtm;
	}

	/* return number of pages */
	return i;
}

/*
 * Block entry into the machine-independent layer's page fault handler by
 * the calling thread.  Subsequent calls to vm_fault() by that thread will
 * return KERN_PROTECTION_FAILURE.  Enable machine-dependent handling of
 * spurious page faults.
 */
int
vm_fault_disable_pagefaults(void)
{

	return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
}
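
/*
 * A minimal usage sketch of the pair of routines above, assuming a caller
 * that must not enter the fault handler while it holds some resource:
 *
 *	save = vm_fault_disable_pagefaults();
 *	... code that must not call vm_fault() ...
 *	vm_fault_enable_pagefaults(save);
 */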

void
vm_fault_enable_pagefaults(int save)
{

	curthread_pflags_restore(save);
}
