vm_pageout.c revision 331722
1/*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005 Yahoo! Technologies Norway AS
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * The Mach Operating System project at Carnegie-Mellon University.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 *    must display the following acknowledgement:
24 *	This product includes software developed by the University of
25 *	California, Berkeley and its contributors.
26 * 4. Neither the name of the University nor the names of its contributors
27 *    may be used to endorse or promote products derived from this software
28 *    without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 *
42 *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
43 *
44 *
45 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
46 * All rights reserved.
47 *
48 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
49 *
50 * Permission to use, copy, modify and distribute this software and
51 * its documentation is hereby granted, provided that both the copyright
52 * notice and this permission notice appear in all copies of the
53 * software, derivative works or modified versions, and any portions
54 * thereof, and that both notices appear in supporting documentation.
55 *
56 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
57 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
58 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
59 *
60 * Carnegie Mellon requests users of this software to return to
61 *
62 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
63 *  School of Computer Science
64 *  Carnegie Mellon University
65 *  Pittsburgh PA 15213-3890
66 *
67 * any improvements or extensions that they make and grant Carnegie the
68 * rights to redistribute these changes.
69 */
70
71/*
72 *	The proverbial page-out daemon.
73 */
74
75#include <sys/cdefs.h>
76__FBSDID("$FreeBSD: stable/11/sys/vm/vm_pageout.c 331722 2018-03-29 02:50:57Z eadler $");
77
78#include "opt_vm.h"
79
80#include <sys/param.h>
81#include <sys/systm.h>
82#include <sys/kernel.h>
83#include <sys/eventhandler.h>
84#include <sys/lock.h>
85#include <sys/mutex.h>
86#include <sys/proc.h>
87#include <sys/kthread.h>
88#include <sys/ktr.h>
89#include <sys/mount.h>
90#include <sys/racct.h>
91#include <sys/resourcevar.h>
92#include <sys/sched.h>
93#include <sys/sdt.h>
94#include <sys/signalvar.h>
95#include <sys/smp.h>
96#include <sys/time.h>
97#include <sys/vnode.h>
98#include <sys/vmmeter.h>
99#include <sys/rwlock.h>
100#include <sys/sx.h>
101#include <sys/sysctl.h>
102
103#include <vm/vm.h>
104#include <vm/vm_param.h>
105#include <vm/vm_object.h>
106#include <vm/vm_page.h>
107#include <vm/vm_map.h>
108#include <vm/vm_pageout.h>
109#include <vm/vm_pager.h>
110#include <vm/vm_phys.h>
111#include <vm/swap_pager.h>
112#include <vm/vm_extern.h>
113#include <vm/uma.h>
114
115/*
116 * System initialization
117 */
118
119/* the kernel process "vm_pageout" */
120static void vm_pageout(void);
121static void vm_pageout_init(void);
122static int vm_pageout_clean(vm_page_t m, int *numpagedout);
123static int vm_pageout_cluster(vm_page_t m);
124static bool vm_pageout_scan(struct vm_domain *vmd, int pass);
125static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
126    int starting_page_shortage);
127
128SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
129    NULL);
130
131struct proc *pageproc;
132
133static struct kproc_desc page_kp = {
134	"pagedaemon",
135	vm_pageout,
136	&pageproc
137};
138SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
139    &page_kp);
140
141SDT_PROVIDER_DEFINE(vm);
142SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
143
144/* Pagedaemon activity rates, in subdivisions of one second. */
145#define	VM_LAUNDER_RATE		10
146#define	VM_INACT_SCAN_RATE	2
147
148int vm_pageout_deficit;		/* Estimated number of pages deficit */
149u_int vm_pageout_wakeup_thresh;
150static int vm_pageout_oom_seq = 12;
151bool vm_pageout_wanted;		/* Event on which pageout daemon sleeps */
152bool vm_pages_needed;		/* Are threads waiting for free pages? */
153
154/* Pending request for dirty page laundering. */
155static enum {
156	VM_LAUNDRY_IDLE,
157	VM_LAUNDRY_BACKGROUND,
158	VM_LAUNDRY_SHORTFALL
159} vm_laundry_request = VM_LAUNDRY_IDLE;
160
161static int vm_pageout_update_period;
162static int disable_swap_pageouts;
163static int lowmem_period = 10;
164static time_t lowmem_uptime;
165
166static int vm_panic_on_oom = 0;
167
168SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
169	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
170	"panic on out of memory instead of killing the largest process");
171
172SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
173	CTLFLAG_RWTUN, &vm_pageout_wakeup_thresh, 0,
174	"free page threshold for waking up the pageout daemon");
175
176SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
177	CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
178	"Maximum active LRU update period");
179
180SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
181	"Low memory callback period");
182
183SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
184	CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
185
186static int pageout_lock_miss;
187SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
188	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
189
190SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
191	CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
192	"back-to-back calls to oom detector to start OOM");
193
194static int act_scan_laundry_weight = 3;
195SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
196    &act_scan_laundry_weight, 0,
197    "weight given to clean vs. dirty pages in active queue scans");
198
199static u_int vm_background_launder_target;
200SYSCTL_UINT(_vm, OID_AUTO, background_launder_target, CTLFLAG_RWTUN,
201    &vm_background_launder_target, 0,
202    "background laundering target, in pages");
203
204static u_int vm_background_launder_rate = 4096;
205SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
206    &vm_background_launder_rate, 0,
207    "background laundering rate, in kilobytes per second");
208
209static u_int vm_background_launder_max = 20 * 1024;
210SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
211    &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
212
213int vm_pageout_page_count = 32;
214
215int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
216SYSCTL_INT(_vm, OID_AUTO, max_wired,
217	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");
218
219static u_int isqrt(u_int num);
220static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
221static int vm_pageout_launder(struct vm_domain *vmd, int launder,
222    bool in_shortfall);
223static void vm_pageout_laundry_worker(void *arg);
224static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);
225
226/*
227 * Initialize a dummy page for marking the caller's place in the specified
228 * paging queue.  In principle, this function only needs to set the flag
229 * PG_MARKER.  Nonetheless, it write busies and initializes the hold count
230 * to one as safety precautions.
231 */
232static void
233vm_pageout_init_marker(vm_page_t marker, u_short queue)
234{
235
236	bzero(marker, sizeof(*marker));
237	marker->flags = PG_MARKER;
238	marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
239	marker->queue = queue;
240	marker->hold_count = 1;
241}
242
243/*
244 * vm_pageout_fallback_object_lock:
245 *
246 * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is
247 * known to have failed and page queue must be either PQ_ACTIVE or
248 * PQ_INACTIVE.  To avoid lock order violation, unlock the page queue
249 * while locking the vm object.  Use marker page to detect page queue
250 * changes and maintain notion of next page on page queue.  Return
251 * TRUE if no changes were detected, FALSE otherwise.  vm object is
252 * locked on return.
253 *
254 * This function depends on both the lock portion of struct vm_object
255 * and normal struct vm_page being type stable.
256 */
257static boolean_t
258vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
259{
260	struct vm_page marker;
261	struct vm_pagequeue *pq;
262	boolean_t unchanged;
263	u_short queue;
264	vm_object_t object;
265
266	queue = m->queue;
267	vm_pageout_init_marker(&marker, queue);
268	pq = vm_page_pagequeue(m);
269	object = m->object;
270
271	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
272	vm_pagequeue_unlock(pq);
273	vm_page_unlock(m);
274	VM_OBJECT_WLOCK(object);
275	vm_page_lock(m);
276	vm_pagequeue_lock(pq);
277
278	/*
279	 * The page's object might have changed, and/or the page might
280	 * have moved from its original position in the queue.  If the
281	 * page's object has changed, then the caller should abandon
282	 * processing the page because the wrong object lock was
283	 * acquired.  Use the marker's plinks.q, not the page's, to
284	 * determine if the page has been moved.  The state of the
285	 * page's plinks.q can be indeterminate; whereas, the marker's
286	 * plinks.q must be valid.
287	 */
288	*next = TAILQ_NEXT(&marker, plinks.q);
289	unchanged = m->object == object &&
290	    m == TAILQ_PREV(&marker, pglist, plinks.q);
291	KASSERT(!unchanged || m->queue == queue,
292	    ("page %p queue %d %d", m, queue, m->queue));
293	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
294	return (unchanged);
295}
296
297/*
298 * Lock the page while holding the page queue lock.  Use marker page
299 * to detect page queue changes and maintain notion of next page on
300 * page queue.  Return TRUE if no changes were detected, FALSE
301 * otherwise.  The page is locked on return. The page queue lock might
302 * be dropped and reacquired.
303 *
304 * This function depends on normal struct vm_page being type stable.
305 */
306static boolean_t
307vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
308{
309	struct vm_page marker;
310	struct vm_pagequeue *pq;
311	boolean_t unchanged;
312	u_short queue;
313
314	vm_page_lock_assert(m, MA_NOTOWNED);
315	if (vm_page_trylock(m))
316		return (TRUE);
317
318	queue = m->queue;
319	vm_pageout_init_marker(&marker, queue);
320	pq = vm_page_pagequeue(m);
321
322	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
323	vm_pagequeue_unlock(pq);
324	vm_page_lock(m);
325	vm_pagequeue_lock(pq);
326
327	/* Page queue might have changed. */
328	*next = TAILQ_NEXT(&marker, plinks.q);
329	unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q);
330	KASSERT(!unchanged || m->queue == queue,
331	    ("page %p queue %d %d", m, queue, m->queue));
332	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
333	return (unchanged);
334}
335
336/*
337 * Scan for pages at adjacent offsets within the given page's object that are
338 * eligible for laundering, form a cluster of these pages and the given page,
339 * and launder that cluster.
340 */
341static int
342vm_pageout_cluster(vm_page_t m)
343{
344	vm_object_t object;
345	vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
346	vm_pindex_t pindex;
347	int ib, is, page_base, pageout_count;
348
349	vm_page_assert_locked(m);
350	object = m->object;
351	VM_OBJECT_ASSERT_WLOCKED(object);
352	pindex = m->pindex;
353
354	/*
355	 * We can't clean the page if it is busy or held.
356	 */
357	vm_page_assert_unbusied(m);
358	KASSERT(m->hold_count == 0, ("page %p is held", m));
359
360	pmap_remove_write(m);
361	vm_page_unlock(m);
362
363	mc[vm_pageout_page_count] = pb = ps = m;
364	pageout_count = 1;
365	page_base = vm_pageout_page_count;
366	ib = 1;
367	is = 1;
368
369	/*
370	 * We can cluster only if the page is not clean, busy, or held, and
371	 * the page is in the laundry queue.
372	 *
373	 * During heavy mmap/modification loads the pageout
374	 * daemon can really fragment the underlying file
375	 * due to flushing pages out of order and not trying to
376	 * align the clusters (which leaves sporadic out-of-order
377	 * holes).  To solve this problem we do the reverse scan
378	 * first and attempt to align our cluster, then do a
379	 * forward scan if room remains.
380	 */
381more:
382	while (ib != 0 && pageout_count < vm_pageout_page_count) {
383		if (ib > pindex) {
384			ib = 0;
385			break;
386		}
387		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
388			ib = 0;
389			break;
390		}
391		vm_page_test_dirty(p);
392		if (p->dirty == 0) {
393			ib = 0;
394			break;
395		}
396		vm_page_lock(p);
397		if (!vm_page_in_laundry(p) ||
398		    p->hold_count != 0) {	/* may be undergoing I/O */
399			vm_page_unlock(p);
400			ib = 0;
401			break;
402		}
403		pmap_remove_write(p);
404		vm_page_unlock(p);
405		mc[--page_base] = pb = p;
406		++pageout_count;
407		++ib;
408
409		/*
410		 * We are at an alignment boundary.  Stop here, and switch
411		 * directions.  Do not clear ib.
412		 */
413		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
414			break;
415	}
416	while (pageout_count < vm_pageout_page_count &&
417	    pindex + is < object->size) {
418		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
419			break;
420		vm_page_test_dirty(p);
421		if (p->dirty == 0)
422			break;
423		vm_page_lock(p);
424		if (!vm_page_in_laundry(p) ||
425		    p->hold_count != 0) {	/* may be undergoing I/O */
426			vm_page_unlock(p);
427			break;
428		}
429		pmap_remove_write(p);
430		vm_page_unlock(p);
431		mc[page_base + pageout_count] = ps = p;
432		++pageout_count;
433		++is;
434	}
435
436	/*
437	 * If we exhausted our forward scan, continue with the reverse scan
438	 * when possible, even past an alignment boundary.  This catches
439	 * boundary conditions.
440	 */
441	if (ib != 0 && pageout_count < vm_pageout_page_count)
442		goto more;
443
444	return (vm_pageout_flush(&mc[page_base], pageout_count,
445	    VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
446}
447
448/*
449 * vm_pageout_flush() - launder the given pages
450 *
451 *	The given pages are laundered.  Note that we setup for the start of
452 *	The given pages are laundered.  Note that we set up for the start of
453 *	reference count all in here rather then in the parent.  If we want
454 *	reference count all in here rather than in the parent.  If we want
455 *	the ordering.
456 *
457 *	Returned runlen is the count of pages between mreq and the first
458 *	page after mreq with status VM_PAGER_AGAIN.
459 *	*eio is set to TRUE if the pager returned VM_PAGER_ERROR or
460 *	VM_PAGER_FAIL for any page in the runlen set.
461 */
462int
463vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
464    boolean_t *eio)
465{
466	vm_object_t object = mc[0]->object;
467	int pageout_status[count];
468	int numpagedout = 0;
469	int i, runlen;
470
471	VM_OBJECT_ASSERT_WLOCKED(object);
472
473	/*
474	 * Initiate I/O.  Mark the pages busy and verify that they're valid
475	 * and read-only.
476	 *
477	 * We do not have to fixup the clean/dirty bits here... we can
478	 * allow the pager to do it after the I/O completes.
479	 *
480	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
481	 * edge case with file fragments.
482	 */
483	for (i = 0; i < count; i++) {
484		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
485		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
486			mc[i], i, count));
487		KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0,
488		    ("vm_pageout_flush: writeable page %p", mc[i]));
489		vm_page_sbusy(mc[i]);
490	}
491	vm_object_pip_add(object, count);
492
493	vm_pager_put_pages(object, mc, count, flags, pageout_status);
494
495	runlen = count - mreq;
496	if (eio != NULL)
497		*eio = FALSE;
498	for (i = 0; i < count; i++) {
499		vm_page_t mt = mc[i];
500
501		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
502		    !pmap_page_is_write_mapped(mt),
503		    ("vm_pageout_flush: page %p is not write protected", mt));
504		switch (pageout_status[i]) {
505		case VM_PAGER_OK:
506			vm_page_lock(mt);
507			if (vm_page_in_laundry(mt))
508				vm_page_deactivate_noreuse(mt);
509			vm_page_unlock(mt);
510			/* FALLTHROUGH */
511		case VM_PAGER_PEND:
512			numpagedout++;
513			break;
514		case VM_PAGER_BAD:
515			/*
516			 * The page is outside the object's range.  We pretend
517			 * that the page out worked and clean the page, so the
518			 * changes will be lost if the page is reclaimed by
519			 * the page daemon.
520			 */
521			vm_page_undirty(mt);
522			vm_page_lock(mt);
523			if (vm_page_in_laundry(mt))
524				vm_page_deactivate_noreuse(mt);
525			vm_page_unlock(mt);
526			break;
527		case VM_PAGER_ERROR:
528		case VM_PAGER_FAIL:
529			/*
530			 * If the page couldn't be paged out, then reactivate
531			 * it so that it doesn't clog the laundry and inactive
532			 * queues.  (We will try paging it out again later).
533			 */
534			vm_page_lock(mt);
535			vm_page_activate(mt);
536			vm_page_unlock(mt);
537			if (eio != NULL && i >= mreq && i - mreq < runlen)
538				*eio = TRUE;
539			break;
540		case VM_PAGER_AGAIN:
541			if (i >= mreq && i - mreq < runlen)
542				runlen = i - mreq;
543			break;
544		}
545
546		/*
547		 * If the operation is still going, leave the page busy to
548		 * block all other accesses. Also, leave the paging in
549		 * progress indicator set so that we don't attempt an object
550		 * collapse.
551		 */
552		if (pageout_status[i] != VM_PAGER_PEND) {
553			vm_object_pip_wakeup(object);
554			vm_page_sunbusy(mt);
555		}
556	}
557	if (prunlen != NULL)
558		*prunlen = runlen;
559	return (numpagedout);
560}
561
562/*
563 * Attempt to acquire all of the necessary locks to launder a page and
564 * then call through the clustering layer to PUTPAGES.  Wait a short
565 * time for a vnode lock.
566 *
567 * Requires the page and object lock on entry, releases both before return.
568 * Returns 0 on success and an errno otherwise.
569 */
570static int
571vm_pageout_clean(vm_page_t m, int *numpagedout)
572{
573	struct vnode *vp;
574	struct mount *mp;
575	vm_object_t object;
576	vm_pindex_t pindex;
577	int error, lockmode;
578
579	vm_page_assert_locked(m);
580	object = m->object;
581	VM_OBJECT_ASSERT_WLOCKED(object);
582	error = 0;
583	vp = NULL;
584	mp = NULL;
585
586	/*
587	 * The object is already known NOT to be dead.   It
588	 * is possible for the vget() to block the whole
589	 * pageout daemon, but the new low-memory handling
590	 * code should prevent it.
591	 *
592	 * We can't wait forever for the vnode lock, we might
593	 * deadlock due to a vn_read() getting stuck in
594	 * vm_wait while holding this vnode.  We skip the
595	 * vnode if we can't get it in a reasonable amount
596	 * of time.
597	 */
598	if (object->type == OBJT_VNODE) {
599		vm_page_unlock(m);
600		vp = object->handle;
601		if (vp->v_type == VREG &&
602		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
603			mp = NULL;
604			error = EDEADLK;
605			goto unlock_all;
606		}
607		KASSERT(mp != NULL,
608		    ("vp %p with NULL v_mount", vp));
609		vm_object_reference_locked(object);
610		pindex = m->pindex;
611		VM_OBJECT_WUNLOCK(object);
612		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
613		    LK_SHARED : LK_EXCLUSIVE;
614		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
615			vp = NULL;
616			error = EDEADLK;
617			goto unlock_mp;
618		}
619		VM_OBJECT_WLOCK(object);
620
621		/*
622		 * Ensure that the object and vnode were not disassociated
623		 * while locks were dropped.
624		 */
625		if (vp->v_object != object) {
626			error = ENOENT;
627			goto unlock_all;
628		}
629		vm_page_lock(m);
630
631		/*
632		 * While the object and page were unlocked, the page
633		 * may have been:
634		 * (1) moved to a different queue,
635		 * (2) reallocated to a different object,
636		 * (3) reallocated to a different offset, or
637		 * (4) cleaned.
638		 */
639		if (!vm_page_in_laundry(m) || m->object != object ||
640		    m->pindex != pindex || m->dirty == 0) {
641			vm_page_unlock(m);
642			error = ENXIO;
643			goto unlock_all;
644		}
645
646		/*
647		 * The page may have been busied or held while the object
648		 * and page locks were released.
649		 */
650		if (vm_page_busied(m) || m->hold_count != 0) {
651			vm_page_unlock(m);
652			error = EBUSY;
653			goto unlock_all;
654		}
655	}
656
657	/*
658	 * If a page is dirty, then it is either being washed
659	 * (but not yet cleaned) or it is still in the
660	 * laundry.  If it is still in the laundry, then we
661	 * start the cleaning operation.
662	 */
663	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
664		error = EIO;
665
666unlock_all:
667	VM_OBJECT_WUNLOCK(object);
668
669unlock_mp:
670	vm_page_lock_assert(m, MA_NOTOWNED);
671	if (mp != NULL) {
672		if (vp != NULL)
673			vput(vp);
674		vm_object_deallocate(object);
675		vn_finished_write(mp);
676	}
677
678	return (error);
679}
680
681/*
682 * Attempt to launder the specified number of pages.
683 *
684 * Returns the number of pages successfully laundered.
685 */
686static int
687vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
688{
689	struct vm_pagequeue *pq;
690	vm_object_t object;
691	vm_page_t m, next;
692	int act_delta, error, maxscan, numpagedout, starting_target;
693	int vnodes_skipped;
694	bool pageout_ok, queue_locked;
695
696	starting_target = launder;
697	vnodes_skipped = 0;
698
699	/*
700	 * Scan the laundry queue for pages eligible to be laundered.  We stop
701	 * once the target number of dirty pages have been laundered, or once
702	 * we've reached the end of the queue.  A single iteration of this loop
703	 * may cause more than one page to be laundered because of clustering.
704	 *
705	 * maxscan ensures that we don't re-examine requeued pages.  Any
706	 * additional pages written as part of a cluster are subtracted from
707	 * maxscan since they must be taken from the laundry queue.
708	 */
709	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
710	maxscan = pq->pq_cnt;
711
712	vm_pagequeue_lock(pq);
713	queue_locked = true;
714	for (m = TAILQ_FIRST(&pq->pq_pl);
715	    m != NULL && maxscan-- > 0 && launder > 0;
716	    m = next) {
717		vm_pagequeue_assert_locked(pq);
718		KASSERT(queue_locked, ("unlocked laundry queue"));
719		KASSERT(vm_page_in_laundry(m),
720		    ("page %p has an inconsistent queue", m));
721		next = TAILQ_NEXT(m, plinks.q);
722		if ((m->flags & PG_MARKER) != 0)
723			continue;
724		KASSERT((m->flags & PG_FICTITIOUS) == 0,
725		    ("PG_FICTITIOUS page %p cannot be in laundry queue", m));
726		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
727		    ("VPO_UNMANAGED page %p cannot be in laundry queue", m));
728		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
729			vm_page_unlock(m);
730			continue;
731		}
732		object = m->object;
733		if ((!VM_OBJECT_TRYWLOCK(object) &&
734		    (!vm_pageout_fallback_object_lock(m, &next) ||
735		    m->hold_count != 0)) || vm_page_busied(m)) {
736			VM_OBJECT_WUNLOCK(object);
737			vm_page_unlock(m);
738			continue;
739		}
740
741		/*
742		 * Unlock the laundry queue, invalidating the 'next' pointer.
743		 * Use a marker to remember our place in the laundry queue.
744		 */
745		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_laundry_marker,
746		    plinks.q);
747		vm_pagequeue_unlock(pq);
748		queue_locked = false;
749
750		/*
751		 * Invalid pages can be easily freed.  They cannot be
752		 * mapped; vm_page_free() asserts this.
753		 */
754		if (m->valid == 0)
755			goto free_page;
756
757		/*
758		 * If the page has been referenced and the object is not dead,
759		 * reactivate or requeue the page depending on whether the
760		 * object is mapped.
761		 */
762		if ((m->aflags & PGA_REFERENCED) != 0) {
763			vm_page_aflag_clear(m, PGA_REFERENCED);
764			act_delta = 1;
765		} else
766			act_delta = 0;
767		if (object->ref_count != 0)
768			act_delta += pmap_ts_referenced(m);
769		else {
770			KASSERT(!pmap_page_is_mapped(m),
771			    ("page %p is mapped", m));
772		}
773		if (act_delta != 0) {
774			if (object->ref_count != 0) {
775				PCPU_INC(cnt.v_reactivated);
776				vm_page_activate(m);
777
778				/*
779				 * Increase the activation count if the page
780				 * was referenced while in the laundry queue.
781				 * This makes it less likely that the page will
782				 * be returned prematurely to the inactive
783				 * queue.
784 				 */
785				m->act_count += act_delta + ACT_ADVANCE;
786
787				/*
788				 * If this was a background laundering, count
789				 * activated pages towards our target.  The
790				 * purpose of background laundering is to ensure
791				 * that pages are eventually cycled through the
792				 * laundry queue, and an activation is a valid
793				 * way out.
794				 */
795				if (!in_shortfall)
796					launder--;
797				goto drop_page;
798			} else if ((object->flags & OBJ_DEAD) == 0)
799				goto requeue_page;
800		}
801
802		/*
803		 * If the page appears to be clean at the machine-independent
804		 * layer, then remove all of its mappings from the pmap in
805		 * anticipation of freeing it.  If, however, any of the page's
806		 * mappings allow write access, then the page may still be
807		 * modified until the last of those mappings are removed.
808		 */
809		if (object->ref_count != 0) {
810			vm_page_test_dirty(m);
811			if (m->dirty == 0)
812				pmap_remove_all(m);
813		}
814
815		/*
816		 * Clean pages are freed, and dirty pages are paged out unless
817		 * they belong to a dead object.  Requeueing dirty pages from
818		 * dead objects is pointless, as they are being paged out and
819		 * freed by the thread that destroyed the object.
820		 */
821		if (m->dirty == 0) {
822free_page:
823			vm_page_free(m);
824			PCPU_INC(cnt.v_dfree);
825		} else if ((object->flags & OBJ_DEAD) == 0) {
826			if (object->type != OBJT_SWAP &&
827			    object->type != OBJT_DEFAULT)
828				pageout_ok = true;
829			else if (disable_swap_pageouts)
830				pageout_ok = false;
831			else
832				pageout_ok = true;
833			if (!pageout_ok) {
834requeue_page:
835				vm_pagequeue_lock(pq);
836				queue_locked = true;
837				vm_page_requeue_locked(m);
838				goto drop_page;
839			}
840
841			/*
842			 * Form a cluster with adjacent, dirty pages from the
843			 * same object, and page out that entire cluster.
844			 *
845			 * The adjacent, dirty pages must also be in the
846			 * laundry.  However, their mappings are not checked
847			 * for new references.  Consequently, a recently
848			 * referenced page may be paged out.  However, that
849			 * page will not be prematurely reclaimed.  After page
850			 * out, the page will be placed in the inactive queue,
851			 * where any new references will be detected and the
852			 * page reactivated.
853			 */
854			error = vm_pageout_clean(m, &numpagedout);
855			if (error == 0) {
856				launder -= numpagedout;
857				maxscan -= numpagedout - 1;
858			} else if (error == EDEADLK) {
859				pageout_lock_miss++;
860				vnodes_skipped++;
861			}
862			goto relock_queue;
863		}
864drop_page:
865		vm_page_unlock(m);
866		VM_OBJECT_WUNLOCK(object);
867relock_queue:
868		if (!queue_locked) {
869			vm_pagequeue_lock(pq);
870			queue_locked = true;
871		}
872		next = TAILQ_NEXT(&vmd->vmd_laundry_marker, plinks.q);
873		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_laundry_marker, plinks.q);
874	}
875	vm_pagequeue_unlock(pq);
876
877	/*
878	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
879	 * and we didn't launder enough pages.
880	 */
881	if (vnodes_skipped > 0 && launder > 0)
882		(void)speedup_syncer();
883
884	return (starting_target - launder);
885}
886
887/*
888 * Compute the integer square root.
889 */
890static u_int
891isqrt(u_int num)
892{
893	u_int bit, root, tmp;
894
895	bit = 1u << ((NBBY * sizeof(u_int)) - 2);
896	while (bit > num)
897		bit >>= 2;
898	root = 0;
899	while (bit != 0) {
900		tmp = root + bit;
901		root >>= 1;
902		if (num >= tmp) {
903			num -= tmp;
904			root += bit;
905		}
906		bit >>= 2;
907	}
908	return (root);
909}
910
911/*
912 * Perform the work of the laundry thread: periodically wake up and determine
913 * whether any pages need to be laundered.  If so, determine the number of pages
914 * that need to be laundered, and launder them.
915 */
916static void
917vm_pageout_laundry_worker(void *arg)
918{
919	struct vm_domain *domain;
920	struct vm_pagequeue *pq;
921	uint64_t nclean, ndirty;
922	u_int last_launder, wakeups;
923	int domidx, last_target, launder, shortfall, shortfall_cycle, target;
924	bool in_shortfall;
925
926	domidx = (uintptr_t)arg;
927	domain = &vm_dom[domidx];
928	pq = &domain->vmd_pagequeues[PQ_LAUNDRY];
929	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
930	vm_pageout_init_marker(&domain->vmd_laundry_marker, PQ_LAUNDRY);
931
932	shortfall = 0;
933	in_shortfall = false;
934	shortfall_cycle = 0;
935	target = 0;
936	last_launder = 0;
937
938	/*
939	 * The pageout laundry worker is never done, so loop forever.
940	 */
941	for (;;) {
942		KASSERT(target >= 0, ("negative target %d", target));
943		KASSERT(shortfall_cycle >= 0,
944		    ("negative cycle %d", shortfall_cycle));
945		launder = 0;
946		wakeups = VM_METER_PCPU_CNT(v_pdwakeups);
947
948		/*
949		 * First determine whether we need to launder pages to meet a
950		 * shortage of free pages.
951		 */
952		if (shortfall > 0) {
953			in_shortfall = true;
954			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
955			target = shortfall;
956		} else if (!in_shortfall)
957			goto trybackground;
958		else if (shortfall_cycle == 0 || vm_laundry_target() <= 0) {
959			/*
960			 * We recently entered shortfall and began laundering
961			 * pages.  If we have completed that laundering run
962			 * (and we are no longer in shortfall) or we have met
963			 * our laundry target through other activity, then we
964			 * can stop laundering pages.
965			 */
966			in_shortfall = false;
967			target = 0;
968			goto trybackground;
969		}
970		last_launder = wakeups;
971		launder = target / shortfall_cycle--;
972		goto dolaundry;
973
974		/*
975		 * There's no immediate need to launder any pages; see if we
976		 * meet the conditions to perform background laundering:
977		 *
978		 * 1. The ratio of dirty to clean inactive pages exceeds the
979		 *    background laundering threshold and the pagedaemon has
980		 *    been woken up to reclaim pages since our last
981		 *    laundering, or
982		 * 2. we haven't yet reached the target of the current
983		 *    background laundering run.
984		 *
985		 * The background laundering threshold is not a constant.
986		 * Instead, it is a slowly growing function of the number of
987		 * page daemon wakeups since the last laundering.  Thus, as the
988		 * ratio of dirty to clean inactive pages grows, the amount of
989		 * memory pressure required to trigger laundering decreases.
990		 */
991trybackground:
992		nclean = vm_cnt.v_inactive_count + vm_cnt.v_free_count;
993		ndirty = vm_cnt.v_laundry_count;
994		if (target == 0 && wakeups != last_launder &&
995		    ndirty * isqrt(wakeups - last_launder) >= nclean) {
996			target = vm_background_launder_target;
997		}
998
999		/*
1000		 * We have a non-zero background laundering target.  If we've
1001		 * laundered up to our maximum without observing a page daemon
1002		 * wakeup, just stop.  This is a safety belt that ensures we
1003		 * don't launder an excessive amount if memory pressure is low
1004		 * and the ratio of dirty to clean pages is large.  Otherwise,
1005		 * proceed at the background laundering rate.
1006		 */
1007		if (target > 0) {
1008			if (wakeups != last_launder) {
1009				last_launder = wakeups;
1010				last_target = target;
1011			} else if (last_target - target >=
1012			    vm_background_launder_max * PAGE_SIZE / 1024) {
1013				target = 0;
1014			}
1015			launder = vm_background_launder_rate * PAGE_SIZE / 1024;
1016			launder /= VM_LAUNDER_RATE;
1017			if (launder > target)
1018				launder = target;
1019		}
1020
1021dolaundry:
1022		if (launder > 0) {
1023			/*
1024			 * Because of I/O clustering, the number of laundered
1025			 * pages could exceed "target" by the maximum size of
1026			 * a cluster minus one.
1027			 */
1028			target -= min(vm_pageout_launder(domain, launder,
1029			    in_shortfall), target);
1030			pause("laundp", hz / VM_LAUNDER_RATE);
1031		}
1032
1033		/*
1034		 * If we're not currently laundering pages and the page daemon
1035		 * hasn't posted a new request, sleep until the page daemon
1036		 * kicks us.
1037		 */
1038		vm_pagequeue_lock(pq);
1039		if (target == 0 && vm_laundry_request == VM_LAUNDRY_IDLE)
1040			(void)mtx_sleep(&vm_laundry_request,
1041			    vm_pagequeue_lockptr(pq), PVM, "launds", 0);
1042
1043		/*
1044		 * If the pagedaemon has indicated that it's in shortfall, start
1045		 * a shortfall laundering unless we're already in the middle of
1046		 * one.  This may preempt a background laundering.
1047		 */
1048		if (vm_laundry_request == VM_LAUNDRY_SHORTFALL &&
1049		    (!in_shortfall || shortfall_cycle == 0)) {
1050			shortfall = vm_laundry_target() + vm_pageout_deficit;
1051			target = 0;
1052		} else
1053			shortfall = 0;
1054
1055		if (target == 0)
1056			vm_laundry_request = VM_LAUNDRY_IDLE;
1057		vm_pagequeue_unlock(pq);
1058	}
1059}
1060
1061/*
1062 *	vm_pageout_scan does the dirty work for the pageout daemon.
1063 *
1064 *	pass == 0: Update active LRU/deactivate pages
1065 *	pass >= 1: Free inactive pages
1066 *
1067 * Returns true if pass was zero or enough pages were freed by the inactive
1068 * queue scan to meet the target.
1069 */
1070static bool
1071vm_pageout_scan(struct vm_domain *vmd, int pass)
1072{
1073	vm_page_t m, next;
1074	struct vm_pagequeue *pq;
1075	vm_object_t object;
1076	long min_scan;
1077	int act_delta, addl_page_shortage, deficit, inactq_shortage, maxscan;
1078	int page_shortage, scan_tick, scanned, starting_page_shortage;
1079	boolean_t queue_locked;
1080
1081	/*
1082	 * If we need to reclaim memory, ask kernel caches to return
1083	 * some.  We rate limit to avoid thrashing.
1084	 */
1085	if (vmd == &vm_dom[0] && pass > 0 &&
1086	    (time_uptime - lowmem_uptime) >= lowmem_period) {
1087		/*
1088		 * Decrease registered cache sizes.
1089		 */
1090		SDT_PROBE0(vm, , , vm__lowmem_scan);
1091		EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
1092		/*
1093		 * We do this explicitly after the caches have been
1094		 * drained above.
1095		 */
1096		uma_reclaim();
1097		lowmem_uptime = time_uptime;
1098	}
1099
1100	/*
1101	 * The addl_page_shortage is the number of temporarily
1102	 * stuck pages in the inactive queue.  In other words, the
1103	 * number of pages from the inactive count that should be
1104	 * discounted in setting the target for the active queue scan.
1105	 */
1106	addl_page_shortage = 0;
1107
1108	/*
1109	 * Calculate the number of pages that we want to free.  This number
1110	 * can be negative if many pages are freed between the wakeup call to
1111	 * the page daemon and this calculation.
1112	 */
1113	if (pass > 0) {
1114		deficit = atomic_readandclear_int(&vm_pageout_deficit);
1115		page_shortage = vm_paging_target() + deficit;
1116	} else
1117		page_shortage = deficit = 0;
1118	starting_page_shortage = page_shortage;
1119
1120	/*
1121	 * Start scanning the inactive queue for pages that we can free.  The
1122	 * scan will stop when we reach the target or we have scanned the
1123	 * entire queue.  (Note that m->act_count is not used to make
1124	 * decisions for the inactive queue, only for the active queue.)
1125	 */
1126	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
1127	maxscan = pq->pq_cnt;
1128	vm_pagequeue_lock(pq);
1129	queue_locked = TRUE;
1130	for (m = TAILQ_FIRST(&pq->pq_pl);
1131	     m != NULL && maxscan-- > 0 && page_shortage > 0;
1132	     m = next) {
1133		vm_pagequeue_assert_locked(pq);
1134		KASSERT(queue_locked, ("unlocked inactive queue"));
1135		KASSERT(vm_page_inactive(m), ("Inactive queue %p", m));
1136
1137		PCPU_INC(cnt.v_pdpages);
1138		next = TAILQ_NEXT(m, plinks.q);
1139
1140		/*
1141		 * skip marker pages
1142		 */
1143		if (m->flags & PG_MARKER)
1144			continue;
1145
1146		KASSERT((m->flags & PG_FICTITIOUS) == 0,
1147		    ("Fictitious page %p cannot be in inactive queue", m));
1148		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1149		    ("Unmanaged page %p cannot be in inactive queue", m));
1150
1151		/*
1152		 * The page or object lock acquisitions fail if the
1153		 * page was removed from the queue or moved to a
1154		 * different position within the queue.  In either
1155		 * case, addl_page_shortage should not be incremented.
1156		 */
1157		if (!vm_pageout_page_lock(m, &next))
1158			goto unlock_page;
1159		else if (m->hold_count != 0) {
1160			/*
1161			 * Held pages are essentially stuck in the
1162			 * queue.  So, they ought to be discounted
1163			 * from the inactive count.  See the
1164			 * calculation of inactq_shortage before the
1165			 * loop over the active queue below.
1166			 */
1167			addl_page_shortage++;
1168			goto unlock_page;
1169		}
1170		object = m->object;
1171		if (!VM_OBJECT_TRYWLOCK(object)) {
1172			if (!vm_pageout_fallback_object_lock(m, &next))
1173				goto unlock_object;
1174			else if (m->hold_count != 0) {
1175				addl_page_shortage++;
1176				goto unlock_object;
1177			}
1178		}
1179		if (vm_page_busied(m)) {
1180			/*
1181			 * Don't mess with busy pages.  Leave them at
1182			 * the front of the queue.  Most likely, they
1183			 * are being paged out and will leave the
1184			 * queue shortly after the scan finishes.  So,
1185			 * they ought to be discounted from the
1186			 * inactive count.
1187			 */
1188			addl_page_shortage++;
1189unlock_object:
1190			VM_OBJECT_WUNLOCK(object);
1191unlock_page:
1192			vm_page_unlock(m);
1193			continue;
1194		}
1195		KASSERT(m->hold_count == 0, ("Held page %p", m));
1196
1197		/*
1198		 * Dequeue the inactive page and unlock the inactive page
1199		 * queue, invalidating the 'next' pointer.  Dequeueing the
1200		 * page here avoids a later reacquisition (and release) of
1201		 * the inactive page queue lock when vm_page_activate(),
1202		 * vm_page_free(), or vm_page_launder() is called.  Use a
1203		 * marker to remember our place in the inactive queue.
1204		 */
1205		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
1206		vm_page_dequeue_locked(m);
1207		vm_pagequeue_unlock(pq);
1208		queue_locked = FALSE;
1209
1210		/*
1211	 * Invalid pages can be easily freed.  They cannot be
1212	 * mapped; vm_page_free() asserts this.
1213		 */
1214		if (m->valid == 0)
1215			goto free_page;
1216
1217		/*
1218		 * If the page has been referenced and the object is not dead,
1219		 * reactivate or requeue the page depending on whether the
1220		 * object is mapped.
1221		 */
1222		if ((m->aflags & PGA_REFERENCED) != 0) {
1223			vm_page_aflag_clear(m, PGA_REFERENCED);
1224			act_delta = 1;
1225		} else
1226			act_delta = 0;
1227		if (object->ref_count != 0) {
1228			act_delta += pmap_ts_referenced(m);
1229		} else {
1230			KASSERT(!pmap_page_is_mapped(m),
1231			    ("vm_pageout_scan: page %p is mapped", m));
1232		}
1233		if (act_delta != 0) {
1234			if (object->ref_count != 0) {
1235				PCPU_INC(cnt.v_reactivated);
1236				vm_page_activate(m);
1237
1238				/*
1239				 * Increase the activation count if the page
1240				 * was referenced while in the inactive queue.
1241				 * This makes it less likely that the page will
1242				 * be returned prematurely to the inactive
1243				 * queue.
1244 				 */
1245				m->act_count += act_delta + ACT_ADVANCE;
1246				goto drop_page;
1247			} else if ((object->flags & OBJ_DEAD) == 0) {
1248				vm_pagequeue_lock(pq);
1249				queue_locked = TRUE;
1250				m->queue = PQ_INACTIVE;
1251				TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
1252				vm_pagequeue_cnt_inc(pq);
1253				goto drop_page;
1254			}
1255		}
1256
1257		/*
1258		 * If the page appears to be clean at the machine-independent
1259		 * layer, then remove all of its mappings from the pmap in
1260		 * anticipation of freeing it.  If, however, any of the page's
1261		 * mappings allow write access, then the page may still be
1262		 * modified until the last of those mappings are removed.
1263		 */
1264		if (object->ref_count != 0) {
1265			vm_page_test_dirty(m);
1266			if (m->dirty == 0)
1267				pmap_remove_all(m);
1268		}
1269
1270		/*
1271		 * Clean pages can be freed, but dirty pages must be sent back
1272		 * to the laundry, unless they belong to a dead object.
1273		 * Requeueing dirty pages from dead objects is pointless, as
1274		 * they are being paged out and freed by the thread that
1275		 * destroyed the object.
1276		 */
1277		if (m->dirty == 0) {
1278free_page:
1279			vm_page_free(m);
1280			PCPU_INC(cnt.v_dfree);
1281			--page_shortage;
1282		} else if ((object->flags & OBJ_DEAD) == 0)
1283			vm_page_launder(m);
1284drop_page:
1285		vm_page_unlock(m);
1286		VM_OBJECT_WUNLOCK(object);
1287		if (!queue_locked) {
1288			vm_pagequeue_lock(pq);
1289			queue_locked = TRUE;
1290		}
1291		next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
1292		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
1293	}
1294	vm_pagequeue_unlock(pq);
1295
1296	/*
1297	 * Wake up the laundry thread so that it can perform any needed
1298	 * laundering.  If we didn't meet our target, we're in shortfall and
1299	 * need to launder more aggressively.
1300	 */
1301	if (vm_laundry_request == VM_LAUNDRY_IDLE &&
1302	    starting_page_shortage > 0) {
1303		pq = &vm_dom[0].vmd_pagequeues[PQ_LAUNDRY];
1304		vm_pagequeue_lock(pq);
1305		if (page_shortage > 0) {
1306			vm_laundry_request = VM_LAUNDRY_SHORTFALL;
1307			PCPU_INC(cnt.v_pdshortfalls);
1308		} else if (vm_laundry_request != VM_LAUNDRY_SHORTFALL)
1309			vm_laundry_request = VM_LAUNDRY_BACKGROUND;
1310		wakeup(&vm_laundry_request);
1311		vm_pagequeue_unlock(pq);
1312	}
1313
1314	/*
1315	 * Wakeup the swapout daemon if we didn't free the targeted number of
1316	 * pages.
1317	 */
1318	if (page_shortage > 0)
1319		vm_swapout_run();
1320
1321	/*
1322	 * If the inactive queue scan fails repeatedly to meet its
1323	 * target, kill the largest process.
1324	 */
1325	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
1326
1327	/*
1328	 * Compute the number of pages we want to try to move from the
1329	 * active queue to either the inactive or laundry queue.
1330	 *
1331	 * When scanning active pages, we make clean pages count more heavily
1332	 * towards the page shortage than dirty pages.  This is because dirty
1333	 * pages must be laundered before they can be reused and thus have less
1334	 * utility when attempting to quickly alleviate a shortage.  However,
1335	 * this weighting also causes the scan to deactivate dirty pages
1336	 * more aggressively, improving the effectiveness of clustering and
1337	 * ensuring that they can eventually be reused.
1338	 */
1339	inactq_shortage = vm_cnt.v_inactive_target - (vm_cnt.v_inactive_count +
1340	    vm_cnt.v_laundry_count / act_scan_laundry_weight) +
1341	    vm_paging_target() + deficit + addl_page_shortage;
1342	inactq_shortage *= act_scan_laundry_weight;
1343
1344	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
1345	vm_pagequeue_lock(pq);
1346	maxscan = pq->pq_cnt;
1347
1348	/*
1349	 * If we're just idle polling, attempt to visit every
1350	 * active page within 'update_period' seconds.
1351	 */
1352	scan_tick = ticks;
1353	if (vm_pageout_update_period != 0) {
1354		min_scan = pq->pq_cnt;
1355		min_scan *= scan_tick - vmd->vmd_last_active_scan;
1356		min_scan /= hz * vm_pageout_update_period;
1357	} else
1358		min_scan = 0;
1359	if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0))
1360		vmd->vmd_last_active_scan = scan_tick;
1361
1362	/*
1363	 * Scan the active queue for pages that can be deactivated.  Update
1364	 * the per-page activity counter and use it to identify deactivation
1365	 * candidates.  Held pages may be deactivated.
1366	 */
1367	for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned <
1368	    min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next,
1369	    scanned++) {
1370		KASSERT(m->queue == PQ_ACTIVE,
1371		    ("vm_pageout_scan: page %p isn't active", m));
1372		next = TAILQ_NEXT(m, plinks.q);
1373		if ((m->flags & PG_MARKER) != 0)
1374			continue;
1375		KASSERT((m->flags & PG_FICTITIOUS) == 0,
1376		    ("Fictitious page %p cannot be in active queue", m));
1377		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1378		    ("Unmanaged page %p cannot be in active queue", m));
1379		if (!vm_pageout_page_lock(m, &next)) {
1380			vm_page_unlock(m);
1381			continue;
1382		}
1383
1384		/*
1385		 * The count for page daemon pages is updated after checking
1386		 * the page for eligibility.
1387		 */
1388		PCPU_INC(cnt.v_pdpages);
1389
1390		/*
1391		 * Check to see "how much" the page has been used.
1392		 */
1393		if ((m->aflags & PGA_REFERENCED) != 0) {
1394			vm_page_aflag_clear(m, PGA_REFERENCED);
1395			act_delta = 1;
1396		} else
1397			act_delta = 0;
1398
1399		/*
1400		 * Perform an unsynchronized object ref count check.  While
1401		 * the page lock ensures that the page is not reallocated to
1402		 * another object, in particular, one with unmanaged mappings
1403		 * that cannot support pmap_ts_referenced(), two races are,
1404		 * nonetheless, possible:
1405		 * 1) The count was transitioning to zero, but we saw a non-
1406		 *    zero value.  pmap_ts_referenced() will return zero
1407		 *    because the page is not mapped.
1408		 * 2) The count was transitioning to one, but we saw zero.
1409		 *    This race delays the detection of a new reference.  At
1410		 *    worst, we will deactivate and reactivate the page.
1411		 */
1412		if (m->object->ref_count != 0)
1413			act_delta += pmap_ts_referenced(m);
1414
1415		/*
1416		 * Advance or decay the act_count based on recent usage.
1417		 */
1418		if (act_delta != 0) {
1419			m->act_count += ACT_ADVANCE + act_delta;
1420			if (m->act_count > ACT_MAX)
1421				m->act_count = ACT_MAX;
1422		} else
1423			m->act_count -= min(m->act_count, ACT_DECLINE);
1424
1425		/*
1426		 * Move this page to the tail of the active, inactive or laundry
1427		 * queue depending on usage.
1428		 */
1429		if (m->act_count == 0) {
1430			/* Dequeue to avoid later lock recursion. */
1431			vm_page_dequeue_locked(m);
1432
1433			/*
1434			 * When not short for inactive pages, let dirty pages go
1435			 * through the inactive queue before moving to the
1436			 * laundry queues.  This gives them some extra time to
1437			 * be reactivated, potentially avoiding an expensive
1438			 * pageout.  During a page shortage, the inactive queue
1439			 * is necessarily small, so we may move dirty pages
1440			 * directly to the laundry queue.
1441			 */
1442			if (inactq_shortage <= 0)
1443				vm_page_deactivate(m);
1444			else {
1445				/*
1446				 * Calling vm_page_test_dirty() here would
1447				 * require acquisition of the object's write
1448				 * lock.  However, during a page shortage,
1449				 * directing dirty pages into the laundry
1450				 * queue is only an optimization and not a
1451				 * requirement.  Therefore, we simply rely on
1452				 * the opportunistic updates to the page's
1453				 * dirty field by the pmap.
1454				 */
1455				if (m->dirty == 0) {
1456					vm_page_deactivate(m);
1457					inactq_shortage -=
1458					    act_scan_laundry_weight;
1459				} else {
1460					vm_page_launder(m);
1461					inactq_shortage--;
1462				}
1463			}
1464		} else
1465			vm_page_requeue_locked(m);
1466		vm_page_unlock(m);
1467	}
1468	vm_pagequeue_unlock(pq);
1469	if (pass > 0)
1470		vm_swapout_run_idle();
1471	return (page_shortage <= 0);
1472}
1473
1474static int vm_pageout_oom_vote;
1475
1476/*
1477 * The pagedaemon threads randomly select one to perform the
1478 * OOM.  Trying to kill processes before all pagedaemons have
1479 * failed to reach the free page target is premature.
1480 */
1481static void
1482vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
1483    int starting_page_shortage)
1484{
1485	int old_vote;
1486
1487	if (starting_page_shortage <= 0 || starting_page_shortage !=
1488	    page_shortage)
1489		vmd->vmd_oom_seq = 0;
1490	else
1491		vmd->vmd_oom_seq++;
1492	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
1493		if (vmd->vmd_oom) {
1494			vmd->vmd_oom = FALSE;
1495			atomic_subtract_int(&vm_pageout_oom_vote, 1);
1496		}
1497		return;
1498	}
1499
1500	/*
1501	 * Do not follow the call sequence until the OOM condition is
1502	 * cleared.
1503	 */
1504	vmd->vmd_oom_seq = 0;
1505
1506	if (vmd->vmd_oom)
1507		return;
1508
1509	vmd->vmd_oom = TRUE;
1510	old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
1511	if (old_vote != vm_ndomains - 1)
1512		return;
1513
1514	/*
1515	 * The current pagedaemon thread is the last in the quorum to
1516	 * start OOM.  Initiate the selection and signaling of the
1517	 * victim.
1518	 */
1519	vm_pageout_oom(VM_OOM_MEM);
1520
1521	/*
1522	 * next pass, the current pagedaemon will vote again if the low
1523	 * next pass, current pagedaemon would vote again if the low
1524	 * memory condition is still there, due to vmd_oom being
1525	 * false.
1526	 */
1527	vmd->vmd_oom = FALSE;
1528	atomic_subtract_int(&vm_pageout_oom_vote, 1);
1529}
1530
1531/*
1532 * The OOM killer is the page daemon's action of last resort when
1533 * memory allocation requests have been stalled for a prolonged period
1534 * of time because it cannot reclaim memory.  This function computes
1535 * the approximate number of physical pages that could be reclaimed if
1536 * the specified address space is destroyed.
1537 *
1538 * Private, anonymous memory owned by the address space is the
1539 * principal resource that we expect to recover after an OOM kill.
1540 * Since the physical pages mapped by the address space's COW entries
1541 * are typically shared pages, they are unlikely to be released and so
1542 * they are not counted.
1543 *
1544 * To get to the point where the page daemon runs the OOM killer, its
1545 * efforts to write-back vnode-backed pages may have stalled.  This
1546 * could be caused by a memory allocation deadlock in the write path
1547 * that might be resolved by an OOM kill.  Therefore, physical pages
1548 * belonging to vnode-backed objects are counted, because they might
1549 * be freed without being written out first if the address space holds
1550 * the last reference to an unlinked vnode.
1551 *
1552 * Similarly, physical pages belonging to OBJT_PHYS objects are
1553 * counted because the address space might hold the last reference to
1554 * the object.
1555 */
1556static long
1557vm_pageout_oom_pagecount(struct vmspace *vmspace)
1558{
1559	vm_map_t map;
1560	vm_map_entry_t entry;
1561	vm_object_t obj;
1562	long res;
1563
1564	map = &vmspace->vm_map;
1565	KASSERT(!map->system_map, ("system map"));
1566	sx_assert(&map->lock, SA_LOCKED);
1567	res = 0;
1568	for (entry = map->header.next; entry != &map->header;
1569	    entry = entry->next) {
1570		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
1571			continue;
1572		obj = entry->object.vm_object;
1573		if (obj == NULL)
1574			continue;
1575		if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 &&
1576		    obj->ref_count != 1)
1577			continue;
1578		switch (obj->type) {
1579		case OBJT_DEFAULT:
1580		case OBJT_SWAP:
1581		case OBJT_PHYS:
1582		case OBJT_VNODE:
1583			res += obj->resident_page_count;
1584			break;
1585		}
1586	}
1587	return (res);
1588}
1589
1590void
1591vm_pageout_oom(int shortage)
1592{
1593	struct proc *p, *bigproc;
1594	vm_offset_t size, bigsize;
1595	struct thread *td;
1596	struct vmspace *vm;
1597	bool breakout;
1598
1599	/*
1600	 * We keep the process bigproc locked once we find it to keep anyone
1601	 * from messing with it; however, there is a possibility of
1602	 * deadlock if process B is bigproc and one of its child processes
1603	 * attempts to propagate a signal to B while we are waiting for A's
1604	 * lock while walking this list.  To avoid this, we don't block on
1605	 * the process lock but just skip a process if it is already locked.
1606	 */
1607	bigproc = NULL;
1608	bigsize = 0;
1609	sx_slock(&allproc_lock);
1610	FOREACH_PROC_IN_SYSTEM(p) {
1611		PROC_LOCK(p);
1612
1613		/*
1614		 * If this is a system, protected or killed process, skip it.
1615		 */
1616		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
1617		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
1618		    p->p_pid == 1 || P_KILLED(p) ||
1619		    (p->p_pid < 48 && swap_pager_avail != 0)) {
1620			PROC_UNLOCK(p);
1621			continue;
1622		}
1623		/*
1624		 * If the process is in a non-running type state,
1625		 * don't touch it.  Check all the threads individually.
1626		 */
1627		breakout = false;
1628		FOREACH_THREAD_IN_PROC(p, td) {
1629			thread_lock(td);
1630			if (!TD_ON_RUNQ(td) &&
1631			    !TD_IS_RUNNING(td) &&
1632			    !TD_IS_SLEEPING(td) &&
1633			    !TD_IS_SUSPENDED(td) &&
1634			    !TD_IS_SWAPPED(td)) {
1635				thread_unlock(td);
1636				breakout = true;
1637				break;
1638			}
1639			thread_unlock(td);
1640		}
1641		if (breakout) {
1642			PROC_UNLOCK(p);
1643			continue;
1644		}
1645		/*
1646		 * get the process size
1647		 */
1648		vm = vmspace_acquire_ref(p);
1649		if (vm == NULL) {
1650			PROC_UNLOCK(p);
1651			continue;
1652		}
1653		_PHOLD_LITE(p);
1654		PROC_UNLOCK(p);
1655		sx_sunlock(&allproc_lock);
1656		if (!vm_map_trylock_read(&vm->vm_map)) {
1657			vmspace_free(vm);
1658			sx_slock(&allproc_lock);
1659			PRELE(p);
1660			continue;
1661		}
1662		size = vmspace_swap_count(vm);
1663		if (shortage == VM_OOM_MEM)
1664			size += vm_pageout_oom_pagecount(vm);
1665		vm_map_unlock_read(&vm->vm_map);
1666		vmspace_free(vm);
1667		sx_slock(&allproc_lock);
1668
1669		/*
1670		 * If this process is bigger than the biggest one,
1671		 * remember it.
1672		 */
1673		if (size > bigsize) {
1674			if (bigproc != NULL)
1675				PRELE(bigproc);
1676			bigproc = p;
1677			bigsize = size;
1678		} else {
1679			PRELE(p);
1680		}
1681	}
1682	sx_sunlock(&allproc_lock);
1683	if (bigproc != NULL) {
1684		if (vm_panic_on_oom != 0)
1685			panic("out of swap space");
1686		PROC_LOCK(bigproc);
1687		killproc(bigproc, "out of swap space");
1688		sched_nice(bigproc, PRIO_MIN);
1689		_PRELE(bigproc);
1690		PROC_UNLOCK(bigproc);
1691		wakeup(&vm_cnt.v_free_count);
1692	}
1693}
1694
1695static void
1696vm_pageout_worker(void *arg)
1697{
1698	struct vm_domain *domain;
1699	int domidx, pass;
1700	bool target_met;
1701
1702	domidx = (uintptr_t)arg;
1703	domain = &vm_dom[domidx];
1704	pass = 0;
1705	target_met = true;
1706
1707	/*
1708	 * XXXKIB It could be useful to bind pageout daemon threads to
1709	 * the cores belonging to the domain, from which vm_page_array
1710	 * is allocated.
1711	 */
1712
1713	KASSERT(domain->vmd_segs != 0, ("domain without segments"));
1714	domain->vmd_last_active_scan = ticks;
1715	vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE);
1716	vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE);
1717	TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl,
1718	    &domain->vmd_inacthead, plinks.q);
1719
1720	/*
1721	 * The pageout daemon worker is never done, so loop forever.
1722	 */
1723	while (TRUE) {
1724		mtx_lock(&vm_page_queue_free_mtx);
1725
1726		/*
1727		 * Generally, after a level >= 1 scan, if there are enough
1728		 * free pages to wakeup the waiters, then they are already
1729		 * awake.  A call to vm_page_free() during the scan awakened
1730		 * them.  However, in the following case, this wakeup serves
1731		 * to bound the amount of time that a thread might wait.
1732		 * Suppose a thread's call to vm_page_alloc() fails, but
1733		 * before that thread calls VM_WAIT, enough pages are freed by
1734		 * other threads to alleviate the free page shortage.  The
1735		 * thread will, nonetheless, wait until another page is freed
1736		 * or this wakeup is performed.
1737		 */
1738		if (vm_pages_needed && !vm_page_count_min()) {
1739			vm_pages_needed = false;
1740			wakeup(&vm_cnt.v_free_count);
1741		}
1742
1743		/*
1744		 * Do not clear vm_pageout_wanted until we reach our free page
1745		 * target.  Otherwise, we may be awakened over and over again,
1746		 * wasting CPU time.
1747		 */
1748		if (vm_pageout_wanted && target_met)
1749			vm_pageout_wanted = false;
1750
1751		/*
1752		 * Might the page daemon receive a wakeup call?
1753		 */
1754		if (vm_pageout_wanted) {
1755			/*
1756			 * No.  Either vm_pageout_wanted was set by another
1757			 * thread during the previous scan, which must have
1758			 * been a level 0 scan, or vm_pageout_wanted was
1759			 * already set and the scan failed to free enough
1760			 * pages.  If we haven't yet performed a level >= 1
1761			 * (page reclamation) scan, then increase the level
1762			 * and scan again now.  Otherwise, sleep a bit and
1763			 * try again later.
1764			 */
1765			mtx_unlock(&vm_page_queue_free_mtx);
1766			if (pass >= 1)
1767				pause("pwait", hz / VM_INACT_SCAN_RATE);
1768			pass++;
1769		} else {
1770			/*
1771			 * Yes.  If threads are still sleeping in VM_WAIT
1772			 * then we immediately start a new scan.  Otherwise,
1773			 * sleep until the next wakeup or until pages need to
1774			 * have their reference stats updated.
1775			 */
1776			if (vm_pages_needed) {
1777				mtx_unlock(&vm_page_queue_free_mtx);
1778				if (pass == 0)
1779					pass++;
1780			} else if (mtx_sleep(&vm_pageout_wanted,
1781			    &vm_page_queue_free_mtx, PDROP | PVM, "psleep",
1782			    hz) == 0) {
1783				PCPU_INC(cnt.v_pdwakeups);
1784				pass = 1;
1785			} else
1786				pass = 0;
1787		}
1788
1789		target_met = vm_pageout_scan(domain, pass);
1790	}
1791}
1792
1793/*
1794 *	vm_pageout_init initialises basic pageout daemon settings.
1795 */
1796static void
1797vm_pageout_init(void)
1798{
1799	/*
1800	 * Initialize some paging parameters.
1801	 */
1802	vm_cnt.v_interrupt_free_min = 2;
1803	if (vm_cnt.v_page_count < 2000)
1804		vm_pageout_page_count = 8;
1805
1806	/*
1807	 * v_free_reserved needs to include enough for the largest
1808	 * swap pager structures plus enough for any pv_entry structs
1809	 * when paging.
1810	 */
1811	if (vm_cnt.v_page_count > 1024)
1812		vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200;
1813	else
1814		vm_cnt.v_free_min = 4;
1815	vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
1816	    vm_cnt.v_interrupt_free_min;
1817	vm_cnt.v_free_reserved = vm_pageout_page_count +
1818	    vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768);
1819	vm_cnt.v_free_severe = vm_cnt.v_free_min / 2;
1820	vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved;
1821	vm_cnt.v_free_min += vm_cnt.v_free_reserved;
1822	vm_cnt.v_free_severe += vm_cnt.v_free_reserved;
1823	vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2;
1824	if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3)
1825		vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
1826
1827	/*
1828	 * Set the default wakeup threshold to be 10% above the minimum
1829	 * page limit.  This keeps the steady state out of shortfall.
1830	 */
1831	vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11;
1832
1833	/*
1834	 * Set interval in seconds for active scan.  We want to visit each
1835	 * page at least once every ten minutes.  This is to prevent worst
1836	 * case paging behaviors with stale active LRU.
1837	 */
1838	if (vm_pageout_update_period == 0)
1839		vm_pageout_update_period = 600;
1840
1841	/* XXX does not really belong here */
1842	if (vm_page_max_wired == 0)
1843		vm_page_max_wired = vm_cnt.v_free_count / 3;
1844
1845	/*
1846	 * Target amount of memory to move out of the laundry queue during a
1847	 * background laundering.  This is proportional to the amount of system
1848	 * memory.
1849	 */
1850	vm_background_launder_target = (vm_cnt.v_free_target -
1851	    vm_cnt.v_free_min) / 10;
1852}
1853
1854/*
1855 *     vm_pageout is the high level pageout daemon.
1856 */
1857static void
1858vm_pageout(void)
1859{
1860	int error;
1861#ifdef VM_NUMA_ALLOC
1862	int i;
1863#endif
1864
1865	swap_pager_swap_init();
1866	snprintf(curthread->td_name, sizeof(curthread->td_name), "dom0");
1867	error = kthread_add(vm_pageout_laundry_worker, NULL, curproc, NULL,
1868	    0, 0, "laundry: dom0");
1869	if (error != 0)
1870		panic("starting laundry for domain 0, error %d", error);
1871#ifdef VM_NUMA_ALLOC
1872	for (i = 1; i < vm_ndomains; i++) {
1873		error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i,
1874		    curproc, NULL, 0, 0, "dom%d", i);
1875		if (error != 0) {
1876			panic("starting pageout for domain %d, error %d\n",
1877			    i, error);
1878		}
1879	}
1880#endif
1881	error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL,
1882	    0, 0, "uma");
1883	if (error != 0)
1884		panic("starting uma_reclaim helper, error %d\n", error);
1885	vm_pageout_worker((void *)(uintptr_t)0);
1886}
1887
1888/*
1889 * Perform an advisory wakeup of the page daemon.
1890 */
1891void
1892pagedaemon_wakeup(void)
1893{
1894
1895	mtx_assert(&vm_page_queue_free_mtx, MA_NOTOWNED);
1896
1897	if (!vm_pageout_wanted && curthread->td_proc != pageproc) {
1898		vm_pageout_wanted = true;
1899		wakeup(&vm_pageout_wanted);
1900	}
1901}
1902
1903/*
1904 * Wake up the page daemon and wait for it to reclaim free pages.
1905 *
1906 * This function returns with the free queues mutex unlocked.
1907 */
1908void
1909pagedaemon_wait(int pri, const char *wmesg)
1910{
1911
1912	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
1913
1914	/*
1915	 * vm_pageout_wanted may have been set by an advisory wakeup, but if the
1916	 * page daemon is running on a CPU, the wakeup will have been lost.
1917	 * Thus, deliver a potentially spurious wakeup to ensure that the page
1918	 * daemon has been notified of the shortage.
1919	 */
1920	if (!vm_pageout_wanted || !vm_pages_needed) {
1921		vm_pageout_wanted = true;
1922		wakeup(&vm_pageout_wanted);
1923	}
1924	vm_pages_needed = true;
1925	msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | pri,
1926	    wmesg, 0);
1927}
1928