vm_pageout.c revision 284665
/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005 Yahoo! Technologies Norway AS
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	The proverbial page-out daemon.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pageout.c 284665 2015-06-21 06:28:26Z trasz $");

#include "opt_vm.h"
#include "opt_kdtrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/mount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout"*/
static void vm_pageout(void);
static void vm_pageout_init(void);
static int vm_pageout_clean(vm_page_t);
static void vm_pageout_scan(struct vm_domain *vmd, int pass);
static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);

SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
    NULL);

struct proc *pageproc;

static struct kproc_desc page_kp = {
	"pagedaemon",
	vm_pageout,
	&pageproc
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
    &page_kp);

SDT_PROVIDER_DEFINE(vm);
SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon"*/
static void vm_daemon(void);
static struct proc *vmproc;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmproc
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif


int vm_pages_needed;		/* Event on which pageout daemon sleeps */
int vm_pageout_deficit;		/* Estimated number of pages deficit */
int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */
int vm_pageout_wakeup_thresh;

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;	/* XXX */
static int vm_daemon_needed;
static struct mtx vm_daemon_mtx;
/* Allow for use by vm_pageout before vm_daemon is initialized. */
MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF);
#endif
static int vm_max_launder = 32;
static int vm_pageout_update_period;
static int defer_swap_pageouts;
static int disable_swap_pageouts;
static int lowmem_period = 10;
static int lowmem_ticks;

#if defined(NO_SWAPPING)
static int vm_swap_enabled = 0;
static int vm_swap_idle_enabled = 0;
#else
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;
#endif

SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh,
	CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0,
	"free page threshold for waking up the pageout daemon");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
	CTLFLAG_RW, &vm_pageout_update_period, 0,
	"Maximum active LRU update period");

SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0,
	"Low memory callback period");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
SYSCTL_INT(_vm, OID_AUTO, max_wired,
	CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count");

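/*
 * The knobs defined above appear under the "vm." sysctl tree, so they can be
 * inspected and, where marked CTLFLAG_RW, tuned at run time with sysctl(8).
 * The values shown here are examples only; suitable settings depend on the
 * workload:
 *
 *	sysctl vm.max_launder			# dirty flushes allowed per scan
 *	sysctl vm.max_launder=64		# permit more laundering per pass
 *	sysctl vm.pageout_update_period=600	# active LRU sweep period, seconds
 *	sysctl vm.pageout_lock_miss		# read-only: vget() misses in pageout
 */
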
static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *);
static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t,
    vm_paddr_t);
#if !defined(NO_SWAPPING)
static void vm_pageout_map_deactivate_pages(vm_map_t, long);
static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
static void vm_req_vmdaemon(int req);
#endif
static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *);

/*
 * Initialize a dummy page for marking the caller's place in the specified
 * paging queue.  In principle, this function only needs to set the flag
 * PG_MARKER.  Nonetheless, it write busies and initializes the hold count
 * to one as safety precautions.
 */
static void
vm_pageout_init_marker(vm_page_t marker, u_short queue)
{

	bzero(marker, sizeof(*marker));
	marker->flags = PG_MARKER;
	marker->busy_lock = VPB_SINGLE_EXCLUSIVER;
	marker->queue = queue;
	marker->hold_count = 1;
}

/*
 * vm_pageout_fallback_object_lock:
 *
 * Lock vm object currently associated with `m'.  VM_OBJECT_TRYWLOCK is
 * known to have failed and page queue must be either PQ_ACTIVE or
 * PQ_INACTIVE.  To avoid lock order violation, unlock the page queues
 * while locking the vm object.  Use marker page to detect page queue
 * changes and maintain notion of next page on page queue.  Return
 * TRUE if no changes were detected, FALSE otherwise.  vm object is
 * locked on return.
 *
 * This function depends on both the lock portion of struct vm_object
 * and normal struct vm_page being type stable.
 */
static boolean_t
vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
{
	struct vm_page marker;
	struct vm_pagequeue *pq;
	boolean_t unchanged;
	u_short queue;
	vm_object_t object;

	queue = m->queue;
	vm_pageout_init_marker(&marker, queue);
	pq = vm_page_pagequeue(m);
	object = m->object;

	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
	vm_pagequeue_unlock(pq);
	vm_page_unlock(m);
	VM_OBJECT_WLOCK(object);
	vm_page_lock(m);
	vm_pagequeue_lock(pq);

	/* Page queue might have changed. */
	*next = TAILQ_NEXT(&marker, plinks.q);
	unchanged = (m->queue == queue &&
	    m->object == object &&
	    &marker == TAILQ_NEXT(m, plinks.q));
	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
	return (unchanged);
}

/*
 * Lock the page while holding the page queue lock.  Use marker page
 * to detect page queue changes and maintain notion of next page on
 * page queue.  Return TRUE if no changes were detected, FALSE
 * otherwise.  The page is locked on return.  The page queue lock might
 * be dropped and reacquired.
 *
 * This function depends on normal struct vm_page being type stable.
 */
static boolean_t
vm_pageout_page_lock(vm_page_t m, vm_page_t *next)
{
	struct vm_page marker;
	struct vm_pagequeue *pq;
	boolean_t unchanged;
	u_short queue;

	vm_page_lock_assert(m, MA_NOTOWNED);
	if (vm_page_trylock(m))
		return (TRUE);

	queue = m->queue;
	vm_pageout_init_marker(&marker, queue);
	pq = vm_page_pagequeue(m);

	TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q);
	vm_pagequeue_unlock(pq);
	vm_page_lock(m);
	vm_pagequeue_lock(pq);

	/* Page queue might have changed. */
	*next = TAILQ_NEXT(&marker, plinks.q);
	unchanged = (m->queue == queue && &marker == TAILQ_NEXT(m, plinks.q));
	TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q);
	return (unchanged);
}

/*
 * vm_pageout_clean:
 *
 * Clean the page and remove it from the laundry.
 *
 * We set the busy bit to cause potential page faults on this page to
 * block.  Note the careful timing, however, the busy bit isn't set till
 * late and we cannot do anything that will mess with the page.
 */
static int
vm_pageout_clean(vm_page_t m)
{
	vm_object_t object;
	vm_page_t mc[2*vm_pageout_page_count], pb, ps;
	int pageout_count;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	vm_page_lock_assert(m, MA_OWNED);
	object = m->object;
	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
	 * with the new swapper, but we could have serious problems paging
	 * out other object types if there is insufficient memory.
	 *
	 * Unfortunately, checking free memory here is far too late, so the
	 * check has been moved up a procedural level.
	 */

	/*
	 * Can't clean the page if it's busy or held.
	 */
	vm_page_assert_unbusied(m);
	KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m));
	vm_page_unlock(m);

	mc[vm_pageout_page_count] = pb = ps = m;
	pageout_count = 1;
	page_base = vm_pageout_page_count;
	ib = 1;
	is = 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
more:
	while (ib && pageout_count < vm_pageout_page_count) {
		vm_page_t p;

		if (ib > pindex) {
			ib = 0;
			break;
		}

		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
			ib = 0;
			break;
		}
		vm_page_lock(p);
		vm_page_test_dirty(p);
		if (p->dirty == 0 ||
		    p->queue != PQ_INACTIVE ||
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_unlock(p);
			ib = 0;
			break;
		}
		vm_page_unlock(p);
		mc[--page_base] = pb = p;
		++pageout_count;
		++ib;
		/*
		 * alignment boundary, stop here and switch directions.  Do
		 * not clear ib.
		 */
		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
			break;
	}

	while (pageout_count < vm_pageout_page_count &&
	    pindex + is < object->size) {
		vm_page_t p;

		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
			break;
		vm_page_lock(p);
		vm_page_test_dirty(p);
		if (p->dirty == 0 ||
		    p->queue != PQ_INACTIVE ||
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_unlock(p);
			break;
		}
		vm_page_unlock(p);
		mc[page_base + pageout_count] = ps = p;
		++pageout_count;
		++is;
	}

	/*
	 * If we exhausted our forward scan, continue with the reverse scan
	 * when possible, even past a page boundary.  This catches boundary
	 * conditions.
	 */
	if (ib && pageout_count < vm_pageout_page_count)
		goto more;

	/*
	 * we allow reads during pageouts...
	 */
	return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL,
	    NULL));
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 *	The given pages are laundered.  Note that we setup for the start of
 *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
 *	reference count all in here rather than in the parent.  If we want
 *	the parent to do more sophisticated things we may have to change
 *	the ordering.
 *
 *	Returned runlen is the count of pages between mreq and first
 *	page after mreq with status VM_PAGER_AGAIN.
 *	*eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
 *	for any page in runlen set.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
    boolean_t *eio)
{
	vm_object_t object = mc[0]->object;
	int pageout_status[count];
	int numpagedout = 0;
	int i, runlen;

	VM_OBJECT_ASSERT_WLOCKED(object);

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter and
	 * mark the pages read-only.
	 *
	 * We do not have to fixup the clean/dirty bits here... we can
	 * allow the pager to do it after the I/O completes.
	 *
	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
	 * edge case with file fragments.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
			mc[i], i, count));
		vm_page_sbusy(mc[i]);
		pmap_remove_write(mc[i]);
	}
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count, flags, pageout_status);

	runlen = count - mreq;
	if (eio != NULL)
		*eio = FALSE;
	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
		    !pmap_page_is_write_mapped(mt),
		    ("vm_pageout_flush: page %p is not write protected", mt));
		switch (pageout_status[i]) {
		case VM_PAGER_OK:
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_undirty(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * If page couldn't be paged out, then reactivate the
			 * page so it doesn't clog the inactive list.  (We
			 * will try paging it out again later).
			 */
			vm_page_lock(mt);
			vm_page_activate(mt);
			vm_page_unlock(mt);
			if (eio != NULL && i >= mreq && i - mreq < runlen)
				*eio = TRUE;
			break;
		case VM_PAGER_AGAIN:
			if (i >= mreq && i - mreq < runlen)
				runlen = i - mreq;
			break;
		}

		/*
		 * If the operation is still going, leave the page busy to
		 * block all other accesses.  Also, leave the paging in
		 * progress indicator set so that we don't attempt an object
		 * collapse.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_object_pip_wakeup(object);
			vm_page_sunbusy(mt);
			if (vm_page_count_severe()) {
				vm_page_lock(mt);
				vm_page_try_to_cache(mt);
				vm_page_unlock(mt);
			}
		}
	}
	if (prunlen != NULL)
		*prunlen = runlen;
	return (numpagedout);
}

static boolean_t
vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low,
    vm_paddr_t high)
{
	struct mount *mp;
	struct vnode *vp;
	vm_object_t object;
	vm_paddr_t pa;
	vm_page_t m, m_tmp, next;
	int lockmode;

	vm_pagequeue_lock(pq);
	TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) {
		if ((m->flags & PG_MARKER) != 0)
			continue;
		pa = VM_PAGE_TO_PHYS(m);
		if (pa < low || pa + PAGE_SIZE > high)
			continue;
		if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) {
			vm_page_unlock(m);
			continue;
		}
		object = m->object;
		if ((!VM_OBJECT_TRYWLOCK(object) &&
		    (!vm_pageout_fallback_object_lock(m, &next) ||
		    m->hold_count != 0)) || vm_page_busied(m)) {
			vm_page_unlock(m);
			VM_OBJECT_WUNLOCK(object);
			continue;
		}
		vm_page_test_dirty(m);
		if (m->dirty == 0 && object->ref_count != 0)
			pmap_remove_all(m);
		if (m->dirty != 0) {
			vm_page_unlock(m);
			if (tries == 0 || (object->flags & OBJ_DEAD) != 0) {
				VM_OBJECT_WUNLOCK(object);
				continue;
			}
			if (object->type == OBJT_VNODE) {
				vm_pagequeue_unlock(pq);
				vp = object->handle;
				vm_object_reference_locked(object);
				VM_OBJECT_WUNLOCK(object);
				(void)vn_start_write(vp, &mp, V_WAIT);
				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
				    LK_SHARED : LK_EXCLUSIVE;
				vn_lock(vp, lockmode | LK_RETRY);
				VM_OBJECT_WLOCK(object);
				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
				VM_OBJECT_WUNLOCK(object);
				VOP_UNLOCK(vp, 0);
				vm_object_deallocate(object);
				vn_finished_write(mp);
				return (TRUE);
			} else if (object->type == OBJT_SWAP ||
			    object->type == OBJT_DEFAULT) {
				vm_pagequeue_unlock(pq);
				m_tmp = m;
				vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC,
				    0, NULL, NULL);
				VM_OBJECT_WUNLOCK(object);
				return (TRUE);
			}
		} else {
			/*
			 * Dequeue here to prevent lock recursion in
			 * vm_page_cache().
			 */
			vm_page_dequeue_locked(m);
			vm_page_cache(m);
			vm_page_unlock(m);
		}
		VM_OBJECT_WUNLOCK(object);
	}
	vm_pagequeue_unlock(pq);
	return (FALSE);
}

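/*
 * vm_pageout_launder() above caches clean pages in the requested physical
 * range and, when "tries" permits, synchronously launders at most one dirty
 * page (or vnode object) per call, returning TRUE in that case.  Callers
 * that need free pages inside a specific physical range typically retry the
 * allocation with increasing "tries" so each attempt may work harder; a
 * rough sketch only, where try_contig_alloc() is a hypothetical helper and
 * not a real KPI:
 *
 *	for (tries = 0; tries < 3; tries++) {
 *		if (try_contig_alloc(low, high))
 *			break;
 *		vm_pageout_grow_cache(tries, low, high);
 *	}
 *
 * The meaning of each "tries" level is documented for vm_pageout_grow_cache()
 * below.
 */
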
/*
 * Increase the number of cached pages.  The specified value, "tries",
 * determines which categories of pages are cached:
 *
 *  0: All clean, inactive pages within the specified physical address range
 *     are cached.  Will not sleep.
 *  1: The vm_lowmem handlers are called.  All inactive pages within
 *     the specified physical address range are cached.  May sleep.
 *  2: The vm_lowmem handlers are called.  All inactive and active pages
 *     within the specified physical address range are cached.  May sleep.
 */
void
vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high)
{
	int actl, actmax, inactl, inactmax, dom, initial_dom;
	static int start_dom = 0;

	if (tries > 0) {
		/*
		 * Decrease registered cache sizes.  The vm_lowmem handlers
		 * may acquire locks and/or sleep, so they can only be invoked
		 * when "tries" is greater than zero.
		 */
		SDT_PROBE0(vm, , , vm__lowmem_cache);
		EVENTHANDLER_INVOKE(vm_lowmem, 0);

		/*
		 * We do this explicitly after the caches have been drained
		 * above.
		 */
		uma_reclaim();
	}

	/*
	 * Make the next scan start on the next domain.
	 */
	initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains;

	inactl = 0;
	inactmax = cnt.v_inactive_count;
	actl = 0;
	actmax = tries < 2 ? 0 : cnt.v_active_count;
	dom = initial_dom;

	/*
	 * Scan domains in round-robin order, first inactive queues,
	 * then active.  Since a domain usually owns a large physically
	 * contiguous chunk of memory, it makes sense to completely
	 * exhaust one domain before switching to the next, while growing
	 * the pool of contiguous physical pages.
	 *
	 * Do not even start laundering a domain which cannot contain
	 * the specified address range, as indicated by segments
	 * constituting the domain.
	 */
again:
	if (inactl < inactmax) {
		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
		    low, high) &&
		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE],
		    tries, low, high)) {
			inactl++;
			goto again;
		}
		if (++dom == vm_ndomains)
			dom = 0;
		if (dom != initial_dom)
			goto again;
	}
	if (actl < actmax) {
		if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs,
		    low, high) &&
		    vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE],
		    tries, low, high)) {
			actl++;
			goto again;
		}
		if (++dom == vm_ndomains)
			dom = 0;
		if (dom != initial_dom)
			goto again;
	}
}

#if !defined(NO_SWAPPING)
/*
 * vm_pageout_object_deactivate_pages
 *
 * Deactivate enough pages to satisfy the inactive target
 * requirements.
 *
 * The object and map must be locked.
 */
static void
vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object,
    long desired)
{
	vm_object_t backing_object, object;
	vm_page_t p;
	int act_delta, remove_mode;

	VM_OBJECT_ASSERT_LOCKED(first_object);
	if ((first_object->flags & OBJ_FICTITIOUS) != 0)
		return;
	for (object = first_object;; object = backing_object) {
		if (pmap_resident_count(pmap) <= desired)
			goto unlock_return;
		VM_OBJECT_ASSERT_LOCKED(object);
		if ((object->flags & OBJ_UNMANAGED) != 0 ||
		    object->paging_in_progress != 0)
			goto unlock_return;

		remove_mode = 0;
		if (object->shadow_count > 1)
			remove_mode = 1;
		/*
		 * Scan the object's entire memory queue.
		 */
		TAILQ_FOREACH(p, &object->memq, listq) {
			if (pmap_resident_count(pmap) <= desired)
				goto unlock_return;
			if (vm_page_busied(p))
				continue;
			PCPU_INC(cnt.v_pdpages);
			vm_page_lock(p);
			if (p->wire_count != 0 || p->hold_count != 0 ||
			    !pmap_page_exists_quick(pmap, p)) {
				vm_page_unlock(p);
				continue;
			}
			act_delta = pmap_ts_referenced(p);
			if ((p->aflags & PGA_REFERENCED) != 0) {
				if (act_delta == 0)
					act_delta = 1;
				vm_page_aflag_clear(p, PGA_REFERENCED);
			}
			if (p->queue != PQ_ACTIVE && act_delta != 0) {
				vm_page_activate(p);
				p->act_count += act_delta;
			} else if (p->queue == PQ_ACTIVE) {
				if (act_delta == 0) {
					p->act_count -= min(p->act_count,
					    ACT_DECLINE);
					if (!remove_mode && p->act_count == 0) {
						pmap_remove_all(p);
						vm_page_deactivate(p);
					} else
						vm_page_requeue(p);
				} else {
					vm_page_activate(p);
					if (p->act_count < ACT_MAX -
					    ACT_ADVANCE)
						p->act_count += ACT_ADVANCE;
					vm_page_requeue(p);
				}
			} else if (p->queue == PQ_INACTIVE)
				pmap_remove_all(p);
			vm_page_unlock(p);
		}
		if ((backing_object = object->backing_object) == NULL)
			goto unlock_return;
		VM_OBJECT_RLOCK(backing_object);
		if (object != first_object)
			VM_OBJECT_RUNLOCK(object);
	}
unlock_return:
	if (object != first_object)
		VM_OBJECT_RUNLOCK(object);
}

/*
 * deactivate some number of pages in a map, try to do it fairly, but
 * that is really hard to do.
 */
static void
vm_pageout_map_deactivate_pages(map, desired)
	vm_map_t map;
	long desired;
{
	vm_map_entry_t tmpe;
	vm_object_t obj, bigobj;
	int nothingwired;

	if (!vm_map_trylock(map))
		return;

	bigobj = NULL;
	nothingwired = TRUE;

	/*
	 * first, search out the biggest object, and try to free pages from
	 * that.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) {
				if (obj->shadow_count <= 1 &&
				    (bigobj == NULL ||
				    bigobj->resident_page_count < obj->resident_page_count)) {
					if (bigobj != NULL)
						VM_OBJECT_RUNLOCK(bigobj);
					bigobj = obj;
				} else
					VM_OBJECT_RUNLOCK(obj);
			}
		}
		if (tmpe->wired_count > 0)
			nothingwired = FALSE;
		tmpe = tmpe->next;
	}

	if (bigobj != NULL) {
		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
		VM_OBJECT_RUNLOCK(bigobj);
	}
	/*
	 * Next, hunt around for other pages to deactivate.  We actually
	 * do this search sort of wrong -- .text first is not the best idea.
	 */
	tmpe = map->header.next;
	while (tmpe != &map->header) {
		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
			break;
		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
			obj = tmpe->object.vm_object;
			if (obj != NULL) {
				VM_OBJECT_RLOCK(obj);
				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
				VM_OBJECT_RUNLOCK(obj);
			}
		}
		tmpe = tmpe->next;
	}

#ifdef __ia64__
	/*
	 * Remove all non-wired, managed mappings if a process is swapped out.
	 * This will free page table pages.
	 */
	if (desired == 0)
		pmap_remove_pages(map->pmap);
#else
	/*
	 * Remove all mappings if a process is swapped out, this will free page
	 * table pages.
	 */
	if (desired == 0 && nothingwired) {
		pmap_remove(vm_map_pmap(map), vm_map_min(map),
		    vm_map_max(map));
	}
#endif

	vm_map_unlock(map);
}
#endif		/* !defined(NO_SWAPPING) */

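/*
 * Rough outline of one vm_pageout_scan() invocation, as implemented below
 * (a reading aid, not a normative contract):
 *
 *	1. Optionally invoke the vm_lowmem handlers and uma_reclaim(),
 *	   rate-limited by vm.lowmem_period.
 *	2. Walk the inactive queue, freeing invalid pages, caching clean
 *	   pages, and, within the maxlaunder budget, laundering dirty ones.
 *	3. Wake the swapout and/or sync daemons if the target was missed.
 *	4. Walk part of the active queue, decaying act_count and moving
 *	   idle pages to the inactive queue.
 *	5. Finally, consider an OOM kill via vm_pageout_mightbe_oom().
 */
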
/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 *
 *	pass 0 - Update active LRU/deactivate pages
 *	pass 1 - Move inactive to cache or free
 *	pass 2 - Launder dirty pages
 */
static void
vm_pageout_scan(struct vm_domain *vmd, int pass)
{
	vm_page_t m, next;
	struct vm_pagequeue *pq;
	vm_object_t object;
	int act_delta, addl_page_shortage, deficit, maxscan, page_shortage;
	int vnodes_skipped = 0;
	int maxlaunder;
	int lockmode;
	boolean_t queues_locked;

	/*
	 * If we need to reclaim memory ask kernel caches to return
	 * some.  We rate limit to avoid thrashing.
	 */
	if (vmd == &vm_dom[0] && pass > 0 &&
	    (ticks - lowmem_ticks) / hz >= lowmem_period) {
		/*
		 * Decrease registered cache sizes.
		 */
		SDT_PROBE0(vm, , , vm__lowmem_scan);
		EVENTHANDLER_INVOKE(vm_lowmem, 0);
		/*
		 * We do this explicitly after the caches have been
		 * drained above.
		 */
		uma_reclaim();
		lowmem_ticks = ticks;
	}

	/*
	 * The addl_page_shortage is the number of temporarily
	 * stuck pages in the inactive queue.  In other words, the
	 * number of pages from the inactive count that should be
	 * discounted in setting the target for the active queue scan.
	 */
	addl_page_shortage = 0;

	/*
	 * Calculate the number of pages we want to either free or move
	 * to the cache.
	 */
	if (pass > 0) {
		deficit = atomic_readandclear_int(&vm_pageout_deficit);
		page_shortage = vm_paging_target() + deficit;
	} else
		page_shortage = deficit = 0;

	/*
	 * maxlaunder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degradation.  However,
	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
	 * used) will die horribly with limited laundering.  If the pageout
	 * daemon cannot clean enough pages in the first pass, we let it go
	 * all out in succeeding passes.
	 */
	if ((maxlaunder = vm_max_launder) <= 1)
		maxlaunder = 1;
	if (pass > 1)
		maxlaunder = 10000;

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 */
	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
	maxscan = pq->pq_cnt;
	vm_pagequeue_lock(pq);
	queues_locked = TRUE;
	for (m = TAILQ_FIRST(&pq->pq_pl);
	     m != NULL && maxscan-- > 0 && page_shortage > 0;
	     m = next) {
		vm_pagequeue_assert_locked(pq);
		KASSERT(queues_locked, ("unlocked queues"));
		KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m));

		PCPU_INC(cnt.v_pdpages);
		next = TAILQ_NEXT(m, plinks.q);

		/*
		 * skip marker pages
		 */
		if (m->flags & PG_MARKER)
			continue;

		KASSERT((m->flags & PG_FICTITIOUS) == 0,
		    ("Fictitious page %p cannot be in inactive queue", m));
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("Unmanaged page %p cannot be in inactive queue", m));

		/*
		 * The page or object lock acquisitions fail if the
		 * page was removed from the queue or moved to a
		 * different position within the queue.  In either
		 * case, addl_page_shortage should not be incremented.
		 */
		if (!vm_pageout_page_lock(m, &next)) {
			vm_page_unlock(m);
			continue;
		}
		object = m->object;
		if (!VM_OBJECT_TRYWLOCK(object) &&
		    !vm_pageout_fallback_object_lock(m, &next)) {
			vm_page_unlock(m);
			VM_OBJECT_WUNLOCK(object);
			continue;
		}

		/*
		 * Don't mess with busy pages, keep them at the
		 * front of the queue, most likely they are being
		 * paged out.  Increment addl_page_shortage for busy
		 * pages, because they may leave the inactive queue
		 * shortly after page scan is finished.
		 */
		if (vm_page_busied(m)) {
			vm_page_unlock(m);
			VM_OBJECT_WUNLOCK(object);
			addl_page_shortage++;
			continue;
		}

		/*
		 * We unlock the inactive page queue, invalidating the
		 * 'next' pointer.  Use our marker to remember our
		 * place.
		 */
		TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q);
		vm_pagequeue_unlock(pq);
		queues_locked = FALSE;

		/*
		 * We bump the activation count if the page has been
		 * referenced while in the inactive queue.  This makes
		 * it less likely that the page will be added back to the
		 * inactive queue prematurely again.  Here we check the
		 * page tables (or emulated bits, if any), given the upper
		 * level VM system not knowing anything about existing
		 * references.
		 */
		act_delta = 0;
		if ((m->aflags & PGA_REFERENCED) != 0) {
			vm_page_aflag_clear(m, PGA_REFERENCED);
			act_delta = 1;
		}
		if (object->ref_count != 0) {
			act_delta += pmap_ts_referenced(m);
		} else {
			KASSERT(!pmap_page_is_mapped(m),
			    ("vm_pageout_scan: page %p is mapped", m));
		}

		/*
		 * If the upper level VM system knows about any page
		 * references, we reactivate the page or requeue it.
		 */
		if (act_delta != 0) {
			if (object->ref_count) {
				vm_page_activate(m);
				m->act_count += act_delta + ACT_ADVANCE;
			} else {
				vm_pagequeue_lock(pq);
				queues_locked = TRUE;
				vm_page_requeue_locked(m);
			}
			VM_OBJECT_WUNLOCK(object);
			vm_page_unlock(m);
			goto relock_queues;
		}

		if (m->hold_count != 0) {
			vm_page_unlock(m);
			VM_OBJECT_WUNLOCK(object);

			/*
			 * Held pages are essentially stuck in the
			 * queue.  So, they ought to be discounted
			 * from the inactive count.  See the
			 * calculation of the page_shortage for the
			 * loop over the active queue below.
			 */
			addl_page_shortage++;
			goto relock_queues;
		}

		/*
		 * If the page appears to be clean at the machine-independent
		 * layer, then remove all of its mappings from the pmap in
		 * anticipation of placing it onto the cache queue.  If,
		 * however, any of the page's mappings allow write access,
		 * then the page may still be modified until the last of those
		 * mappings are removed.
		 */
		vm_page_test_dirty(m);
		if (m->dirty == 0 && object->ref_count != 0)
			pmap_remove_all(m);

		if (m->valid == 0) {
			/*
			 * Invalid pages can be easily freed
			 */
			vm_page_free(m);
			PCPU_INC(cnt.v_dfree);
			--page_shortage;
		} else if (m->dirty == 0) {
			/*
			 * Clean pages can be placed onto the cache queue.
			 * This effectively frees them.
			 */
			vm_page_cache(m);
			--page_shortage;
		} else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) {
			/*
			 * Dirty pages need to be paged out, but flushing
			 * a page is extremely expensive versus freeing
			 * a clean page.  Rather than artificially limiting
			 * the number of pages we can flush, we instead give
			 * dirty pages extra priority on the inactive queue
			 * by forcing them to be cycled through the queue
			 * twice before being flushed, after which the
			 * (now clean) page will cycle through once more
			 * before being freed.  This significantly extends
			 * the thrash point for a heavily loaded machine.
			 */
			m->flags |= PG_WINATCFLS;
			vm_pagequeue_lock(pq);
			queues_locked = TRUE;
			vm_page_requeue_locked(m);
		} else if (maxlaunder > 0) {
			/*
			 * We always want to try to flush some dirty pages if
			 * we encounter them, to keep the system stable.
			 * Normally this number is small, but under extreme
			 * pressure where there are insufficient clean pages
			 * on the inactive queue, we may have to go all out.
			 */
			int swap_pageouts_ok;
			struct vnode *vp = NULL;
			struct mount *mp = NULL;

			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
				swap_pageouts_ok = 1;
			} else {
				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
				    vm_page_count_min());

			}

			/*
			 * We don't bother paging objects that are "dead".
			 * Those objects are in a "rundown" state.
			 */
			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
				vm_pagequeue_lock(pq);
				vm_page_unlock(m);
				VM_OBJECT_WUNLOCK(object);
				queues_locked = TRUE;
				vm_page_requeue_locked(m);
				goto relock_queues;
			}

			/*
			 * The object is already known NOT to be dead.  It
			 * is possible for the vget() to block the whole
			 * pageout daemon, but the new low-memory handling
			 * code should prevent it.
			 *
			 * The previous code skipped locked vnodes and, worse,
			 * reordered pages in the queue.  This results in
			 * completely non-deterministic operation and, on a
			 * busy system, can lead to extremely non-optimal
			 * pageouts.  For example, it can cause clean pages
			 * to be freed and dirty pages to be moved to the end
			 * of the queue.  Since dirty pages are also moved to
			 * the end of the queue once-cleaned, this gives
			 * way too large a weighting to deferring the freeing
			 * of dirty pages.
			 *
			 * We can't wait forever for the vnode lock, we might
			 * deadlock due to a vn_read() getting stuck in
			 * vm_wait while holding this vnode.  We skip the
			 * vnode if we can't get it in a reasonable amount
			 * of time.
			 */
			if (object->type == OBJT_VNODE) {
				vm_page_unlock(m);
				vp = object->handle;
				if (vp->v_type == VREG &&
				    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
					mp = NULL;
					++pageout_lock_miss;
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}
				KASSERT(mp != NULL,
				    ("vp %p with NULL v_mount", vp));
				vm_object_reference_locked(object);
				VM_OBJECT_WUNLOCK(object);
				lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
				    LK_SHARED : LK_EXCLUSIVE;
				if (vget(vp, lockmode | LK_TIMELOCK,
				    curthread)) {
					VM_OBJECT_WLOCK(object);
					++pageout_lock_miss;
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					vp = NULL;
					goto unlock_and_continue;
				}
				VM_OBJECT_WLOCK(object);
				vm_page_lock(m);
				vm_pagequeue_lock(pq);
				queues_locked = TRUE;
				/*
				 * The page might have been moved to another
				 * queue during potential blocking in vget()
				 * above.  The page might have been freed and
				 * reused for another vnode.
				 */
				if (m->queue != PQ_INACTIVE ||
				    m->object != object ||
				    TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) {
					vm_page_unlock(m);
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}

				/*
				 * The page may have been busied during the
				 * blocking in vget().  We don't move the
				 * page back onto the end of the queue so that
				 * statistics are more correct if we don't.
				 */
				if (vm_page_busied(m)) {
					vm_page_unlock(m);
					addl_page_shortage++;
					goto unlock_and_continue;
				}

				/*
				 * If the page has become held it might
				 * be undergoing I/O, so skip it
				 */
				if (m->hold_count != 0) {
					vm_page_unlock(m);
					addl_page_shortage++;
					if (object->flags & OBJ_MIGHTBEDIRTY)
						vnodes_skipped++;
					goto unlock_and_continue;
				}
				vm_pagequeue_unlock(pq);
				queues_locked = FALSE;
			}

			/*
			 * If a page is dirty, then it is either being washed
			 * (but not yet cleaned) or it is still in the
			 * laundry.  If it is still in the laundry, then we
			 * start the cleaning operation.
			 *
			 * decrement page_shortage on success to account for
			 * the (future) cleaned page.  Otherwise we could wind
			 * up laundering or cleaning too many pages.
			 */
			if (vm_pageout_clean(m) != 0) {
				--page_shortage;
				--maxlaunder;
			}
unlock_and_continue:
			vm_page_lock_assert(m, MA_NOTOWNED);
			VM_OBJECT_WUNLOCK(object);
			if (mp != NULL) {
				if (queues_locked) {
					vm_pagequeue_unlock(pq);
					queues_locked = FALSE;
				}
				if (vp != NULL)
					vput(vp);
				vm_object_deallocate(object);
				vn_finished_write(mp);
			}
			vm_page_lock_assert(m, MA_NOTOWNED);
			goto relock_queues;
		}
		vm_page_unlock(m);
		VM_OBJECT_WUNLOCK(object);
relock_queues:
		if (!queues_locked) {
			vm_pagequeue_lock(pq);
			queues_locked = TRUE;
		}
		next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q);
		TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q);
	}
	vm_pagequeue_unlock(pq);

#if !defined(NO_SWAPPING)
	/*
	 * Wakeup the swapout daemon if we didn't cache or free the targeted
	 * number of pages.
	 */
	if (vm_swap_enabled && page_shortage > 0)
		vm_req_vmdaemon(VM_SWAP_NORMAL);
#endif

	/*
	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
	 * and we didn't cache or free enough pages.
	 */
	if (vnodes_skipped > 0 && page_shortage > cnt.v_free_target -
	    cnt.v_free_min)
		(void)speedup_syncer();

	/*
	 * Compute the number of pages we want to try to move from the
	 * active queue to the inactive queue.
	 */
	page_shortage = cnt.v_inactive_target - cnt.v_inactive_count +
	    vm_paging_target() + deficit + addl_page_shortage;

	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
	vm_pagequeue_lock(pq);
	maxscan = pq->pq_cnt;

	/*
	 * If we're just idle polling attempt to visit every
	 * active page within 'update_period' seconds.
	 */
	if (pass == 0 && vm_pageout_update_period != 0) {
		maxscan /= vm_pageout_update_period;
		page_shortage = maxscan;
	}

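	/*
	 * For example, with vm.pageout_update_period = 600 and roughly
	 * 60000 active pages, each pass-0 invocation visits about
	 * 60000 / 600 = 100 pages, so the whole active queue is revisited
	 * on the order of once per update period, assuming roughly one
	 * pass-0 wakeup per second.  The figures are illustrative only.
	 */
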
	/*
	 * Scan the active queue for things we can deactivate.  We nominally
	 * track the per-page activity counter and use it to locate
	 * deactivation candidates.
	 */
	m = TAILQ_FIRST(&pq->pq_pl);
	while (m != NULL && maxscan-- > 0 && page_shortage > 0) {

		KASSERT(m->queue == PQ_ACTIVE,
		    ("vm_pageout_scan: page %p isn't active", m));

		next = TAILQ_NEXT(m, plinks.q);
		if ((m->flags & PG_MARKER) != 0) {
			m = next;
			continue;
		}
		KASSERT((m->flags & PG_FICTITIOUS) == 0,
		    ("Fictitious page %p cannot be in active queue", m));
		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
		    ("Unmanaged page %p cannot be in active queue", m));
		if (!vm_pageout_page_lock(m, &next)) {
			vm_page_unlock(m);
			m = next;
			continue;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		PCPU_INC(cnt.v_pdpages);

		/*
		 * Check to see "how much" the page has been used.
		 */
		act_delta = 0;
		if (m->aflags & PGA_REFERENCED) {
			vm_page_aflag_clear(m, PGA_REFERENCED);
			act_delta += 1;
		}
		/*
		 * Unlocked object ref count check.  Two races are possible.
		 * 1) The ref was transitioning to zero and we saw non-zero,
		 *    the pmap bits will be checked unnecessarily.
		 * 2) The ref was transitioning to one and we saw zero.
		 *    The page lock prevents a new reference to this page so
		 *    we need not check the reference bits.
		 */
		if (m->object->ref_count != 0)
			act_delta += pmap_ts_referenced(m);

		/*
		 * Advance or decay the act_count based on recent usage.
		 */
		if (act_delta) {
			m->act_count += ACT_ADVANCE + act_delta;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			act_delta = m->act_count;
		}

		/*
		 * Move this page to the tail of the active or inactive
		 * queue depending on usage.
		 */
		if (act_delta == 0) {
			/* Dequeue to avoid later lock recursion. */
			vm_page_dequeue_locked(m);
			vm_page_deactivate(m);
			page_shortage--;
		} else
			vm_page_requeue_locked(m);
		vm_page_unlock(m);
		m = next;
	}
	vm_pagequeue_unlock(pq);
#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
144177298Sobrien */
144277298Sobrien if (vm_swap_idle_enabled) {
144377298Sobrien static long lsec;
144477298Sobrien if (time_second != lsec) {
144560484Sobrien vm_req_vmdaemon(VM_SWAP_IDLE);
144677298Sobrien lsec = time_second;
144777298Sobrien }
144877298Sobrien }
144977298Sobrien#endif
145077298Sobrien
145177298Sobrien /*
145277298Sobrien * If we are critically low on one of RAM or swap and low on
145377298Sobrien * the other, kill the largest process. However, we avoid
145477298Sobrien * doing this on the first pass in order to give ourselves a
145577298Sobrien * chance to flush out dirty vnode-backed pages and to allow
145677298Sobrien * active pages to be moved to the inactive queue and reclaimed.
145777298Sobrien */
145877298Sobrien vm_pageout_mightbe_oom(vmd, pass);
145977298Sobrien}
146077298Sobrien
146177298Sobrienstatic int vm_pageout_oom_vote;
146277298Sobrien
146338889Sjdp/*
146477298Sobrien * The pagedaemon threads randomly select one to perform the
146577298Sobrien * OOM. Trying to kill processes before all pagedaemons have
146677298Sobrien * failed to reach the free page target is premature.
146777298Sobrien */
146877298Sobrienstatic void
146977298Sobrienvm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
147077298Sobrien{
147177298Sobrien int old_vote;
147277298Sobrien
147377298Sobrien if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
147477298Sobrien (swap_pager_full && vm_paging_target() > 0))) {
147577298Sobrien if (vmd->vmd_oom) {
147677298Sobrien vmd->vmd_oom = FALSE;
147777298Sobrien atomic_subtract_int(&vm_pageout_oom_vote, 1);
147877298Sobrien }
147977298Sobrien return;
148077298Sobrien }
148177298Sobrien
148277298Sobrien if (vmd->vmd_oom)
148377298Sobrien return;
148477298Sobrien
148577298Sobrien vmd->vmd_oom = TRUE;
148677298Sobrien old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
148777298Sobrien if (old_vote != vm_ndomains - 1)
148877298Sobrien return;
148977298Sobrien
149077298Sobrien /*
149177298Sobrien * The current pagedaemon thread is the last in the quorum to
149277298Sobrien * start OOM. Initiate the selection and signaling of the
149377298Sobrien * victim.
149477298Sobrien */
149577298Sobrien vm_pageout_oom(VM_OOM_MEM);
149677298Sobrien
149777298Sobrien /*
149877298Sobrien * After one round of OOM terror, recall our vote. On the
149977298Sobrien * next pass, the current pagedaemon will vote again if the
150077298Sobrien * low memory condition persists, since vmd_oom is now
150177298Sobrien * false.
150277298Sobrien */
150377298Sobrien vmd->vmd_oom = FALSE;
150477298Sobrien atomic_subtract_int(&vm_pageout_oom_vote, 1);
150577298Sobrien}
150677298Sobrien
150777298Sobrienvoid
150877298Sobrienvm_pageout_oom(int shortage)
150977298Sobrien{
151077298Sobrien struct proc *p, *bigproc;
151177298Sobrien vm_offset_t size, bigsize;
151277298Sobrien struct thread *td;
151377298Sobrien struct vmspace *vm;
151460484Sobrien
151577298Sobrien /*
151677298Sobrien * We keep the process bigproc locked once we find it to keep anyone
151777298Sobrien * from messing with it; however, there is a possibility of
151877298Sobrien * deadlock if process B is bigproc and one of its child processes,
151977298Sobrien * A, attempts to propagate a signal to B while we are waiting for
152077298Sobrien * A's lock while walking this list. To avoid this, we don't block on
152177298Sobrien * the process lock but just skip a process if it is already locked.
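 *
 * Explanatory note (not part of the original source): the loop
 * below walks the allproc list under a shared lock, skipping
 * system, protected, killed and non-runnable processes. Each
 * candidate's size is its swap usage, plus its resident set size
 * when the shortage is VM_OOM_MEM; the largest candidate is held
 * via _PHOLD() as bigproc and is the one that gets killed.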
152277298Sobrien */
152377298Sobrien bigproc = NULL;
152477298Sobrien bigsize = 0;
152577298Sobrien sx_slock(&allproc_lock);
152677298Sobrien FOREACH_PROC_IN_SYSTEM(p) {
152777298Sobrien int breakout;
152877298Sobrien
152977298Sobrien PROC_LOCK(p);
153077298Sobrien
153177298Sobrien /*
153277298Sobrien * If this is a system, protected or killed process, skip it.
153377298Sobrien */
153477298Sobrien if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
153577298Sobrien P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
153677298Sobrien p->p_pid == 1 || P_KILLED(p) ||
153777298Sobrien (p->p_pid < 48 && swap_pager_avail != 0)) {
153877298Sobrien PROC_UNLOCK(p);
153977298Sobrien continue;
154077298Sobrien }
154177298Sobrien /*
154277298Sobrien * If the process is in a non-running type state,
154377298Sobrien * don't touch it. Check all the threads individually.
154477298Sobrien */
154577298Sobrien breakout = 0;
154677298Sobrien FOREACH_THREAD_IN_PROC(p, td) {
154777298Sobrien thread_lock(td);
154877298Sobrien if (!TD_ON_RUNQ(td) &&
154977298Sobrien !TD_IS_RUNNING(td) &&
155077298Sobrien !TD_IS_SLEEPING(td) &&
155177298Sobrien !TD_IS_SUSPENDED(td)) {
155277298Sobrien thread_unlock(td);
155377298Sobrien breakout = 1;
155477298Sobrien break;
155560484Sobrien }
155677298Sobrien thread_unlock(td);
155777298Sobrien }
155877298Sobrien if (breakout) {
155977298Sobrien PROC_UNLOCK(p);
156077298Sobrien continue;
156177298Sobrien }
156277298Sobrien /*
156377298Sobrien * get the process size
156477298Sobrien */
156577298Sobrien vm = vmspace_acquire_ref(p);
156677298Sobrien if (vm == NULL) {
156777298Sobrien PROC_UNLOCK(p);
156877298Sobrien continue;
156977298Sobrien }
157077298Sobrien _PHOLD(p);
157177298Sobrien if (!vm_map_trylock_read(&vm->vm_map)) {
157277298Sobrien _PRELE(p);
157377298Sobrien PROC_UNLOCK(p);
157477298Sobrien vmspace_free(vm);
157577298Sobrien continue;
157677298Sobrien }
157777298Sobrien PROC_UNLOCK(p);
157877298Sobrien size = vmspace_swap_count(vm);
157977298Sobrien vm_map_unlock_read(&vm->vm_map);
158077298Sobrien if (shortage == VM_OOM_MEM)
158177298Sobrien size += vmspace_resident_count(vm);
158277298Sobrien vmspace_free(vm);
158377298Sobrien /*
158477298Sobrien * If this process is bigger than the biggest one so far,
158577298Sobrien * remember it.
158677298Sobrien */
158777298Sobrien if (size > bigsize) {
158877298Sobrien if (bigproc != NULL)
158977298Sobrien PRELE(bigproc);
159077298Sobrien bigproc = p;
159177298Sobrien bigsize = size;
159277298Sobrien } else {
159377298Sobrien PRELE(p);
159477298Sobrien }
159577298Sobrien }
159677298Sobrien sx_sunlock(&allproc_lock);
159777298Sobrien if (bigproc != NULL) {
159877298Sobrien PROC_LOCK(bigproc);
159977298Sobrien killproc(bigproc, "out of swap space");
160077298Sobrien sched_nice(bigproc, PRIO_MIN);
160160484Sobrien _PRELE(bigproc);
160238889Sjdp PROC_UNLOCK(bigproc);
160338889Sjdp wakeup(&cnt.v_free_count);
160460484Sobrien }
160560484Sobrien}
160660484Sobrien
160760484Sobrienstatic void
160877298Sobrienvm_pageout_worker(void *arg)
160938889Sjdp{
161038889Sjdp struct vm_domain *domain;
161138889Sjdp int domidx;
161238889Sjdp
161338889Sjdp domidx = (uintptr_t)arg;
161438889Sjdp domain = &vm_dom[domidx];
161538889Sjdp
161638889Sjdp /*
161738889Sjdp * XXXKIB It could be useful to bind pageout daemon threads to
161838889Sjdp * the cores belonging to the domain, from which vm_page_array
161938889Sjdp * is allocated.
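 *
 * Explanatory note (not part of the original source): each worker
 * thread services exactly one vm_domain. It sets up a private
 * queue marker for that domain's inactive queue and then calls
 * vm_pageout_scan() only for that domain, so every worker operates
 * on its own domain's page queues.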
162038889Sjdp */ 162138889Sjdp 162238889Sjdp KASSERT(domain->vmd_segs != 0, ("domain without segments")); 162338889Sjdp vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); 162438889Sjdp 162538889Sjdp /* 162638889Sjdp * The pageout daemon worker is never done, so loop forever. 162738889Sjdp */ 162838889Sjdp while (TRUE) { 162938889Sjdp /* 163038889Sjdp * If we have enough free memory, wakeup waiters. Do 163138889Sjdp * not clear vm_pages_needed until we reach our target, 163238889Sjdp * otherwise we may be woken up over and over again and 163338889Sjdp * waste a lot of cpu. 163438889Sjdp */ 163538889Sjdp mtx_lock(&vm_page_queue_free_mtx); 163638889Sjdp if (vm_pages_needed && !vm_page_count_min()) { 163738889Sjdp if (!vm_paging_needed()) 163860484Sobrien vm_pages_needed = 0; 163960484Sobrien wakeup(&cnt.v_free_count); 164060484Sobrien } 164160484Sobrien if (vm_pages_needed) { 164260484Sobrien /* 164377298Sobrien * Still not done, take a second pass without waiting 164477298Sobrien * (unlimited dirty cleaning), otherwise sleep a bit 164577298Sobrien * and try again. 164677298Sobrien */ 164777298Sobrien if (domain->vmd_pass > 1) 164838889Sjdp msleep(&vm_pages_needed, 164938889Sjdp &vm_page_queue_free_mtx, PVM, "psleep", 165038889Sjdp hz / 2); 165138889Sjdp } else { 165238889Sjdp /* 165360484Sobrien * Good enough, sleep until required to refresh 165460484Sobrien * stats. 165560484Sobrien */ 165660484Sobrien domain->vmd_pass = 0; 165760484Sobrien msleep(&vm_pages_needed, &vm_page_queue_free_mtx, 165860484Sobrien PVM, "psleep", hz); 165960484Sobrien 166060484Sobrien } 166160484Sobrien if (vm_pages_needed) { 166260484Sobrien cnt.v_pdwakeups++; 166338889Sjdp domain->vmd_pass++; 166438889Sjdp } 166538889Sjdp mtx_unlock(&vm_page_queue_free_mtx); 166638889Sjdp vm_pageout_scan(domain, domain->vmd_pass); 166738889Sjdp } 166838889Sjdp} 166938889Sjdp 167038889Sjdp/* 167138889Sjdp * vm_pageout_init initialises basic pageout daemon settings. 167238889Sjdp */ 167338889Sjdpstatic void 167438889Sjdpvm_pageout_init(void) 167538889Sjdp{ 167638889Sjdp /* 167777298Sobrien * Initialize some paging parameters. 167877298Sobrien */ 167977298Sobrien cnt.v_interrupt_free_min = 2; 168077298Sobrien if (cnt.v_page_count < 2000) 168177298Sobrien vm_pageout_page_count = 8; 168277298Sobrien 168377298Sobrien /* 168477298Sobrien * v_free_reserved needs to include enough for the largest 168577298Sobrien * swap pager structures plus enough for any pv_entry structs 168660484Sobrien * when paging. 168760484Sobrien */ 168860484Sobrien if (cnt.v_page_count > 1024) 168938889Sjdp cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; 169060484Sobrien else 169138889Sjdp cnt.v_free_min = 4; 169238889Sjdp cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + 169338889Sjdp cnt.v_interrupt_free_min; 169438889Sjdp cnt.v_free_reserved = vm_pageout_page_count + 169538889Sjdp cnt.v_pageout_free_min + (cnt.v_page_count / 768); 169638889Sjdp cnt.v_free_severe = cnt.v_free_min / 2; 169738889Sjdp cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved; 169838889Sjdp cnt.v_free_min += cnt.v_free_reserved; 169938889Sjdp cnt.v_free_severe += cnt.v_free_reserved; 170038889Sjdp cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; 170138889Sjdp if (cnt.v_inactive_target > cnt.v_free_count / 3) 170238889Sjdp cnt.v_inactive_target = cnt.v_free_count / 3; 170338889Sjdp 170438889Sjdp /* 170538889Sjdp * Set the default wakeup threshold to be 10% above the minimum 170638889Sjdp * page limit. This keeps the steady state out of shortfall. 
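 *
 * Worked example (not part of the original source): with
 * cnt.v_free_min at, say, 10000 pages, the assignment below yields
 * (10000 / 10) * 11 = 11000 pages, so the pagedaemon is woken
 * before the free page count actually drops to the minimum.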
170738889Sjdp */ 170838889Sjdp vm_pageout_wakeup_thresh = (cnt.v_free_min / 10) * 11; 170938889Sjdp 171038889Sjdp /* 171138889Sjdp * Set interval in seconds for active scan. We want to visit each 171238889Sjdp * page at least once every ten minutes. This is to prevent worst 171338889Sjdp * case paging behaviors with stale active LRU. 171477298Sobrien */ 171577298Sobrien if (vm_pageout_update_period == 0) 171677298Sobrien vm_pageout_update_period = 600; 171777298Sobrien 171877298Sobrien /* XXX does not really belong here */ 171977298Sobrien if (vm_page_max_wired == 0) 172038889Sjdp vm_page_max_wired = cnt.v_free_count / 3; 172138889Sjdp} 172238889Sjdp 172338889Sjdp/* 172438889Sjdp * vm_pageout is the high level pageout daemon. 172538889Sjdp */ 172638889Sjdpstatic void 172738889Sjdpvm_pageout(void) 172838889Sjdp{ 172938889Sjdp int error; 173038889Sjdp#if MAXMEMDOM > 1 173138889Sjdp int i; 173238889Sjdp#endif 173338889Sjdp 173438889Sjdp swap_pager_swap_init(); 173538889Sjdp#if MAXMEMDOM > 1 173660484Sobrien for (i = 1; i < vm_ndomains; i++) { 173760484Sobrien error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, 173860484Sobrien curproc, NULL, 0, 0, "dom%d", i); 173960484Sobrien if (error != 0) { 174060484Sobrien panic("starting pageout for domain %d, error %d\n", 174160484Sobrien i, error); 174238889Sjdp } 174338889Sjdp } 174438889Sjdp#endif 174560484Sobrien error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 174660484Sobrien 0, 0, "uma"); 174738889Sjdp if (error != 0) 174838889Sjdp panic("starting uma_reclaim helper, error %d\n", error); 174938889Sjdp vm_pageout_worker((void *)(uintptr_t)0); 175038889Sjdp} 175138889Sjdp 175238889Sjdp/* 175338889Sjdp * Unless the free page queue lock is held by the caller, this function 175438889Sjdp * should be regarded as advisory. Specifically, the caller should 175538889Sjdp * not msleep() on &cnt.v_free_count following this function unless 175638889Sjdp * the free page queue lock is held until the msleep() is performed. 175738889Sjdp */ 175838889Sjdpvoid 175938889Sjdppagedaemon_wakeup(void) 176038889Sjdp{ 176138889Sjdp 176238889Sjdp if (!vm_pages_needed && curthread->td_proc != pageproc) { 176338889Sjdp vm_pages_needed = 1; 176438889Sjdp wakeup(&vm_pages_needed); 176538889Sjdp } 176638889Sjdp} 176738889Sjdp 176838889Sjdp#if !defined(NO_SWAPPING) 176960484Sobrienstatic void 177060484Sobrienvm_req_vmdaemon(int req) 177160484Sobrien{ 177260484Sobrien static int lastrun = 0; 177360484Sobrien 177438889Sjdp mtx_lock(&vm_daemon_mtx); 177538889Sjdp vm_pageout_req_swapout |= req; 177638889Sjdp if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 177738889Sjdp wakeup(&vm_daemon_needed); 177838889Sjdp lastrun = ticks; 177938889Sjdp } 178038889Sjdp mtx_unlock(&vm_daemon_mtx); 178138889Sjdp} 178238889Sjdp 178338889Sjdpstatic void 178438889Sjdpvm_daemon(void) 178538889Sjdp{ 178638889Sjdp struct rlimit rsslim; 178738889Sjdp struct proc *p; 178838889Sjdp struct thread *td; 178938889Sjdp struct vmspace *vm; 179038889Sjdp int breakout, swapout_flags, tryagain, attempts; 179138889Sjdp#ifdef RACCT 179238889Sjdp uint64_t rsize, ravailable; 179338889Sjdp#endif 179438889Sjdp 179538889Sjdp while (TRUE) { 179638889Sjdp mtx_lock(&vm_daemon_mtx); 179738889Sjdp msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 179838889Sjdp#ifdef RACCT 179938889Sjdp racct_enable ? 
hz : 0
180038889Sjdp#else
180138889Sjdp 0
180238889Sjdp#endif
180338889Sjdp );
180438889Sjdp swapout_flags = vm_pageout_req_swapout;
180538889Sjdp vm_pageout_req_swapout = 0;
180638889Sjdp mtx_unlock(&vm_daemon_mtx);
180738889Sjdp if (swapout_flags)
180838889Sjdp swapout_procs(swapout_flags);
180977298Sobrien
181077298Sobrien /*
181177298Sobrien * scan the processes for those exceeding their rlimits or that
181277298Sobrien * are swapped out -- deactivate their pages
181377298Sobrien */
181477298Sobrien tryagain = 0;
181560484Sobrien attempts = 0;
181660484Sobrienagain:
181760484Sobrien attempts++;
181860484Sobrien sx_slock(&allproc_lock);
181960484Sobrien FOREACH_PROC_IN_SYSTEM(p) {
182060484Sobrien vm_pindex_t limit, size;
182138889Sjdp
182238889Sjdp /*
182338889Sjdp * if this is a system process or if we have already
182438889Sjdp * looked at this process, skip it.
182538889Sjdp */
182638889Sjdp PROC_LOCK(p);
182738889Sjdp if (p->p_state != PRS_NORMAL ||
182838889Sjdp p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) {
182938889Sjdp PROC_UNLOCK(p);
183038889Sjdp continue;
183138889Sjdp }
183238889Sjdp /*
183338889Sjdp * if the process is in a non-running type state,
183438889Sjdp * don't touch it.
183538889Sjdp */
183638889Sjdp breakout = 0;
183738889Sjdp FOREACH_THREAD_IN_PROC(p, td) {
183838889Sjdp thread_lock(td);
183938889Sjdp if (!TD_ON_RUNQ(td) &&
184038889Sjdp !TD_IS_RUNNING(td) &&
184138889Sjdp !TD_IS_SLEEPING(td) &&
184238889Sjdp !TD_IS_SUSPENDED(td)) {
184338889Sjdp thread_unlock(td);
184438889Sjdp breakout = 1;
184538889Sjdp break;
184638889Sjdp }
184738889Sjdp thread_unlock(td);
184860484Sobrien }
184960484Sobrien if (breakout) {
185038889Sjdp PROC_UNLOCK(p);
185138889Sjdp continue;
185277298Sobrien }
185377298Sobrien /*
185477298Sobrien * get a limit
185577298Sobrien */
185677298Sobrien lim_rlimit(p, RLIMIT_RSS, &rsslim);
185778828Sobrien limit = OFF_TO_IDX(
185877298Sobrien qmin(rsslim.rlim_cur, rsslim.rlim_max));
185977298Sobrien
186077298Sobrien /*
186177298Sobrien * let processes that are swapped out really be
186277298Sobrien * swapped out: set the limit to nothing, which will
186377298Sobrien * force a swap-out.
186477298Sobrien */
186577298Sobrien if ((p->p_flag & P_INMEM) == 0)
186677298Sobrien limit = 0; /* XXX */
186777298Sobrien vm = vmspace_acquire_ref(p);
186877298Sobrien PROC_UNLOCK(p);
186977298Sobrien if (vm == NULL)
187077298Sobrien continue;
187177298Sobrien
187277298Sobrien size = vmspace_resident_count(vm);
187377298Sobrien if (size >= limit) {
187477298Sobrien vm_pageout_map_deactivate_pages(
187577298Sobrien &vm->vm_map, limit);
187677298Sobrien }
187777298Sobrien#ifdef RACCT
187877298Sobrien if (racct_enable) {
187977298Sobrien rsize = IDX_TO_OFF(size);
188077298Sobrien PROC_LOCK(p);
188177298Sobrien racct_set(p, RACCT_RSS, rsize);
188277298Sobrien ravailable = racct_get_available(p, RACCT_RSS);
188377298Sobrien PROC_UNLOCK(p);
188477298Sobrien if (rsize > ravailable) {
188577298Sobrien /*
188677298Sobrien * Don't be overly aggressive; this
188778828Sobrien * might be an innocent process,
188878828Sobrien * and the limit could've been exceeded
188977298Sobrien * by some memory hog. Don't try
189077298Sobrien * to deactivate more than 1/4th
189177298Sobrien * of the process's resident set size.
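 *
 * Explanatory note (not part of the original source): the clamp
 * below applies only during the first eight attempts; it raises
 * the target (ravailable) to at least three quarters of the
 * current resident size, so a single pass deactivates at most
 * rsize / 4. If the process is still above its RACCT_RSS limit
 * afterwards, tryagain causes the whole scan to be repeated while
 * attempts is still at most 10.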
189277298Sobrien */ 189377298Sobrien if (attempts <= 8) { 189477298Sobrien if (ravailable < rsize - 189577298Sobrien (rsize / 4)) { 189677298Sobrien ravailable = rsize - 189777298Sobrien (rsize / 4); 189838889Sjdp } 189960484Sobrien } 190060484Sobrien vm_pageout_map_deactivate_pages( 190177298Sobrien &vm->vm_map, 1902 OFF_TO_IDX(ravailable)); 1903 /* Update RSS usage after paging out. */ 1904 size = vmspace_resident_count(vm); 1905 rsize = IDX_TO_OFF(size); 1906 PROC_LOCK(p); 1907 racct_set(p, RACCT_RSS, rsize); 1908 PROC_UNLOCK(p); 1909 if (rsize > ravailable) 1910 tryagain = 1; 1911 } 1912 } 1913#endif 1914 vmspace_free(vm); 1915 } 1916 sx_sunlock(&allproc_lock); 1917 if (tryagain != 0 && attempts <= 10) 1918 goto again; 1919 } 1920} 1921#endif /* !defined(NO_SWAPPING) */ 1922