vm_pageout.c revision 291933
1/*- 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * Copyright (c) 2005 Yahoo! Technologies Norway AS 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * The Mach Operating System project at Carnegie-Mellon University. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. All advertising materials mentioning features or use of this software 23 * must display the following acknowledgement: 24 * This product includes software developed by the University of 25 * California, Berkeley and its contributors. 26 * 4. Neither the name of the University nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40 * SUCH DAMAGE. 41 * 42 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 43 * 44 * 45 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 46 * All rights reserved. 47 * 48 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 49 * 50 * Permission to use, copy, modify and distribute this software and 51 * its documentation is hereby granted, provided that both the copyright 52 * notice and this permission notice appear in all copies of the 53 * software, derivative works or modified versions, and any portions 54 * thereof, and that both notices appear in supporting documentation. 55 * 56 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 57 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 58 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 59 * 60 * Carnegie Mellon requests users of this software to return to 61 * 62 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 63 * School of Computer Science 64 * Carnegie Mellon University 65 * Pittsburgh PA 15213-3890 66 * 67 * any improvements or extensions that they make and grant Carnegie the 68 * rights to redistribute these changes. 69 */ 70 71/* 72 * The proverbial page-out daemon. 
73 */ 74 75#include <sys/cdefs.h> 76__FBSDID("$FreeBSD: stable/10/sys/vm/vm_pageout.c 291933 2015-12-07 11:12:03Z kib $"); 77 78#include "opt_vm.h" 79#include "opt_kdtrace.h" 80#include <sys/param.h> 81#include <sys/systm.h> 82#include <sys/kernel.h> 83#include <sys/eventhandler.h> 84#include <sys/lock.h> 85#include <sys/mutex.h> 86#include <sys/proc.h> 87#include <sys/kthread.h> 88#include <sys/ktr.h> 89#include <sys/mount.h> 90#include <sys/racct.h> 91#include <sys/resourcevar.h> 92#include <sys/sched.h> 93#include <sys/sdt.h> 94#include <sys/signalvar.h> 95#include <sys/smp.h> 96#include <sys/time.h> 97#include <sys/vnode.h> 98#include <sys/vmmeter.h> 99#include <sys/rwlock.h> 100#include <sys/sx.h> 101#include <sys/sysctl.h> 102 103#include <vm/vm.h> 104#include <vm/vm_param.h> 105#include <vm/vm_object.h> 106#include <vm/vm_page.h> 107#include <vm/vm_map.h> 108#include <vm/vm_pageout.h> 109#include <vm/vm_pager.h> 110#include <vm/vm_phys.h> 111#include <vm/swap_pager.h> 112#include <vm/vm_extern.h> 113#include <vm/uma.h> 114 115/* 116 * System initialization 117 */ 118 119/* the kernel process "vm_pageout"*/ 120static void vm_pageout(void); 121static void vm_pageout_init(void); 122static int vm_pageout_clean(vm_page_t); 123static void vm_pageout_scan(struct vm_domain *vmd, int pass); 124static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass); 125 126SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, 127 NULL); 128 129struct proc *pageproc; 130 131static struct kproc_desc page_kp = { 132 "pagedaemon", 133 vm_pageout, 134 &pageproc 135}; 136SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, 137 &page_kp); 138 139SDT_PROVIDER_DEFINE(vm); 140SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache); 141SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); 142 143#if !defined(NO_SWAPPING) 144/* the kernel process "vm_daemon"*/ 145static void vm_daemon(void); 146static struct proc *vmproc; 147 148static struct kproc_desc vm_kp = { 149 "vmdaemon", 150 vm_daemon, 151 &vmproc 152}; 153SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 154#endif 155 156 157int vm_pages_needed; /* Event on which pageout daemon sleeps */ 158int vm_pageout_deficit; /* Estimated number of pages deficit */ 159int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ 160int vm_pageout_wakeup_thresh; 161 162#if !defined(NO_SWAPPING) 163static int vm_pageout_req_swapout; /* XXX */ 164static int vm_daemon_needed; 165static struct mtx vm_daemon_mtx; 166/* Allow for use by vm_pageout before vm_daemon is initialized. 
*/ 167MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); 168#endif 169static int vm_max_launder = 32; 170static int vm_pageout_update_period; 171static int defer_swap_pageouts; 172static int disable_swap_pageouts; 173static int lowmem_period = 10; 174static time_t lowmem_uptime; 175 176#if defined(NO_SWAPPING) 177static int vm_swap_enabled = 0; 178static int vm_swap_idle_enabled = 0; 179#else 180static int vm_swap_enabled = 1; 181static int vm_swap_idle_enabled = 0; 182#endif 183 184SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, 185 CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, 186 "free page threshold for waking up the pageout daemon"); 187 188SYSCTL_INT(_vm, OID_AUTO, max_launder, 189 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 190 191SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, 192 CTLFLAG_RW, &vm_pageout_update_period, 0, 193 "Maximum active LRU update period"); 194 195SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0, 196 "Low memory callback period"); 197 198#if defined(NO_SWAPPING) 199SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 200 CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout"); 201SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 202 CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 203#else 204SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 205 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 206SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 207 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 208#endif 209 210SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 211 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 212 213SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 214 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 215 216static int pageout_lock_miss; 217SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 218 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 219 220#define VM_PAGEOUT_PAGE_COUNT 16 221int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; 222 223int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 224SYSCTL_INT(_vm, OID_AUTO, max_wired, 225 CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); 226 227static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); 228static boolean_t vm_pageout_launder(struct vm_pagequeue *pq, int, vm_paddr_t, 229 vm_paddr_t); 230#if !defined(NO_SWAPPING) 231static void vm_pageout_map_deactivate_pages(vm_map_t, long); 232static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); 233static void vm_req_vmdaemon(int req); 234#endif 235static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); 236 237/* 238 * Initialize a dummy page for marking the caller's place in the specified 239 * paging queue. In principle, this function only needs to set the flag 240 * PG_MARKER. Nonetheless, it write busies and initializes the hold count 241 * to one as safety precautions. 242 */ 243static void 244vm_pageout_init_marker(vm_page_t marker, u_short queue) 245{ 246 247 bzero(marker, sizeof(*marker)); 248 marker->flags = PG_MARKER; 249 marker->busy_lock = VPB_SINGLE_EXCLUSIVER; 250 marker->queue = queue; 251 marker->hold_count = 1; 252} 253 254/* 255 * vm_pageout_fallback_object_lock: 256 * 257 * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is 258 * known to have failed and page queue must be either PQ_ACTIVE or 259 * PQ_INACTIVE.
To avoid lock order violation, unlock the page queues 260 * while locking the vm object. Use marker page to detect page queue 261 * changes and maintain notion of next page on page queue. Return 262 * TRUE if no changes were detected, FALSE otherwise. vm object is 263 * locked on return. 264 * 265 * This function depends on both the lock portion of struct vm_object 266 * and normal struct vm_page being type stable. 267 */ 268static boolean_t 269vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) 270{ 271 struct vm_page marker; 272 struct vm_pagequeue *pq; 273 boolean_t unchanged; 274 u_short queue; 275 vm_object_t object; 276 277 queue = m->queue; 278 vm_pageout_init_marker(&marker, queue); 279 pq = vm_page_pagequeue(m); 280 object = m->object; 281 282 TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); 283 vm_pagequeue_unlock(pq); 284 vm_page_unlock(m); 285 VM_OBJECT_WLOCK(object); 286 vm_page_lock(m); 287 vm_pagequeue_lock(pq); 288 289 /* 290 * The page's object might have changed, and/or the page might 291 * have moved from its original position in the queue. If the 292 * page's object has changed, then the caller should abandon 293 * processing the page because the wrong object lock was 294 * acquired. Use the marker's plinks.q, not the page's, to 295 * determine if the page has been moved. The state of the 296 * page's plinks.q can be indeterminate; whereas, the marker's 297 * plinks.q must be valid. 298 */ 299 *next = TAILQ_NEXT(&marker, plinks.q); 300 unchanged = m->object == object && 301 m == TAILQ_PREV(&marker, pglist, plinks.q); 302 KASSERT(!unchanged || m->queue == queue, 303 ("page %p queue %d %d", m, queue, m->queue)); 304 TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); 305 return (unchanged); 306} 307 308/* 309 * Lock the page while holding the page queue lock. Use marker page 310 * to detect page queue changes and maintain notion of next page on 311 * page queue. Return TRUE if no changes were detected, FALSE 312 * otherwise. The page is locked on return. The page queue lock might 313 * be dropped and reacquired. 314 * 315 * This function depends on normal struct vm_page being type stable. 316 */ 317static boolean_t 318vm_pageout_page_lock(vm_page_t m, vm_page_t *next) 319{ 320 struct vm_page marker; 321 struct vm_pagequeue *pq; 322 boolean_t unchanged; 323 u_short queue; 324 325 vm_page_lock_assert(m, MA_NOTOWNED); 326 if (vm_page_trylock(m)) 327 return (TRUE); 328 329 queue = m->queue; 330 vm_pageout_init_marker(&marker, queue); 331 pq = vm_page_pagequeue(m); 332 333 TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); 334 vm_pagequeue_unlock(pq); 335 vm_page_lock(m); 336 vm_pagequeue_lock(pq); 337 338 /* Page queue might have changed. */ 339 *next = TAILQ_NEXT(&marker, plinks.q); 340 unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q); 341 KASSERT(!unchanged || m->queue == queue, 342 ("page %p queue %d %d", m, queue, m->queue)); 343 TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); 344 return (unchanged); 345} 346 347/* 348 * vm_pageout_clean: 349 * 350 * Clean the page and remove it from the laundry. 351 * 352 * We set the busy bit to cause potential page faults on this page to 353 * block. Note the careful timing, however, the busy bit isn't set till 354 * late and we cannot do anything that will mess with the page. 
355 */ 356static int 357vm_pageout_clean(vm_page_t m) 358{ 359 vm_object_t object; 360 vm_page_t mc[2*vm_pageout_page_count], pb, ps; 361 int pageout_count; 362 int ib, is, page_base; 363 vm_pindex_t pindex = m->pindex; 364 365 vm_page_lock_assert(m, MA_OWNED); 366 object = m->object; 367 VM_OBJECT_ASSERT_WLOCKED(object); 368 369 /* 370 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP 371 * with the new swapper, but we could have serious problems paging 372 * out other object types if there is insufficient memory. 373 * 374 * Unfortunately, checking free memory here is far too late, so the 375 * check has been moved up a procedural level. 376 */ 377 378 /* 379 * Can't clean the page if it's busy or held. 380 */ 381 vm_page_assert_unbusied(m); 382 KASSERT(m->hold_count == 0, ("vm_pageout_clean: page %p is held", m)); 383 vm_page_unlock(m); 384 385 mc[vm_pageout_page_count] = pb = ps = m; 386 pageout_count = 1; 387 page_base = vm_pageout_page_count; 388 ib = 1; 389 is = 1; 390 391 /* 392 * Scan object for clusterable pages. 393 * 394 * We can cluster ONLY if: ->> the page is NOT 395 * clean, wired, busy, held, or mapped into a 396 * buffer, and one of the following: 397 * 1) The page is inactive, or a seldom used 398 * active page. 399 * -or- 400 * 2) we force the issue. 401 * 402 * During heavy mmap/modification loads the pageout 403 * daemon can really fragment the underlying file 404 * due to flushing pages out of order and not trying to 405 * align the clusters (which leave sporadic out-of-order 406 * holes). To solve this problem we do the reverse scan 407 * first and attempt to align our cluster, then do a 408 * forward scan if room remains. 409 */ 410more: 411 while (ib && pageout_count < vm_pageout_page_count) { 412 vm_page_t p; 413 414 if (ib > pindex) { 415 ib = 0; 416 break; 417 } 418 419 if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) { 420 ib = 0; 421 break; 422 } 423 vm_page_test_dirty(p); 424 if (p->dirty == 0) { 425 ib = 0; 426 break; 427 } 428 vm_page_lock(p); 429 if (p->queue != PQ_INACTIVE || 430 p->hold_count != 0) { /* may be undergoing I/O */ 431 vm_page_unlock(p); 432 ib = 0; 433 break; 434 } 435 vm_page_unlock(p); 436 mc[--page_base] = pb = p; 437 ++pageout_count; 438 ++ib; 439 /* 440 * alignment boundary, stop here and switch directions. Do 441 * not clear ib. 442 */ 443 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) 444 break; 445 } 446 447 while (pageout_count < vm_pageout_page_count && 448 pindex + is < object->size) { 449 vm_page_t p; 450 451 if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) 452 break; 453 vm_page_test_dirty(p); 454 if (p->dirty == 0) 455 break; 456 vm_page_lock(p); 457 if (p->queue != PQ_INACTIVE || 458 p->hold_count != 0) { /* may be undergoing I/O */ 459 vm_page_unlock(p); 460 break; 461 } 462 vm_page_unlock(p); 463 mc[page_base + pageout_count] = ps = p; 464 ++pageout_count; 465 ++is; 466 } 467 468 /* 469 * If we exhausted our forward scan, continue with the reverse scan 470 * when possible, even past a page boundary. This catches boundary 471 * conditions. 472 */ 473 if (ib && pageout_count < vm_pageout_page_count) 474 goto more; 475 476 /* 477 * we allow reads during pageouts... 478 */ 479 return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL, 480 NULL)); 481} 482 483/* 484 * vm_pageout_flush() - launder the given pages 485 * 486 * The given pages are laundered. Note that we set up for the start of 487 * I/O ( i.e.
busy the page ), mark it read-only, and bump the object 488 * reference count all in here rather than in the parent. If we want 489 * the parent to do more sophisticated things we may have to change 490 * the ordering. 491 * 492 * Returned runlen is the count of pages between mreq and first 493 * page after mreq with status VM_PAGER_AGAIN. 494 * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL 495 * for any page in runlen set. 496 */ 497int 498vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, 499 boolean_t *eio) 500{ 501 vm_object_t object = mc[0]->object; 502 int pageout_status[count]; 503 int numpagedout = 0; 504 int i, runlen; 505 506 VM_OBJECT_ASSERT_WLOCKED(object); 507 508 /* 509 * Initiate I/O. Bump the vm_page_t->busy counter and 510 * mark the pages read-only. 511 * 512 * We do not have to fixup the clean/dirty bits here... we can 513 * allow the pager to do it after the I/O completes. 514 * 515 * NOTE! mc[i]->dirty may be partial or fragmented due to an 516 * edge case with file fragments. 517 */ 518 for (i = 0; i < count; i++) { 519 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 520 ("vm_pageout_flush: partially invalid page %p index %d/%d", 521 mc[i], i, count)); 522 vm_page_sbusy(mc[i]); 523 pmap_remove_write(mc[i]); 524 } 525 vm_object_pip_add(object, count); 526 527 vm_pager_put_pages(object, mc, count, flags, pageout_status); 528 529 runlen = count - mreq; 530 if (eio != NULL) 531 *eio = FALSE; 532 for (i = 0; i < count; i++) { 533 vm_page_t mt = mc[i]; 534 535 KASSERT(pageout_status[i] == VM_PAGER_PEND || 536 !pmap_page_is_write_mapped(mt), 537 ("vm_pageout_flush: page %p is not write protected", mt)); 538 switch (pageout_status[i]) { 539 case VM_PAGER_OK: 540 case VM_PAGER_PEND: 541 numpagedout++; 542 break; 543 case VM_PAGER_BAD: 544 /* 545 * Page outside of range of object. Right now we 546 * essentially lose the changes by pretending it 547 * worked. 548 */ 549 vm_page_undirty(mt); 550 break; 551 case VM_PAGER_ERROR: 552 case VM_PAGER_FAIL: 553 /* 554 * If page couldn't be paged out, then reactivate the 555 * page so it doesn't clog the inactive list. (We 556 * will try paging it out again later). 557 */ 558 vm_page_lock(mt); 559 vm_page_activate(mt); 560 vm_page_unlock(mt); 561 if (eio != NULL && i >= mreq && i - mreq < runlen) 562 *eio = TRUE; 563 break; 564 case VM_PAGER_AGAIN: 565 if (i >= mreq && i - mreq < runlen) 566 runlen = i - mreq; 567 break; 568 } 569 570 /* 571 * If the operation is still going, leave the page busy to 572 * block all other accesses. Also, leave the paging in 573 * progress indicator set so that we don't attempt an object 574 * collapse.
575 */ 576 if (pageout_status[i] != VM_PAGER_PEND) { 577 vm_object_pip_wakeup(object); 578 vm_page_sunbusy(mt); 579 if (vm_page_count_severe()) { 580 vm_page_lock(mt); 581 vm_page_try_to_cache(mt); 582 vm_page_unlock(mt); 583 } 584 } 585 } 586 if (prunlen != NULL) 587 *prunlen = runlen; 588 return (numpagedout); 589} 590 591static boolean_t 592vm_pageout_launder(struct vm_pagequeue *pq, int tries, vm_paddr_t low, 593 vm_paddr_t high) 594{ 595 struct mount *mp; 596 struct vnode *vp; 597 vm_object_t object; 598 vm_paddr_t pa; 599 vm_page_t m, m_tmp, next; 600 int lockmode; 601 602 vm_pagequeue_lock(pq); 603 TAILQ_FOREACH_SAFE(m, &pq->pq_pl, plinks.q, next) { 604 if ((m->flags & PG_MARKER) != 0) 605 continue; 606 pa = VM_PAGE_TO_PHYS(m); 607 if (pa < low || pa + PAGE_SIZE > high) 608 continue; 609 if (!vm_pageout_page_lock(m, &next) || m->hold_count != 0) { 610 vm_page_unlock(m); 611 continue; 612 } 613 object = m->object; 614 if ((!VM_OBJECT_TRYWLOCK(object) && 615 (!vm_pageout_fallback_object_lock(m, &next) || 616 m->hold_count != 0)) || vm_page_busied(m)) { 617 vm_page_unlock(m); 618 VM_OBJECT_WUNLOCK(object); 619 continue; 620 } 621 vm_page_test_dirty(m); 622 if (m->dirty == 0 && object->ref_count != 0) 623 pmap_remove_all(m); 624 if (m->dirty != 0) { 625 vm_page_unlock(m); 626 if (tries == 0 || (object->flags & OBJ_DEAD) != 0) { 627 VM_OBJECT_WUNLOCK(object); 628 continue; 629 } 630 if (object->type == OBJT_VNODE) { 631 vm_pagequeue_unlock(pq); 632 vp = object->handle; 633 vm_object_reference_locked(object); 634 VM_OBJECT_WUNLOCK(object); 635 (void)vn_start_write(vp, &mp, V_WAIT); 636 lockmode = MNT_SHARED_WRITES(vp->v_mount) ? 637 LK_SHARED : LK_EXCLUSIVE; 638 vn_lock(vp, lockmode | LK_RETRY); 639 VM_OBJECT_WLOCK(object); 640 vm_object_page_clean(object, 0, 0, OBJPC_SYNC); 641 VM_OBJECT_WUNLOCK(object); 642 VOP_UNLOCK(vp, 0); 643 vm_object_deallocate(object); 644 vn_finished_write(mp); 645 return (TRUE); 646 } else if (object->type == OBJT_SWAP || 647 object->type == OBJT_DEFAULT) { 648 vm_pagequeue_unlock(pq); 649 m_tmp = m; 650 vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC, 651 0, NULL, NULL); 652 VM_OBJECT_WUNLOCK(object); 653 return (TRUE); 654 } 655 } else { 656 /* 657 * Dequeue here to prevent lock recursion in 658 * vm_page_cache(). 659 */ 660 vm_page_dequeue_locked(m); 661 vm_page_cache(m); 662 vm_page_unlock(m); 663 } 664 VM_OBJECT_WUNLOCK(object); 665 } 666 vm_pagequeue_unlock(pq); 667 return (FALSE); 668} 669 670/* 671 * Increase the number of cached pages. The specified value, "tries", 672 * determines which categories of pages are cached: 673 * 674 * 0: All clean, inactive pages within the specified physical address range 675 * are cached. Will not sleep. 676 * 1: The vm_lowmem handlers are called. All inactive pages within 677 * the specified physical address range are cached. May sleep. 678 * 2: The vm_lowmem handlers are called. All inactive and active pages 679 * within the specified physical address range are cached. May sleep. 680 */ 681void 682vm_pageout_grow_cache(int tries, vm_paddr_t low, vm_paddr_t high) 683{ 684 int actl, actmax, inactl, inactmax, dom, initial_dom; 685 static int start_dom = 0; 686 687 if (tries > 0) { 688 /* 689 * Decrease registered cache sizes. The vm_lowmem handlers 690 * may acquire locks and/or sleep, so they can only be invoked 691 * when "tries" is greater than zero. 
692 */ 693 SDT_PROBE0(vm, , , vm__lowmem_cache); 694 EVENTHANDLER_INVOKE(vm_lowmem, 0); 695 696 /* 697 * We do this explicitly after the caches have been drained 698 * above. 699 */ 700 uma_reclaim(); 701 } 702 703 /* 704 * Make the next scan start on the next domain. 705 */ 706 initial_dom = atomic_fetchadd_int(&start_dom, 1) % vm_ndomains; 707 708 inactl = 0; 709 inactmax = cnt.v_inactive_count; 710 actl = 0; 711 actmax = tries < 2 ? 0 : cnt.v_active_count; 712 dom = initial_dom; 713 714 /* 715 * Scan domains in round-robin order, first inactive queues, 716 * then active. Since domain usually owns large physically 717 * contiguous chunk of memory, it makes sense to completely 718 * exhaust one domain before switching to next, while growing 719 * the pool of contiguous physical pages. 720 * 721 * Do not even start launder a domain which cannot contain 722 * the specified address range, as indicated by segments 723 * constituting the domain. 724 */ 725again: 726 if (inactl < inactmax) { 727 if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, 728 low, high) && 729 vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_INACTIVE], 730 tries, low, high)) { 731 inactl++; 732 goto again; 733 } 734 if (++dom == vm_ndomains) 735 dom = 0; 736 if (dom != initial_dom) 737 goto again; 738 } 739 if (actl < actmax) { 740 if (vm_phys_domain_intersects(vm_dom[dom].vmd_segs, 741 low, high) && 742 vm_pageout_launder(&vm_dom[dom].vmd_pagequeues[PQ_ACTIVE], 743 tries, low, high)) { 744 actl++; 745 goto again; 746 } 747 if (++dom == vm_ndomains) 748 dom = 0; 749 if (dom != initial_dom) 750 goto again; 751 } 752} 753 754#if !defined(NO_SWAPPING) 755/* 756 * vm_pageout_object_deactivate_pages 757 * 758 * Deactivate enough pages to satisfy the inactive target 759 * requirements. 760 * 761 * The object and map must be locked. 762 */ 763static void 764vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, 765 long desired) 766{ 767 vm_object_t backing_object, object; 768 vm_page_t p; 769 int act_delta, remove_mode; 770 771 VM_OBJECT_ASSERT_LOCKED(first_object); 772 if ((first_object->flags & OBJ_FICTITIOUS) != 0) 773 return; 774 for (object = first_object;; object = backing_object) { 775 if (pmap_resident_count(pmap) <= desired) 776 goto unlock_return; 777 VM_OBJECT_ASSERT_LOCKED(object); 778 if ((object->flags & OBJ_UNMANAGED) != 0 || 779 object->paging_in_progress != 0) 780 goto unlock_return; 781 782 remove_mode = 0; 783 if (object->shadow_count > 1) 784 remove_mode = 1; 785 /* 786 * Scan the object's entire memory queue. 
787 */ 788 TAILQ_FOREACH(p, &object->memq, listq) { 789 if (pmap_resident_count(pmap) <= desired) 790 goto unlock_return; 791 if (vm_page_busied(p)) 792 continue; 793 PCPU_INC(cnt.v_pdpages); 794 vm_page_lock(p); 795 if (p->wire_count != 0 || p->hold_count != 0 || 796 !pmap_page_exists_quick(pmap, p)) { 797 vm_page_unlock(p); 798 continue; 799 } 800 act_delta = pmap_ts_referenced(p); 801 if ((p->aflags & PGA_REFERENCED) != 0) { 802 if (act_delta == 0) 803 act_delta = 1; 804 vm_page_aflag_clear(p, PGA_REFERENCED); 805 } 806 if (p->queue != PQ_ACTIVE && act_delta != 0) { 807 vm_page_activate(p); 808 p->act_count += act_delta; 809 } else if (p->queue == PQ_ACTIVE) { 810 if (act_delta == 0) { 811 p->act_count -= min(p->act_count, 812 ACT_DECLINE); 813 if (!remove_mode && p->act_count == 0) { 814 pmap_remove_all(p); 815 vm_page_deactivate(p); 816 } else 817 vm_page_requeue(p); 818 } else { 819 vm_page_activate(p); 820 if (p->act_count < ACT_MAX - 821 ACT_ADVANCE) 822 p->act_count += ACT_ADVANCE; 823 vm_page_requeue(p); 824 } 825 } else if (p->queue == PQ_INACTIVE) 826 pmap_remove_all(p); 827 vm_page_unlock(p); 828 } 829 if ((backing_object = object->backing_object) == NULL) 830 goto unlock_return; 831 VM_OBJECT_RLOCK(backing_object); 832 if (object != first_object) 833 VM_OBJECT_RUNLOCK(object); 834 } 835unlock_return: 836 if (object != first_object) 837 VM_OBJECT_RUNLOCK(object); 838} 839 840/* 841 * deactivate some number of pages in a map, try to do it fairly, but 842 * that is really hard to do. 843 */ 844static void 845vm_pageout_map_deactivate_pages(map, desired) 846 vm_map_t map; 847 long desired; 848{ 849 vm_map_entry_t tmpe; 850 vm_object_t obj, bigobj; 851 int nothingwired; 852 853 if (!vm_map_trylock(map)) 854 return; 855 856 bigobj = NULL; 857 nothingwired = TRUE; 858 859 /* 860 * first, search out the biggest object, and try to free pages from 861 * that. 862 */ 863 tmpe = map->header.next; 864 while (tmpe != &map->header) { 865 if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { 866 obj = tmpe->object.vm_object; 867 if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { 868 if (obj->shadow_count <= 1 && 869 (bigobj == NULL || 870 bigobj->resident_page_count < obj->resident_page_count)) { 871 if (bigobj != NULL) 872 VM_OBJECT_RUNLOCK(bigobj); 873 bigobj = obj; 874 } else 875 VM_OBJECT_RUNLOCK(obj); 876 } 877 } 878 if (tmpe->wired_count > 0) 879 nothingwired = FALSE; 880 tmpe = tmpe->next; 881 } 882 883 if (bigobj != NULL) { 884 vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); 885 VM_OBJECT_RUNLOCK(bigobj); 886 } 887 /* 888 * Next, hunt around for other pages to deactivate. We actually 889 * do this search sort of wrong -- .text first is not the best idea. 890 */ 891 tmpe = map->header.next; 892 while (tmpe != &map->header) { 893 if (pmap_resident_count(vm_map_pmap(map)) <= desired) 894 break; 895 if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { 896 obj = tmpe->object.vm_object; 897 if (obj != NULL) { 898 VM_OBJECT_RLOCK(obj); 899 vm_pageout_object_deactivate_pages(map->pmap, obj, desired); 900 VM_OBJECT_RUNLOCK(obj); 901 } 902 } 903 tmpe = tmpe->next; 904 } 905 906#ifdef __ia64__ 907 /* 908 * Remove all non-wired, managed mappings if a process is swapped out. 909 * This will free page table pages. 910 */ 911 if (desired == 0) 912 pmap_remove_pages(map->pmap); 913#else 914 /* 915 * Remove all mappings if a process is swapped out, this will free page 916 * table pages. 
917 */ 918 if (desired == 0 && nothingwired) { 919 pmap_remove(vm_map_pmap(map), vm_map_min(map), 920 vm_map_max(map)); 921 } 922#endif 923 924 vm_map_unlock(map); 925} 926#endif /* !defined(NO_SWAPPING) */ 927 928/* 929 * vm_pageout_scan does the dirty work for the pageout daemon. 930 * 931 * pass 0 - Update active LRU/deactivate pages 932 * pass 1 - Move inactive to cache or free 933 * pass 2 - Launder dirty pages 934 */ 935static void 936vm_pageout_scan(struct vm_domain *vmd, int pass) 937{ 938 vm_page_t m, next; 939 struct vm_pagequeue *pq; 940 vm_object_t object; 941 long min_scan; 942 int act_delta, addl_page_shortage, deficit, maxscan, page_shortage; 943 int vnodes_skipped = 0; 944 int maxlaunder, scan_tick, scanned; 945 int lockmode; 946 boolean_t queues_locked; 947 948 /* 949 * If we need to reclaim memory ask kernel caches to return 950 * some. We rate limit to avoid thrashing. 951 */ 952 if (vmd == &vm_dom[0] && pass > 0 && 953 (time_uptime - lowmem_uptime) >= lowmem_period) { 954 /* 955 * Decrease registered cache sizes. 956 */ 957 SDT_PROBE0(vm, , , vm__lowmem_scan); 958 EVENTHANDLER_INVOKE(vm_lowmem, 0); 959 /* 960 * We do this explicitly after the caches have been 961 * drained above. 962 */ 963 uma_reclaim(); 964 lowmem_uptime = time_uptime; 965 } 966 967 /* 968 * The addl_page_shortage is the number of temporarily 969 * stuck pages in the inactive queue. In other words, the 970 * number of pages from the inactive count that should be 971 * discounted in setting the target for the active queue scan. 972 */ 973 addl_page_shortage = 0; 974 975 /* 976 * Calculate the number of pages we want to either free or move 977 * to the cache. 978 */ 979 if (pass > 0) { 980 deficit = atomic_readandclear_int(&vm_pageout_deficit); 981 page_shortage = vm_paging_target() + deficit; 982 } else 983 page_shortage = deficit = 0; 984 985 /* 986 * maxlaunder limits the number of dirty pages we flush per scan. 987 * For most systems a smaller value (16 or 32) is more robust under 988 * extreme memory and disk pressure because any unnecessary writes 989 * to disk can result in extreme performance degredation. However, 990 * systems with excessive dirty pages (especially when MAP_NOSYNC is 991 * used) will die horribly with limited laundering. If the pageout 992 * daemon cannot clean enough pages in the first pass, we let it go 993 * all out in succeeding passes. 994 */ 995 if ((maxlaunder = vm_max_launder) <= 1) 996 maxlaunder = 1; 997 if (pass > 1) 998 maxlaunder = 10000; 999 1000 /* 1001 * Start scanning the inactive queue for pages we can move to the 1002 * cache or free. The scan will stop when the target is reached or 1003 * we have scanned the entire inactive queue. Note that m->act_count 1004 * is not used to form decisions for the inactive queue, only for the 1005 * active queue. 
1006 */ 1007 pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; 1008 maxscan = pq->pq_cnt; 1009 vm_pagequeue_lock(pq); 1010 queues_locked = TRUE; 1011 for (m = TAILQ_FIRST(&pq->pq_pl); 1012 m != NULL && maxscan-- > 0 && page_shortage > 0; 1013 m = next) { 1014 vm_pagequeue_assert_locked(pq); 1015 KASSERT(queues_locked, ("unlocked queues")); 1016 KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); 1017 1018 PCPU_INC(cnt.v_pdpages); 1019 next = TAILQ_NEXT(m, plinks.q); 1020 1021 /* 1022 * skip marker pages 1023 */ 1024 if (m->flags & PG_MARKER) 1025 continue; 1026 1027 KASSERT((m->flags & PG_FICTITIOUS) == 0, 1028 ("Fictitious page %p cannot be in inactive queue", m)); 1029 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1030 ("Unmanaged page %p cannot be in inactive queue", m)); 1031 1032 /* 1033 * The page or object lock acquisitions fail if the 1034 * page was removed from the queue or moved to a 1035 * different position within the queue. In either 1036 * case, addl_page_shortage should not be incremented. 1037 */ 1038 if (!vm_pageout_page_lock(m, &next)) { 1039 vm_page_unlock(m); 1040 continue; 1041 } 1042 object = m->object; 1043 if (!VM_OBJECT_TRYWLOCK(object) && 1044 !vm_pageout_fallback_object_lock(m, &next)) { 1045 vm_page_unlock(m); 1046 VM_OBJECT_WUNLOCK(object); 1047 continue; 1048 } 1049 1050 /* 1051 * Don't mess with busy pages, keep them at the 1052 * front of the queue, most likely they are being 1053 * paged out. Increment addl_page_shortage for busy 1054 * pages, because they may leave the inactive queue 1055 * shortly after page scan is finished. 1056 */ 1057 if (vm_page_busied(m)) { 1058 vm_page_unlock(m); 1059 VM_OBJECT_WUNLOCK(object); 1060 addl_page_shortage++; 1061 continue; 1062 } 1063 1064 /* 1065 * We unlock the inactive page queue, invalidating the 1066 * 'next' pointer. Use our marker to remember our 1067 * place. 1068 */ 1069 TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); 1070 vm_pagequeue_unlock(pq); 1071 queues_locked = FALSE; 1072 1073 /* 1074 * We bump the activation count if the page has been 1075 * referenced while in the inactive queue. This makes 1076 * it less likely that the page will be added back to the 1077 * inactive queue prematurely again. Here we check the 1078 * page tables (or emulated bits, if any), given that the upper 1079 * level VM system knows nothing about existing 1080 * references. 1081 */ 1082 act_delta = 0; 1083 if ((m->aflags & PGA_REFERENCED) != 0) { 1084 vm_page_aflag_clear(m, PGA_REFERENCED); 1085 act_delta = 1; 1086 } 1087 if (object->ref_count != 0) { 1088 act_delta += pmap_ts_referenced(m); 1089 } else { 1090 KASSERT(!pmap_page_is_mapped(m), 1091 ("vm_pageout_scan: page %p is mapped", m)); 1092 } 1093 1094 /* 1095 * If the upper level VM system knows about any page 1096 * references, we reactivate the page or requeue it. 1097 */ 1098 if (act_delta != 0) { 1099 if (object->ref_count) { 1100 vm_page_activate(m); 1101 m->act_count += act_delta + ACT_ADVANCE; 1102 } else { 1103 vm_pagequeue_lock(pq); 1104 queues_locked = TRUE; 1105 vm_page_requeue_locked(m); 1106 } 1107 VM_OBJECT_WUNLOCK(object); 1108 vm_page_unlock(m); 1109 goto relock_queues; 1110 } 1111 1112 if (m->hold_count != 0) { 1113 vm_page_unlock(m); 1114 VM_OBJECT_WUNLOCK(object); 1115 1116 /* 1117 * Held pages are essentially stuck in the 1118 * queue. So, they ought to be discounted 1119 * from the inactive count. See the 1120 * calculation of the page_shortage for the 1121 * loop over the active queue below.
1122 */ 1123 addl_page_shortage++; 1124 goto relock_queues; 1125 } 1126 1127 /* 1128 * If the page appears to be clean at the machine-independent 1129 * layer, then remove all of its mappings from the pmap in 1130 * anticipation of placing it onto the cache queue. If, 1131 * however, any of the page's mappings allow write access, 1132 * then the page may still be modified until the last of those 1133 * mappings are removed. 1134 */ 1135 if (object->ref_count != 0) { 1136 vm_page_test_dirty(m); 1137 if (m->dirty == 0) 1138 pmap_remove_all(m); 1139 } 1140 1141 if (m->valid == 0) { 1142 /* 1143 * Invalid pages can be easily freed 1144 */ 1145 vm_page_free(m); 1146 PCPU_INC(cnt.v_dfree); 1147 --page_shortage; 1148 } else if (m->dirty == 0) { 1149 /* 1150 * Clean pages can be placed onto the cache queue. 1151 * This effectively frees them. 1152 */ 1153 vm_page_cache(m); 1154 --page_shortage; 1155 } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { 1156 /* 1157 * Dirty pages need to be paged out, but flushing 1158 * a page is extremely expensive versus freeing 1159 * a clean page. Rather than artificially limiting 1160 * the number of pages we can flush, we instead give 1161 * dirty pages extra priority on the inactive queue 1162 * by forcing them to be cycled through the queue 1163 * twice before being flushed, after which the 1164 * (now clean) page will cycle through once more 1165 * before being freed. This significantly extends 1166 * the thrash point for a heavily loaded machine. 1167 */ 1168 m->flags |= PG_WINATCFLS; 1169 vm_pagequeue_lock(pq); 1170 queues_locked = TRUE; 1171 vm_page_requeue_locked(m); 1172 } else if (maxlaunder > 0) { 1173 /* 1174 * We always want to try to flush some dirty pages if 1175 * we encounter them, to keep the system stable. 1176 * Normally this number is small, but under extreme 1177 * pressure where there are insufficient clean pages 1178 * on the inactive queue, we may have to go all out. 1179 */ 1180 int swap_pageouts_ok; 1181 struct vnode *vp = NULL; 1182 struct mount *mp = NULL; 1183 1184 if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) { 1185 swap_pageouts_ok = 1; 1186 } else { 1187 swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); 1188 swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && 1189 vm_page_count_min()); 1190 1191 } 1192 1193 /* 1194 * We don't bother paging objects that are "dead". 1195 * Those objects are in a "rundown" state. 1196 */ 1197 if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { 1198 vm_pagequeue_lock(pq); 1199 vm_page_unlock(m); 1200 VM_OBJECT_WUNLOCK(object); 1201 queues_locked = TRUE; 1202 vm_page_requeue_locked(m); 1203 goto relock_queues; 1204 } 1205 1206 /* 1207 * The object is already known NOT to be dead. It 1208 * is possible for the vget() to block the whole 1209 * pageout daemon, but the new low-memory handling 1210 * code should prevent it. 1211 * 1212 * The previous code skipped locked vnodes and, worse, 1213 * reordered pages in the queue. This results in 1214 * completely non-deterministic operation and, on a 1215 * busy system, can lead to extremely non-optimal 1216 * pageouts. For example, it can cause clean pages 1217 * to be freed and dirty pages to be moved to the end 1218 * of the queue. Since dirty pages are also moved to 1219 * the end of the queue once-cleaned, this gives 1220 * way too large a weighting to deferring the freeing 1221 * of dirty pages.
1222 * 1223 * We can't wait forever for the vnode lock, we might 1224 * deadlock due to a vn_read() getting stuck in 1225 * vm_wait while holding this vnode. We skip the 1226 * vnode if we can't get it in a reasonable amount 1227 * of time. 1228 */ 1229 if (object->type == OBJT_VNODE) { 1230 vm_page_unlock(m); 1231 vp = object->handle; 1232 if (vp->v_type == VREG && 1233 vn_start_write(vp, &mp, V_NOWAIT) != 0) { 1234 mp = NULL; 1235 ++pageout_lock_miss; 1236 if (object->flags & OBJ_MIGHTBEDIRTY) 1237 vnodes_skipped++; 1238 goto unlock_and_continue; 1239 } 1240 KASSERT(mp != NULL, 1241 ("vp %p with NULL v_mount", vp)); 1242 vm_object_reference_locked(object); 1243 VM_OBJECT_WUNLOCK(object); 1244 lockmode = MNT_SHARED_WRITES(vp->v_mount) ? 1245 LK_SHARED : LK_EXCLUSIVE; 1246 if (vget(vp, lockmode | LK_TIMELOCK, 1247 curthread)) { 1248 VM_OBJECT_WLOCK(object); 1249 ++pageout_lock_miss; 1250 if (object->flags & OBJ_MIGHTBEDIRTY) 1251 vnodes_skipped++; 1252 vp = NULL; 1253 goto unlock_and_continue; 1254 } 1255 VM_OBJECT_WLOCK(object); 1256 vm_page_lock(m); 1257 vm_pagequeue_lock(pq); 1258 queues_locked = TRUE; 1259 /* 1260 * The page might have been moved to another 1261 * queue during potential blocking in vget() 1262 * above. The page might have been freed and 1263 * reused for another vnode. 1264 */ 1265 if (m->queue != PQ_INACTIVE || 1266 m->object != object || 1267 TAILQ_NEXT(m, plinks.q) != &vmd->vmd_marker) { 1268 vm_page_unlock(m); 1269 if (object->flags & OBJ_MIGHTBEDIRTY) 1270 vnodes_skipped++; 1271 goto unlock_and_continue; 1272 } 1273 1274 /* 1275 * The page may have been busied during the 1276 * blocking in vget(). We don't move the 1277 * page back onto the end of the queue so that 1278 * statistics are more correct if we don't. 1279 */ 1280 if (vm_page_busied(m)) { 1281 vm_page_unlock(m); 1282 addl_page_shortage++; 1283 goto unlock_and_continue; 1284 } 1285 1286 /* 1287 * If the page has become held it might 1288 * be undergoing I/O, so skip it 1289 */ 1290 if (m->hold_count != 0) { 1291 vm_page_unlock(m); 1292 addl_page_shortage++; 1293 if (object->flags & OBJ_MIGHTBEDIRTY) 1294 vnodes_skipped++; 1295 goto unlock_and_continue; 1296 } 1297 vm_pagequeue_unlock(pq); 1298 queues_locked = FALSE; 1299 } 1300 1301 /* 1302 * If a page is dirty, then it is either being washed 1303 * (but not yet cleaned) or it is still in the 1304 * laundry. If it is still in the laundry, then we 1305 * start the cleaning operation. 1306 * 1307 * decrement page_shortage on success to account for 1308 * the (future) cleaned page. Otherwise we could wind 1309 * up laundering or cleaning too many pages. 
1310 */ 1311 if (vm_pageout_clean(m) != 0) { 1312 --page_shortage; 1313 --maxlaunder; 1314 } 1315unlock_and_continue: 1316 vm_page_lock_assert(m, MA_NOTOWNED); 1317 VM_OBJECT_WUNLOCK(object); 1318 if (mp != NULL) { 1319 if (queues_locked) { 1320 vm_pagequeue_unlock(pq); 1321 queues_locked = FALSE; 1322 } 1323 if (vp != NULL) 1324 vput(vp); 1325 vm_object_deallocate(object); 1326 vn_finished_write(mp); 1327 } 1328 vm_page_lock_assert(m, MA_NOTOWNED); 1329 goto relock_queues; 1330 } 1331 vm_page_unlock(m); 1332 VM_OBJECT_WUNLOCK(object); 1333relock_queues: 1334 if (!queues_locked) { 1335 vm_pagequeue_lock(pq); 1336 queues_locked = TRUE; 1337 } 1338 next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); 1339 TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); 1340 } 1341 vm_pagequeue_unlock(pq); 1342 1343#if !defined(NO_SWAPPING) 1344 /* 1345 * Wakeup the swapout daemon if we didn't cache or free the targeted 1346 * number of pages. 1347 */ 1348 if (vm_swap_enabled && page_shortage > 0) 1349 vm_req_vmdaemon(VM_SWAP_NORMAL); 1350#endif 1351 1352 /* 1353 * Wakeup the sync daemon if we skipped a vnode in a writeable object 1354 * and we didn't cache or free enough pages. 1355 */ 1356 if (vnodes_skipped > 0 && page_shortage > cnt.v_free_target - 1357 cnt.v_free_min) 1358 (void)speedup_syncer(); 1359 1360 /* 1361 * Compute the number of pages we want to try to move from the 1362 * active queue to the inactive queue. 1363 */ 1364 page_shortage = cnt.v_inactive_target - cnt.v_inactive_count + 1365 vm_paging_target() + deficit + addl_page_shortage; 1366 1367 pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; 1368 vm_pagequeue_lock(pq); 1369 maxscan = pq->pq_cnt; 1370 1371 /* 1372 * If we're just idle polling attempt to visit every 1373 * active page within 'update_period' seconds. 1374 */ 1375 scan_tick = ticks; 1376 if (vm_pageout_update_period != 0) { 1377 min_scan = pq->pq_cnt; 1378 min_scan *= scan_tick - vmd->vmd_last_active_scan; 1379 min_scan /= hz * vm_pageout_update_period; 1380 } else 1381 min_scan = 0; 1382 if (min_scan > 0 || (page_shortage > 0 && maxscan > 0)) 1383 vmd->vmd_last_active_scan = scan_tick; 1384 1385 /* 1386 * Scan the active queue for pages that can be deactivated. Update 1387 * the per-page activity counter and use it to identify deactivation 1388 * candidates. 1389 */ 1390 for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < 1391 min_scan || (page_shortage > 0 && scanned < maxscan)); m = next, 1392 scanned++) { 1393 1394 KASSERT(m->queue == PQ_ACTIVE, 1395 ("vm_pageout_scan: page %p isn't active", m)); 1396 1397 next = TAILQ_NEXT(m, plinks.q); 1398 if ((m->flags & PG_MARKER) != 0) 1399 continue; 1400 KASSERT((m->flags & PG_FICTITIOUS) == 0, 1401 ("Fictitious page %p cannot be in active queue", m)); 1402 KASSERT((m->oflags & VPO_UNMANAGED) == 0, 1403 ("Unmanaged page %p cannot be in active queue", m)); 1404 if (!vm_pageout_page_lock(m, &next)) { 1405 vm_page_unlock(m); 1406 continue; 1407 } 1408 1409 /* 1410 * The count for pagedaemon pages is done after checking the 1411 * page for eligibility... 1412 */ 1413 PCPU_INC(cnt.v_pdpages); 1414 1415 /* 1416 * Check to see "how much" the page has been used. 1417 */ 1418 act_delta = 0; 1419 if (m->aflags & PGA_REFERENCED) { 1420 vm_page_aflag_clear(m, PGA_REFERENCED); 1421 act_delta += 1; 1422 } 1423 /* 1424 * Unlocked object ref count check. Two races are possible. 1425 * 1) The ref was transitioning to zero and we saw non-zero, 1426 * the pmap bits will be checked unnecessarily. 
1427 * 2) The ref was transitioning to one and we saw zero. 1428 * The page lock prevents a new reference to this page so 1429 * we need not check the reference bits. 1430 */ 1431 if (m->object->ref_count != 0) 1432 act_delta += pmap_ts_referenced(m); 1433 1434 /* 1435 * Advance or decay the act_count based on recent usage. 1436 */ 1437 if (act_delta) { 1438 m->act_count += ACT_ADVANCE + act_delta; 1439 if (m->act_count > ACT_MAX) 1440 m->act_count = ACT_MAX; 1441 } else { 1442 m->act_count -= min(m->act_count, ACT_DECLINE); 1443 act_delta = m->act_count; 1444 } 1445 1446 /* 1447 * Move this page to the tail of the active or inactive 1448 * queue depending on usage. 1449 */ 1450 if (act_delta == 0) { 1451 /* Dequeue to avoid later lock recursion. */ 1452 vm_page_dequeue_locked(m); 1453 vm_page_deactivate(m); 1454 page_shortage--; 1455 } else 1456 vm_page_requeue_locked(m); 1457 vm_page_unlock(m); 1458 } 1459 vm_pagequeue_unlock(pq); 1460#if !defined(NO_SWAPPING) 1461 /* 1462 * Idle process swapout -- run once per second. 1463 */ 1464 if (vm_swap_idle_enabled) { 1465 static long lsec; 1466 if (time_second != lsec) { 1467 vm_req_vmdaemon(VM_SWAP_IDLE); 1468 lsec = time_second; 1469 } 1470 } 1471#endif 1472 1473 /* 1474 * If we are critically low on one of RAM or swap and low on 1475 * the other, kill the largest process. However, we avoid 1476 * doing this on the first pass in order to give ourselves a 1477 * chance to flush out dirty vnode-backed pages and to allow 1478 * active pages to be moved to the inactive queue and reclaimed. 1479 */ 1480 vm_pageout_mightbe_oom(vmd, pass); 1481} 1482 1483static int vm_pageout_oom_vote; 1484 1485/* 1486 * The pagedaemon threads randomly select one to perform the 1487 * OOM. Trying to kill processes before all pagedaemons 1488 * have failed to reach the free target is premature. 1489 */ 1490static void 1491vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass) 1492{ 1493 int old_vote; 1494 1495 if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) || 1496 (swap_pager_full && vm_paging_target() > 0))) { 1497 if (vmd->vmd_oom) { 1498 vmd->vmd_oom = FALSE; 1499 atomic_subtract_int(&vm_pageout_oom_vote, 1); 1500 } 1501 return; 1502 } 1503 1504 if (vmd->vmd_oom) 1505 return; 1506 1507 vmd->vmd_oom = TRUE; 1508 old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); 1509 if (old_vote != vm_ndomains - 1) 1510 return; 1511 1512 /* 1513 * The current pagedaemon thread is the last in the quorum to 1514 * start OOM. Initiate the selection and signaling of the 1515 * victim. 1516 */ 1517 vm_pageout_oom(VM_OOM_MEM); 1518 1519 /* 1520 * After one round of OOM terror, recall our vote. On the 1521 * next pass, the current pagedaemon would vote again if the low 1522 * memory condition is still there, due to vmd_oom being 1523 * false. 1524 */ 1525 vmd->vmd_oom = FALSE; 1526 atomic_subtract_int(&vm_pageout_oom_vote, 1); 1527} 1528 1529void 1530vm_pageout_oom(int shortage) 1531{ 1532 struct proc *p, *bigproc; 1533 vm_offset_t size, bigsize; 1534 struct thread *td; 1535 struct vmspace *vm; 1536 1537 /* 1538 * We keep the process bigproc locked once we find it to keep anyone 1539 * from messing with it; however, there is a possibility of 1540 * deadlock if process B is bigproc and one of its child processes 1541 * attempts to propagate a signal to B while we are waiting for A's 1542 * lock while walking this list. To avoid this, we don't block on 1543 * the process lock but just skip a process if it is already locked.
1544 */ 1545 bigproc = NULL; 1546 bigsize = 0; 1547 sx_slock(&allproc_lock); 1548 FOREACH_PROC_IN_SYSTEM(p) { 1549 int breakout; 1550 1551 PROC_LOCK(p); 1552 1553 /* 1554 * If this is a system, protected or killed process, skip it. 1555 */ 1556 if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | 1557 P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || 1558 p->p_pid == 1 || P_KILLED(p) || 1559 (p->p_pid < 48 && swap_pager_avail != 0)) { 1560 PROC_UNLOCK(p); 1561 continue; 1562 } 1563 /* 1564 * If the process is in a non-running type state, 1565 * don't touch it. Check all the threads individually. 1566 */ 1567 breakout = 0; 1568 FOREACH_THREAD_IN_PROC(p, td) { 1569 thread_lock(td); 1570 if (!TD_ON_RUNQ(td) && 1571 !TD_IS_RUNNING(td) && 1572 !TD_IS_SLEEPING(td) && 1573 !TD_IS_SUSPENDED(td) && 1574 !TD_IS_SWAPPED(td)) { 1575 thread_unlock(td); 1576 breakout = 1; 1577 break; 1578 } 1579 thread_unlock(td); 1580 } 1581 if (breakout) { 1582 PROC_UNLOCK(p); 1583 continue; 1584 } 1585 /* 1586 * get the process size 1587 */ 1588 vm = vmspace_acquire_ref(p); 1589 if (vm == NULL) { 1590 PROC_UNLOCK(p); 1591 continue; 1592 } 1593 _PHOLD(p); 1594 if (!vm_map_trylock_read(&vm->vm_map)) { 1595 _PRELE(p); 1596 PROC_UNLOCK(p); 1597 vmspace_free(vm); 1598 continue; 1599 } 1600 PROC_UNLOCK(p); 1601 size = vmspace_swap_count(vm); 1602 vm_map_unlock_read(&vm->vm_map); 1603 if (shortage == VM_OOM_MEM) 1604 size += vmspace_resident_count(vm); 1605 vmspace_free(vm); 1606 /* 1607 * if this process is bigger than the biggest one, 1608 * remember it. 1609 */ 1610 if (size > bigsize) { 1611 if (bigproc != NULL) 1612 PRELE(bigproc); 1613 bigproc = p; 1614 bigsize = size; 1615 } else { 1616 PRELE(p); 1617 } 1618 } 1619 sx_sunlock(&allproc_lock); 1620 if (bigproc != NULL) { 1621 PROC_LOCK(bigproc); 1622 killproc(bigproc, "out of swap space"); 1623 sched_nice(bigproc, PRIO_MIN); 1624 _PRELE(bigproc); 1625 PROC_UNLOCK(bigproc); 1626 wakeup(&cnt.v_free_count); 1627 } 1628} 1629 1630static void 1631vm_pageout_worker(void *arg) 1632{ 1633 struct vm_domain *domain; 1634 int domidx; 1635 1636 domidx = (uintptr_t)arg; 1637 domain = &vm_dom[domidx]; 1638 1639 /* 1640 * XXXKIB It could be useful to bind pageout daemon threads to 1641 * the cores belonging to the domain, from which vm_page_array 1642 * is allocated. 1643 */ 1644 1645 KASSERT(domain->vmd_segs != 0, ("domain without segments")); 1646 domain->vmd_last_active_scan = ticks; 1647 vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); 1648 1649 /* 1650 * The pageout daemon worker is never done, so loop forever. 1651 */ 1652 while (TRUE) { 1653 /* 1654 * If we have enough free memory, wakeup waiters. Do 1655 * not clear vm_pages_needed until we reach our target, 1656 * otherwise we may be woken up over and over again and 1657 * waste a lot of cpu. 1658 */ 1659 mtx_lock(&vm_page_queue_free_mtx); 1660 if (vm_pages_needed && !vm_page_count_min()) { 1661 if (!vm_paging_needed()) 1662 vm_pages_needed = 0; 1663 wakeup(&cnt.v_free_count); 1664 } 1665 if (vm_pages_needed) { 1666 /* 1667 * We're still not done. Either vm_pages_needed was 1668 * set by another thread during the previous scan 1669 * (typically, this happens during a level 0 scan) or 1670 * vm_pages_needed was already set and the scan failed 1671 * to free enough pages. If we haven't yet performed 1672 * a level >= 2 scan (unlimited dirty cleaning), then 1673 * upgrade the level and scan again now. Otherwise, 1674 * sleep a bit and try again later. While sleeping, 1675 * vm_pages_needed can be cleared.
1676 */ 1677 if (domain->vmd_pass > 1) 1678 msleep(&vm_pages_needed, 1679 &vm_page_queue_free_mtx, PVM, "psleep", 1680 hz / 2); 1681 } else { 1682 /* 1683 * Good enough, sleep until required to refresh 1684 * stats. 1685 */ 1686 msleep(&vm_pages_needed, &vm_page_queue_free_mtx, 1687 PVM, "psleep", hz); 1688 } 1689 if (vm_pages_needed) { 1690 cnt.v_pdwakeups++; 1691 domain->vmd_pass++; 1692 } else 1693 domain->vmd_pass = 0; 1694 mtx_unlock(&vm_page_queue_free_mtx); 1695 vm_pageout_scan(domain, domain->vmd_pass); 1696 } 1697} 1698 1699/* 1700 * vm_pageout_init initialises basic pageout daemon settings. 1701 */ 1702static void 1703vm_pageout_init(void) 1704{ 1705 /* 1706 * Initialize some paging parameters. 1707 */ 1708 cnt.v_interrupt_free_min = 2; 1709 if (cnt.v_page_count < 2000) 1710 vm_pageout_page_count = 8; 1711 1712 /* 1713 * v_free_reserved needs to include enough for the largest 1714 * swap pager structures plus enough for any pv_entry structs 1715 * when paging. 1716 */ 1717 if (cnt.v_page_count > 1024) 1718 cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; 1719 else 1720 cnt.v_free_min = 4; 1721 cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + 1722 cnt.v_interrupt_free_min; 1723 cnt.v_free_reserved = vm_pageout_page_count + 1724 cnt.v_pageout_free_min + (cnt.v_page_count / 768); 1725 cnt.v_free_severe = cnt.v_free_min / 2; 1726 cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved; 1727 cnt.v_free_min += cnt.v_free_reserved; 1728 cnt.v_free_severe += cnt.v_free_reserved; 1729 cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; 1730 if (cnt.v_inactive_target > cnt.v_free_count / 3) 1731 cnt.v_inactive_target = cnt.v_free_count / 3; 1732 1733 /* 1734 * Set the default wakeup threshold to be 10% above the minimum 1735 * page limit. This keeps the steady state out of shortfall. 1736 */ 1737 vm_pageout_wakeup_thresh = (cnt.v_free_min / 10) * 11; 1738 1739 /* 1740 * Set interval in seconds for active scan. We want to visit each 1741 * page at least once every ten minutes. This is to prevent worst 1742 * case paging behaviors with stale active LRU. 1743 */ 1744 if (vm_pageout_update_period == 0) 1745 vm_pageout_update_period = 600; 1746 1747 /* XXX does not really belong here */ 1748 if (vm_page_max_wired == 0) 1749 vm_page_max_wired = cnt.v_free_count / 3; 1750} 1751 1752/* 1753 * vm_pageout is the high level pageout daemon. 1754 */ 1755static void 1756vm_pageout(void) 1757{ 1758 int error; 1759#if MAXMEMDOM > 1 1760 int i; 1761#endif 1762 1763 swap_pager_swap_init(); 1764#if MAXMEMDOM > 1 1765 for (i = 1; i < vm_ndomains; i++) { 1766 error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, 1767 curproc, NULL, 0, 0, "dom%d", i); 1768 if (error != 0) { 1769 panic("starting pageout for domain %d, error %d\n", 1770 i, error); 1771 } 1772 } 1773#endif 1774 error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 1775 0, 0, "uma"); 1776 if (error != 0) 1777 panic("starting uma_reclaim helper, error %d\n", error); 1778 vm_pageout_worker((void *)(uintptr_t)0); 1779} 1780 1781/* 1782 * Unless the free page queue lock is held by the caller, this function 1783 * should be regarded as advisory. Specifically, the caller should 1784 * not msleep() on &cnt.v_free_count following this function unless 1785 * the free page queue lock is held until the msleep() is performed. 
1786 */ 1787void 1788pagedaemon_wakeup(void) 1789{ 1790 1791 if (!vm_pages_needed && curthread->td_proc != pageproc) { 1792 vm_pages_needed = 1; 1793 wakeup(&vm_pages_needed); 1794 } 1795} 1796 1797#if !defined(NO_SWAPPING) 1798static void 1799vm_req_vmdaemon(int req) 1800{ 1801 static int lastrun = 0; 1802 1803 mtx_lock(&vm_daemon_mtx); 1804 vm_pageout_req_swapout |= req; 1805 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 1806 wakeup(&vm_daemon_needed); 1807 lastrun = ticks; 1808 } 1809 mtx_unlock(&vm_daemon_mtx); 1810} 1811 1812static void 1813vm_daemon(void) 1814{ 1815 struct rlimit rsslim; 1816 struct proc *p; 1817 struct thread *td; 1818 struct vmspace *vm; 1819 int breakout, swapout_flags, tryagain, attempts; 1820#ifdef RACCT 1821 uint64_t rsize, ravailable; 1822#endif 1823 1824 while (TRUE) { 1825 mtx_lock(&vm_daemon_mtx); 1826 msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", 1827#ifdef RACCT 1828 racct_enable ? hz : 0 1829#else 1830 0 1831#endif 1832 ); 1833 swapout_flags = vm_pageout_req_swapout; 1834 vm_pageout_req_swapout = 0; 1835 mtx_unlock(&vm_daemon_mtx); 1836 if (swapout_flags) 1837 swapout_procs(swapout_flags); 1838 1839 /* 1840 * scan the processes for exceeding their rlimits or if 1841 * process is swapped out -- deactivate pages 1842 */ 1843 tryagain = 0; 1844 attempts = 0; 1845again: 1846 attempts++; 1847 sx_slock(&allproc_lock); 1848 FOREACH_PROC_IN_SYSTEM(p) { 1849 vm_pindex_t limit, size; 1850 1851 /* 1852 * if this is a system process or if we have already 1853 * looked at this process, skip it. 1854 */ 1855 PROC_LOCK(p); 1856 if (p->p_state != PRS_NORMAL || 1857 p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { 1858 PROC_UNLOCK(p); 1859 continue; 1860 } 1861 /* 1862 * if the process is in a non-running type state, 1863 * don't touch it. 1864 */ 1865 breakout = 0; 1866 FOREACH_THREAD_IN_PROC(p, td) { 1867 thread_lock(td); 1868 if (!TD_ON_RUNQ(td) && 1869 !TD_IS_RUNNING(td) && 1870 !TD_IS_SLEEPING(td) && 1871 !TD_IS_SUSPENDED(td)) { 1872 thread_unlock(td); 1873 breakout = 1; 1874 break; 1875 } 1876 thread_unlock(td); 1877 } 1878 if (breakout) { 1879 PROC_UNLOCK(p); 1880 continue; 1881 } 1882 /* 1883 * get a limit 1884 */ 1885 lim_rlimit(p, RLIMIT_RSS, &rsslim); 1886 limit = OFF_TO_IDX( 1887 qmin(rsslim.rlim_cur, rsslim.rlim_max)); 1888 1889 /* 1890 * let processes that are swapped out really be 1891 * swapped out set the limit to nothing (will force a 1892 * swap-out.) 1893 */ 1894 if ((p->p_flag & P_INMEM) == 0) 1895 limit = 0; /* XXX */ 1896 vm = vmspace_acquire_ref(p); 1897 PROC_UNLOCK(p); 1898 if (vm == NULL) 1899 continue; 1900 1901 size = vmspace_resident_count(vm); 1902 if (size >= limit) { 1903 vm_pageout_map_deactivate_pages( 1904 &vm->vm_map, limit); 1905 } 1906#ifdef RACCT 1907 if (racct_enable) { 1908 rsize = IDX_TO_OFF(size); 1909 PROC_LOCK(p); 1910 racct_set(p, RACCT_RSS, rsize); 1911 ravailable = racct_get_available(p, RACCT_RSS); 1912 PROC_UNLOCK(p); 1913 if (rsize > ravailable) { 1914 /* 1915 * Don't be overly aggressive; this 1916 * might be an innocent process, 1917 * and the limit could've been exceeded 1918 * by some memory hog. Don't try 1919 * to deactivate more than 1/4th 1920 * of process' resident set size. 1921 */ 1922 if (attempts <= 8) { 1923 if (ravailable < rsize - 1924 (rsize / 4)) { 1925 ravailable = rsize - 1926 (rsize / 4); 1927 } 1928 } 1929 vm_pageout_map_deactivate_pages( 1930 &vm->vm_map, 1931 OFF_TO_IDX(ravailable)); 1932 /* Update RSS usage after paging out. 
*/ 1933 size = vmspace_resident_count(vm); 1934 rsize = IDX_TO_OFF(size); 1935 PROC_LOCK(p); 1936 racct_set(p, RACCT_RSS, rsize); 1937 PROC_UNLOCK(p); 1938 if (rsize > ravailable) 1939 tryagain = 1; 1940 } 1941 } 1942#endif 1943 vmspace_free(vm); 1944 } 1945 sx_sunlock(&allproc_lock); 1946 if (tryagain != 0 && attempts <= 10) 1947 goto again; 1948 } 1949} 1950#endif /* !defined(NO_SWAPPING) */ 1951
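
The SYSCTL_INT() declarations near the top of this file export the pageout daemon's tunables under the vm tree (vm.pageout_wakeup_thresh, vm.max_launder, vm.pageout_update_period, vm.lowmem_period, vm.pageout_lock_miss, and so on), which is the usual way to observe or tune the daemon from userland. As a minimal illustration only (not part of vm_pageout.c, and assuming a FreeBSD host where these OIDs are present), the hypothetical userland sketch below reads a few of them with sysctlbyname(3):

/*
 * pageout_knobs.c -- illustrative sketch, not part of the kernel source.
 * Reads a few of the vm.* pageout tunables exported by vm_pageout.c.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static void
print_int_knob(const char *name)
{
	int val;
	size_t len = sizeof(val);

	/* Read-only query: newp == NULL, newlen == 0. */
	if (sysctlbyname(name, &val, &len, NULL, 0) == 0)
		printf("%s = %d\n", name, val);
	else
		printf("%s: not available\n", name);
}

int
main(void)
{
	/* OIDs created by the SYSCTL_INT() entries in vm_pageout.c. */
	print_int_knob("vm.pageout_wakeup_thresh");
	print_int_knob("vm.max_launder");
	print_int_knob("vm.pageout_update_period");
	print_int_knob("vm.lowmem_period");
	print_int_knob("vm.pageout_lock_miss");
	return (0);
}

Setting a knob (for example, raising vm.max_launder on a workload that dirties many MAP_NOSYNC pages) would pass a non-NULL newp/newlen pair to sysctlbyname(), or simply use sysctl(8) from the shell; only the entries declared with CTLFLAG_RW above accept writes.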