/*
 *  linux/mm/vmscan.c
 *
 *  The pageout daemon decides which pages to evict (swap out) and
 *  does the actual work of freeing them.
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * The "priority" of VM scanning is how much of the queues we
 * will scan in one go. A value of 6 for DEF_PRIORITY implies
 * that we'll scan 1/64th of the queues ("queue_length >> 6")
 * during a normal aging round.
 */
#define DEF_PRIORITY (6)

/*
 * The swap-out function returns 1 if it successfully
 * scanned all the pages it was asked to (`count').
 * It returns zero if it couldn't do anything.
 *
 * rss may decrease because pages are shared, but this
 * doesn't count as having freed a page.
 */

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
{
	pte_t pte;
	swp_entry_t entry;

	/* Don't look at this pte if it's been accessed recently. */
	if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
		mark_page_accessed(page);
		return 0;
	}

	/* Don't bother unmapping pages that are active */
	if (PageActive(page))
		return 0;

	/* Don't bother replenishing zones not under pressure.. */
	if (!memclass(page_zone(page), classzone))
		return 0;

	if (TryLockPage(page))
		return 0;

	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	flush_cache_page(vma, address);
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);

	if (pte_dirty(pte))
		set_page_dirty(page);

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
		swap_duplicate(entry);
set_swap_pte:
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		mm->rss--;
		UnlockPage(page);
		{
			int freeable = page_count(page) - !!page->buffers <= 2;
			page_cache_release(page);
			return freeable;
		}
	}
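
	/*
	 * Note on the exit path above: at the final page_cache_release()
	 * the page count still includes the reference this pte held, the
	 * page/swap cache reference and possibly one for the buffers.
	 * `freeable' is therefore 1 only when nobody else is using the
	 * page, i.e. when unmapping this pte actually moved the page
	 * closer to being reclaimable, which is what the callers count
	 * as progress.
	 */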

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..  or if it's dirty but has backing store,
	 * just mark the page dirty and drop it.
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	if (page->mapping)
		goto drop_pte;
	if (!PageDirty(page))
		goto drop_pte;

	/*
	 * Anonymous buffercache pages can be left behind by
	 * concurrent truncate and pagefault.
	 */
	if (page->buffers)
		goto preserve;

	/*
	 * This is a dirty, swappable page.  First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	for (;;) {
		entry = get_swap_page();
		if (!entry.val)
			break;
		/* Add it to the swap cache and mark it dirty
		 * (adding to the page cache will clear the dirty
		 * and uptodate bits, so we need to do it again)
		 */
		if (add_to_swap_cache(page, entry) == 0) {
			SetPageUptodate(page);
			set_page_dirty(page);
			goto set_swap_pte;
		}
		/* Raced with "speculative" read_swap_cache_async */
		swap_free(entry);
	}

	/* No swap space left */
preserve:
	set_pte(page_table, pte);
	UnlockPage(page);
	return 0;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return count;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return count;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		if (pte_present(*pte)) {
			struct page *page = pte_page(*pte);

			if (VALID_PAGE(page) && !PageReserved(page)) {
				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
				if (!count) {
					address += PAGE_SIZE;
					break;
				}
			}
		}
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	mm->swap_address = address;
	return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return count;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return count;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return count;
}
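
/*
 * The pte/pmd/pgd walkers above are driven, top down, by swap_out_vma(),
 * swap_out_mm() and swap_out() below.  Each level simply narrows the
 * address range to its own table and passes `count' through; count is
 * only decremented when try_to_swap_out() reports that it made a page
 * reclaimable, and the walk stops as soon as it reaches zero.
 */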

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are reserved */
	if (vma->vm_flags & VM_RESERVED)
		return count;

	pgdir = pgd_offset(mm, address);

	end = vma->vm_end;
	BUG_ON(address >= end);
	do {
		count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return count;
}

/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
struct mm_struct *swap_mm = &init_mm;

/*
 * Returns remaining count of pages to be swapped out by followup call.
 */
static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	spin_lock(&mm->page_table_lock);
	address = mm->swap_address;
	if (address == TASK_SIZE || swap_mm != mm) {
		/* We raced: don't count this mm but try again */
		++*mmcounter;
		goto out_unlock;
	}
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			count = swap_out_vma(mm, vma, address, count, classzone);
			vma = vma->vm_next;
			if (!vma)
				break;
			if (!count)
				goto out_unlock;
			address = vma->vm_start;
		}
	}
	/* Indicate that we reached the end of address space */
	mm->swap_address = TASK_SIZE;

out_unlock:
	spin_unlock(&mm->page_table_lock);
	return count;
}

static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone));
static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)
{
	int counter, nr_pages = SWAP_CLUSTER_MAX;
	struct mm_struct *mm;

	counter = mmlist_nr;
	do {
		if (unlikely(current->need_resched)) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		spin_lock(&mmlist_lock);
		mm = swap_mm;
		while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
			mm->swap_address = 0;
			mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
			if (mm == swap_mm)
				goto empty;
			swap_mm = mm;
		}

		/* Make sure the mm doesn't disappear when we drop the lock.. */
		atomic_inc(&mm->mm_users);
		spin_unlock(&mmlist_lock);

		nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);

		mmput(mm);

		if (!nr_pages)
			return 1;
	} while (--counter >= 0);

	return 0;

empty:
	spin_unlock(&mmlist_lock);
	return 0;
}
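
/*
 * shrink_cache() below scans the tail of the inactive list looking for
 * pages it can free.  Its scan budget depends on the priority: at the
 * default priority of 6 it will look at up to nr_inactive_pages / 6
 * pages, and it tolerates at most min(nr_pages << 4, max_scan / 10)
 * pages that are still mapped or otherwise busy before it gives up on
 * the cache and calls swap_out() to start unmapping process memory
 * instead.
 */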

static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority));
static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)
{
	struct list_head * entry;
	int max_scan = nr_inactive_pages / priority;
	int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);

	spin_lock(&pagemap_lru_lock);
	while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) {
		struct page * page;

		if (unlikely(current->need_resched)) {
			spin_unlock(&pagemap_lru_lock);
			__set_current_state(TASK_RUNNING);
			schedule();
			spin_lock(&pagemap_lru_lock);
			continue;
		}

		page = list_entry(entry, struct page, lru);

		BUG_ON(!PageLRU(page));
		BUG_ON(PageActive(page));

		list_del(entry);
		list_add(entry, &inactive_list);

		/*
		 * Zero page counts can happen because we unlink the pages
		 * _after_ decrementing the usage count..
		 */
		if (unlikely(!page_count(page)))
			continue;

		if (!memclass(page_zone(page), classzone))
			continue;

		/* Racy check to avoid trylocking when not worthwhile */
		if (!page->buffers && (page_count(page) != 1 || !page->mapping))
			goto page_mapped;

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (unlikely(TryLockPage(page))) {
			if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);
				wait_on_page(page);
				page_cache_release(page);
				spin_lock(&pagemap_lru_lock);
			}
			continue;
		}

		if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
			/*
			 * It is not critical here to write it only if
			 * the page is unmapped because any direct writer
			 * like O_DIRECT would set the PG_dirty bitflag
			 * on the physical page after having successfully
			 * pinned it and after the I/O to the page is finished,
			 * so the direct writes to the page cannot get lost.
			 */
			int (*writepage)(struct page *);

			writepage = page->mapping->a_ops->writepage;
			if ((gfp_mask & __GFP_FS) && writepage) {
				ClearPageDirty(page);
				SetPageLaunder(page);
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);

				writepage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}
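
		/*
		 * Note: a dirty page handed to ->writepage above was marked
		 * PG_launder first, so a later scan which finds it still
		 * locked knows the VM itself started the I/O and may wait
		 * for it to finish (the PageLaunder check in the
		 * TryLockPage path above) rather than just skipping it.
		 */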

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 */
		if (page->buffers) {
			spin_unlock(&pagemap_lru_lock);

			/* avoid freeing a locked page */
			page_cache_get(page);

			if (try_to_release_page(page, gfp_mask)) {
				if (!page->mapping) {
					/*
					 * We must not allow an anon page
					 * with no buffers to be visible on
					 * the LRU, so we unlock the page after
					 * taking the lru lock
					 */
					spin_lock(&pagemap_lru_lock);
					UnlockPage(page);
					__lru_cache_del(page);

					/* effectively free the page here */
					page_cache_release(page);

					if (--nr_pages)
						continue;
					break;
				} else {
					/*
					 * The page is still in pagecache so undo the stuff
					 * before the try_to_release_page since we've not
					 * finished and we can now try the next step.
					 */
					page_cache_release(page);

					spin_lock(&pagemap_lru_lock);
				}
			} else {
				/* failed to drop the buffers so stop here */
				UnlockPage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}

		spin_lock(&pagecache_lock);

		/*
		 * This is the non-racy check for a busy page.
		 */
		if (!page->mapping || !is_page_cache_freeable(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
page_mapped:
			if (--max_mapped >= 0)
				continue;

			/*
			 * Alert! We've found too many mapped pages on the
			 * inactive list, so we start swapping out now!
			 */
			spin_unlock(&pagemap_lru_lock);
			swap_out(priority, gfp_mask, classzone);
			return nr_pages;
		}

		/*
		 * It is critical to check PageDirty _after_ we made sure
		 * the page is freeable (so not in use by anybody).
		 */
		if (PageDirty(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
			continue;
		}

		/* point of no return */
		if (likely(!PageSwapCache(page))) {
			__remove_inode_page(page);
			spin_unlock(&pagecache_lock);
		} else {
			swp_entry_t swap;
			swap.val = page->index;
			__delete_from_swap_cache(page);
			spin_unlock(&pagecache_lock);
			swap_free(swap);
		}

		__lru_cache_del(page);
		UnlockPage(page);

		/* effectively free the page here */
		page_cache_release(page);

		if (--nr_pages)
			continue;
		break;
	}
	spin_unlock(&pagemap_lru_lock);

	return nr_pages;
}
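
/*
 * shrink_cache() only ever eats from the tail of the inactive list;
 * new candidates have to be fed to it by moving pages over from the
 * active list, which is what refill_inactive() below does.  Pages that
 * were referenced while on the active list get another full trip around
 * that list instead of being deactivated, and deactivated pages keep
 * PG_referenced set, which lets mark_page_accessed() promote them
 * straight back to the active list on their next use.
 */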

/*
 * This moves pages from the active list to
 * the inactive list.
 *
 * We move them the other way when we see the
 * reference bit on the page.
 */
static void refill_inactive(int nr_pages)
{
	struct list_head * entry;

	spin_lock(&pagemap_lru_lock);
	entry = active_list.prev;
	while (nr_pages && entry != &active_list) {
		struct page * page;

		page = list_entry(entry, struct page, lru);
		entry = entry->prev;
		if (PageTestandClearReferenced(page)) {
			list_del(&page->lru);
			list_add(&page->lru, &active_list);
			continue;
		}

		nr_pages--;

		del_page_from_active_list(page);
		add_page_to_inactive_list(page);
		SetPageReferenced(page);
	}
	spin_unlock(&pagemap_lru_lock);
}

static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
{
	int chunk_size = nr_pages;
	unsigned long ratio;

	nr_pages -= kmem_cache_reap(gfp_mask);
	if (nr_pages <= 0)
		return 0;

	nr_pages = chunk_size;
	/* try to keep the active list 2/3 of the size of the cache */
	ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
	refill_inactive(ratio);

	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
	if (nr_pages <= 0)
		return 0;

	shrink_dcache_memory(priority, gfp_mask);
	shrink_icache_memory(priority, gfp_mask);
#ifdef CONFIG_QUOTA
	shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif

	return nr_pages;
}
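
/*
 * try_to_free_pages_zone() starts at DEF_PRIORITY and retries with an
 * ever lower priority value (i.e. an ever larger scan budget) until a
 * full SWAP_CLUSTER_MAX worth of pages has been freed.  If even the
 * priority-1 pass cannot manage that, out_of_memory() is invoked.
 */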

int try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
{
	int priority = DEF_PRIORITY;
	int nr_pages = SWAP_CLUSTER_MAX;

	gfp_mask = pf_gfp_mask(gfp_mask);
	do {
		nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
		if (nr_pages <= 0)
			return 1;
	} while (--priority);

	/*
	 * Hmm.. Cache shrink failed - time to kill something?
	 * Mhwahahhaha! This is the part I really like. Giggle.
	 */
	out_of_memory();
	return 0;
}

int try_to_free_pages(unsigned int gfp_mask)
{
	pg_data_t *pgdat;
	zonelist_t *zonelist;
	unsigned long pf_free_pages;
	int error = 0;

	pf_free_pages = current->flags & PF_FREE_PAGES;
	current->flags &= ~PF_FREE_PAGES;

	for_each_pgdat(pgdat) {
		zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
		error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
	}

	current->flags |= pf_free_pages;
	return error;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

static int check_classzone_need_balance(zone_t * classzone)
{
	zone_t * first_classzone;

	first_classzone = classzone->zone_pgdat->node_zones;
	while (classzone >= first_classzone) {
		if (classzone->free_pages > classzone->pages_high)
			return 0;
		classzone--;
	}
	return 1;
}

static int kswapd_balance_pgdat(pg_data_t * pgdat)
{
	int need_more_balance = 0, i;
	zone_t * zone;

	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (unlikely(current->need_resched))
			schedule();
		if (!zone->need_balance)
			continue;
		if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
			zone->need_balance = 0;
			__set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(HZ);
			continue;
		}
		if (check_classzone_need_balance(zone))
			need_more_balance = 1;
		else
			zone->need_balance = 0;
	}

	return need_more_balance;
}

static void kswapd_balance(void)
{
	int need_more_balance;
	pg_data_t * pgdat;

	do {
		need_more_balance = 0;

		for_each_pgdat(pgdat)
			need_more_balance |= kswapd_balance_pgdat(pgdat);
	} while (need_more_balance);
}

static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
{
	zone_t * zone;
	int i;

	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (!zone->need_balance)
			continue;
		return 0;
	}

	return 1;
}

static int kswapd_can_sleep(void)
{
	pg_data_t * pgdat;

	for_each_pgdat(pgdat) {
		if (!kswapd_can_sleep_pgdat(pgdat))
			return 0;
	}

	return 1;
}
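
/*
 * kswapd sleeps on kswapd_wait until the page allocator finds a zone
 * short on free pages, sets zone->need_balance and wakes the queue
 * (see "__alloc_pages()").  The mb() in the main loop below orders
 * kswapd's sleep preparation against re-reading those need_balance
 * flags, so a wakeup that races with kswapd going to sleep is not
 * lost.
 */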

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	daemonize();
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;

	/*
	 * Kswapd main loop.
	 */
	for (;;) {
		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&kswapd_wait, &wait);

		mb();
		if (kswapd_can_sleep())
			schedule();

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&kswapd_wait, &wait);

		/*
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		kswapd_balance();
		run_task_queue(&tq_disk);
	}
}

static int __init kswapd_init(void)
{
	printk("Starting kswapd\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;
}

module_init(kswapd_init)