/*
 *  linux/mm/vmscan.c
 *
 *  The pageout daemon decides which pages to evict (swap out) and
 *  does the actual work of freeing them.
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * The "priority" of VM scanning is how much of the queues we
 * will scan in one go. A value of 6 for DEF_PRIORITY implies
 * that we'll scan 1/64th of the queues ("queue_length >> 6")
 * during a normal aging round.
 */
#define DEF_PRIORITY (6)

/*
 * The swap-out function returns 1 if it successfully
 * scanned all the pages it was asked to (`count').
 * It returns zero if it couldn't do anything.
 *
 * rss may decrease because pages are shared, but this
 * doesn't count as having freed a page.
 */

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
{
	pte_t pte;
	swp_entry_t entry;

	/* Don't look at this pte if it's been accessed recently. */
	if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
		mark_page_accessed(page);
		return 0;
	}

	/* Don't bother unmapping pages that are active */
	if (PageActive(page))
		return 0;

	/* Don't bother replenishing zones not under pressure.. */
	if (!memclass(page_zone(page), classzone))
		return 0;

	if (TryLockPage(page))
		return 0;

	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	flush_cache_page(vma, address);
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);

	if (pte_dirty(pte))
		set_page_dirty(page);

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 */
	if (PageSwapCache(page)) {
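		/*
		 * For a swap cache page, page->index holds the swp_entry_t
		 * value, so the swap pte can be rebuilt without allocating
		 * a new swap entry.
		 */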
		entry.val = page->index;
		swap_duplicate(entry);
set_swap_pte:
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		mm->rss--;
		UnlockPage(page);
		{
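			/*
			 * The page counts as freeable if, apart from any
			 * reference pinned by its buffers, only our reference
			 * and the swap/page cache's one are left.
			 */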
			int freeable = page_count(page) - !!page->buffers <= 2;
			page_cache_release(page);
			return freeable;
		}
	}

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..  or if it's dirty but has backing store,
	 * just mark the page dirty and drop it.
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	if (page->mapping)
		goto drop_pte;
	if (!PageDirty(page))
		goto drop_pte;

	/*
	 * Anonymous buffercache pages can be left behind by
	 * concurrent truncate and pagefault.
	 */
	if (page->buffers)
		goto preserve;

	/*
	 * This is a dirty, swappable page.  First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	for (;;) {
		entry = get_swap_page();
		if (!entry.val)
			break;
		/* Add it to the swap cache and mark it dirty
		 * (adding to the page cache will clear the dirty
		 * and uptodate bits, so we need to do it again)
		 */
		if (add_to_swap_cache(page, entry) == 0) {
			SetPageUptodate(page);
			set_page_dirty(page);
			goto set_swap_pte;
		}
		/* Raced with "speculative" read_swap_cache_async */
		swap_free(entry);
	}

	/* No swap space left */
preserve:
	set_pte(page_table, pte);
	UnlockPage(page);
	return 0;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return count;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return count;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		if (pte_present(*pte)) {
			struct page *page = pte_page(*pte);

			if (VALID_PAGE(page) && !PageReserved(page)) {
				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
				if (!count) {
					address += PAGE_SIZE;
					break;
				}
			}
		}
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
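	/* Remember where to resume scanning this mm on the next pass. */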
	mm->swap_address = address;
	return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return count;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return count;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are reserved */
	if (vma->vm_flags & VM_RESERVED)
		return count;

	pgdir = pgd_offset(mm, address);

	end = vma->vm_end;
	BUG_ON(address >= end);
	do {
		count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return count;
}

/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
struct mm_struct *swap_mm = &init_mm;

/*
 * Returns the remaining count of pages to be swapped out by a follow-up call.
 */
static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	spin_lock(&mm->page_table_lock);
	address = mm->swap_address;
	if (address == TASK_SIZE || swap_mm != mm) {
		/* We raced: don't count this mm but try again */
		++*mmcounter;
		goto out_unlock;
	}
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			count = swap_out_vma(mm, vma, address, count, classzone);
			vma = vma->vm_next;
			if (!vma)
				break;
			if (!count)
				goto out_unlock;
			address = vma->vm_start;
		}
	}
	/* Indicate that we reached the end of address space */
	mm->swap_address = TASK_SIZE;

out_unlock:
	spin_unlock(&mm->page_table_lock);
	return count;
}

static int FASTCALL(swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone));
static int swap_out(unsigned int priority, unsigned int gfp_mask, zone_t * classzone)
{
	int counter, nr_pages = SWAP_CLUSTER_MAX;
	struct mm_struct *mm;

	counter = mmlist_nr;
	do {
		if (unlikely(current->need_resched)) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		spin_lock(&mmlist_lock);
		mm = swap_mm;
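		/*
		 * Advance swap_mm round-robin over the mmlist, skipping
		 * init_mm and any mm already scanned to the end of its
		 * address space (resetting swap_address so a later cycle
		 * starts over).  If we wrap around, nothing is left to scan.
		 */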
		while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
			mm->swap_address = 0;
			mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
			if (mm == swap_mm)
				goto empty;
			swap_mm = mm;
		}

		/* Make sure the mm doesn't disappear when we drop the lock.. */
		atomic_inc(&mm->mm_users);
		spin_unlock(&mmlist_lock);

		nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);

		mmput(mm);

		if (!nr_pages)
			return 1;
	} while (--counter >= 0);

	return 0;

empty:
	spin_unlock(&mmlist_lock);
	return 0;
}

static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority));
static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority)
{
	struct list_head * entry;
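	/*
	 * Scanning budget: look at no more than 1/priority of the
	 * inactive list in one go, and tolerate only a limited number
	 * of mapped pages before falling back to swap_out() below.
	 */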
	int max_scan = nr_inactive_pages / priority;
	int max_mapped = min((nr_pages << (10 - priority)), max_scan / 10);

	spin_lock(&pagemap_lru_lock);
	while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) {
		struct page * page;

		if (unlikely(current->need_resched)) {
			spin_unlock(&pagemap_lru_lock);
			__set_current_state(TASK_RUNNING);
			schedule();
			spin_lock(&pagemap_lru_lock);
			continue;
		}

		page = list_entry(entry, struct page, lru);

		BUG_ON(!PageLRU(page));
		BUG_ON(PageActive(page));

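		/*
		 * Rotate the page to the far end of the inactive list so
		 * that this scan, which works from the tail, doesn't see
		 * it again straight away.
		 */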
		list_del(entry);
		list_add(entry, &inactive_list);

		/*
		 * Zero page counts can happen because we unlink the pages
		 * _after_ decrementing the usage count..
		 */
		if (unlikely(!page_count(page)))
			continue;

		if (!memclass(page_zone(page), classzone))
			continue;

		/* Racy check to avoid trylocking when not worthwhile */
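		/*
		 * A page with no buffers that has extra references (it is
		 * typically mapped into processes) or no ->mapping (it is
		 * anonymous) can't be freed via the pagecache path below,
		 * so it only counts against max_mapped.
		 */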
		if (!page->buffers && (page_count(page) != 1 || !page->mapping))
			goto page_mapped;

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (unlikely(TryLockPage(page))) {
			if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);
				wait_on_page(page);
				page_cache_release(page);
				spin_lock(&pagemap_lru_lock);
			}
			continue;
		}

		if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
			/*
			 * It is not critical here to write the page out only
			 * when it is unmapped, because any direct writer
			 * like O_DIRECT sets the PG_dirty bitflag on the
			 * physical page after having successfully pinned it
			 * and after the I/O to the page is finished, so the
			 * direct writes to the page cannot get lost.
			 */
			int (*writepage)(struct page *);

			writepage = page->mapping->a_ops->writepage;
			if ((gfp_mask & __GFP_FS) && writepage) {
				ClearPageDirty(page);
				SetPageLaunder(page);
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);

				writepage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 */
		if (page->buffers) {
			spin_unlock(&pagemap_lru_lock);

			/* avoid freeing a locked page */
			page_cache_get(page);

			if (try_to_release_page(page, gfp_mask)) {
				if (!page->mapping) {
					/*
					 * We must not allow an anon page
					 * with no buffers to be visible on
					 * the LRU, so we unlock the page after
					 * taking the lru lock
					 */
					spin_lock(&pagemap_lru_lock);
					UnlockPage(page);
					__lru_cache_del(page);

					/* effectively free the page here */
					page_cache_release(page);

					if (--nr_pages)
						continue;
					break;
				} else {
					/*
					 * The page is still in the pagecache, so undo what
					 * we did before the try_to_release_page: we've not
					 * finished with it and can now try the next step.
					 */
					page_cache_release(page);

					spin_lock(&pagemap_lru_lock);
				}
			} else {
				/* failed to drop the buffers so stop here */
				UnlockPage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}

		spin_lock(&pagecache_lock);

		/*
		 * This is the non-racy check for a busy page.
		 */
		if (!page->mapping || !is_page_cache_freeable(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
page_mapped:
			if (--max_mapped >= 0)
				continue;

			/*
			 * Alert! We've found too many mapped pages on the
			 * inactive list, so we start swapping out now!
			 */
			spin_unlock(&pagemap_lru_lock);
			swap_out(priority, gfp_mask, classzone);
			return nr_pages;
		}

		/*
		 * It is critical to check PageDirty _after_ we made sure
		 * the page is freeable, i.e. not in use by anybody.
		 */
		if (PageDirty(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
			continue;
		}

		/* point of no return */
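		/*
		 * The page is locked, clean and has no users left: the only
		 * remaining reference is the cache's own, so detach it from
		 * the page/swap cache and the LRU and drop that last
		 * reference below.
		 */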
		if (likely(!PageSwapCache(page))) {
			__remove_inode_page(page);
			spin_unlock(&pagecache_lock);
		} else {
			swp_entry_t swap;
			swap.val = page->index;
			__delete_from_swap_cache(page);
			spin_unlock(&pagecache_lock);
			swap_free(swap);
		}

		__lru_cache_del(page);
		UnlockPage(page);

		/* effectively free the page here */
		page_cache_release(page);

		if (--nr_pages)
			continue;
		break;
	}
	spin_unlock(&pagemap_lru_lock);

	return nr_pages;
}

/*
 * This moves pages from the active list to
 * the inactive list.
 *
 * We move them the other way when we see the
 * reference bit on the page.
 */
static void refill_inactive(int nr_pages)
{
	struct list_head * entry;

	spin_lock(&pagemap_lru_lock);
	entry = active_list.prev;
	while (nr_pages && entry != &active_list) {
		struct page * page;

		page = list_entry(entry, struct page, lru);
		entry = entry->prev;
		if (PageTestandClearReferenced(page)) {
			list_del(&page->lru);
			list_add(&page->lru, &active_list);
			continue;
		}

		nr_pages--;

		del_page_from_active_list(page);
		add_page_to_inactive_list(page);
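		/*
		 * Leave the referenced bit set so that one more access is
		 * enough for mark_page_accessed() to move the page straight
		 * back to the active list.
		 */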
		SetPageReferenced(page);
	}
	spin_unlock(&pagemap_lru_lock);
}

static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages));
static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)
{
	int chunk_size = nr_pages;
	unsigned long ratio;

	nr_pages -= kmem_cache_reap(gfp_mask);
	if (nr_pages <= 0)
		return 0;

	nr_pages = chunk_size;
	/* try to keep the active list 2/3 of the size of the cache */
	ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
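	/*
	 * With the active list at twice the size of the inactive list
	 * (the 2/3 target above), ratio works out to roughly nr_pages;
	 * a larger active list deactivates proportionally more.
	 */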
	refill_inactive(ratio);

	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
	if (nr_pages <= 0)
		return 0;

	shrink_dcache_memory(priority, gfp_mask);
	shrink_icache_memory(priority, gfp_mask);
#ifdef CONFIG_QUOTA
	shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
#endif

	return nr_pages;
}

int try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
{
	int priority = DEF_PRIORITY;
	int nr_pages = SWAP_CLUSTER_MAX;

	gfp_mask = pf_gfp_mask(gfp_mask);
	do {
		nr_pages = shrink_caches(classzone, priority, gfp_mask, nr_pages);
		if (nr_pages <= 0)
			return 1;
	} while (--priority);

	/*
	 * Hmm.. Cache shrink failed - time to kill something?
	 * Mhwahahhaha! This is the part I really like. Giggle.
	 */
	out_of_memory();
	return 0;
}

int try_to_free_pages(unsigned int gfp_mask)
{
	pg_data_t *pgdat;
	zonelist_t *zonelist;
	unsigned long pf_free_pages;
	int error = 0;

	pf_free_pages = current->flags & PF_FREE_PAGES;
	current->flags &= ~PF_FREE_PAGES;

	for_each_pgdat(pgdat) {
		zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
		error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
	}

	current->flags |= pf_free_pages;
	return error;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

static int check_classzone_need_balance(zone_t * classzone)
{
	zone_t * first_classzone;

	first_classzone = classzone->zone_pgdat->node_zones;
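	/*
	 * An allocation for this class zone can fall back to any lower
	 * zone, so balancing is only needed when every zone from the
	 * class zone down is at or below its pages_high watermark.
	 */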
	while (classzone >= first_classzone) {
		if (classzone->free_pages > classzone->pages_high)
			return 0;
		classzone--;
	}
	return 1;
}

static int kswapd_balance_pgdat(pg_data_t * pgdat)
{
	int need_more_balance = 0, i;
	zone_t * zone;

	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (unlikely(current->need_resched))
			schedule();
		if (!zone->need_balance)
			continue;
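		/*
		 * If we can't make progress on this zone, clear the flag
		 * and back off for a second instead of spinning; the zone
		 * is probably unreclaimable for now.
		 */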
		if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
			zone->need_balance = 0;
			__set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(HZ);
			continue;
		}
		if (check_classzone_need_balance(zone))
			need_more_balance = 1;
		else
			zone->need_balance = 0;
	}

	return need_more_balance;
}

static void kswapd_balance(void)
{
	int need_more_balance;
	pg_data_t * pgdat;

	do {
		need_more_balance = 0;

		for_each_pgdat(pgdat)
			need_more_balance |= kswapd_balance_pgdat(pgdat);
	} while (need_more_balance);
}

static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
{
	zone_t * zone;
	int i;

	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (!zone->need_balance)
			continue;
		return 0;
	}

	return 1;
}

static int kswapd_can_sleep(void)
{
	pg_data_t * pgdat;

	for_each_pgdat(pgdat) {
		if (!kswapd_can_sleep_pgdat(pgdat))
			return 0;
	}

	return 1;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	daemonize();
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;

	/*
	 * Kswapd main loop.
	 */
	for (;;) {
		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&kswapd_wait, &wait);

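		/*
		 * Make sure we're on the wait queue before re-reading the
		 * need_balance flags, so a waker that sets need_balance and
		 * then wakes kswapd_wait can't be missed.
		 */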
		mb();
		if (kswapd_can_sleep())
			schedule();

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&kswapd_wait, &wait);

		/*
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		kswapd_balance();
		run_task_queue(&tq_disk);
	}
}

static int __init kswapd_init(void)
{
	printk("Starting kswapd\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;
}

module_init(kswapd_init)
