1/*
2 *	linux/mm/filemap.c
3 *
4 * Copyright (C) 1994-1999  Linus Torvalds
5 */
6
7/*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
11 */
12#include <linux/module.h>
13#include <linux/slab.h>
14#include <linux/shm.h>
15#include <linux/mman.h>
16#include <linux/locks.h>
17#include <linux/pagemap.h>
18#include <linux/swap.h>
19#include <linux/smp_lock.h>
20#include <linux/blkdev.h>
21#include <linux/file.h>
22#include <linux/swapctl.h>
23#include <linux/init.h>
24#include <linux/mm.h>
25#include <linux/iobuf.h>
26
27#include <asm/pgalloc.h>
28#include <asm/uaccess.h>
29#include <asm/mman.h>
30
31#include <linux/highmem.h>
32
33/*
34 * Shared mappings implemented 30.11.1994. It's not fully working yet,
35 * though.
36 *
37 * Shared mappings now work. 15.8.1995  Bruno.
38 *
39 * finished 'unifying' the page and buffer cache and SMP-threaded the
40 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41 *
42 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
43 */
44
45atomic_t page_cache_size = ATOMIC_INIT(0);
46unsigned int page_hash_bits;
47struct page **page_hash_table;
48
49int vm_max_readahead = 31;
50int vm_min_readahead = 3;
51EXPORT_SYMBOL(vm_max_readahead);
52EXPORT_SYMBOL(vm_min_readahead);
53
54
55spinlock_cacheline_t pagecache_lock_cacheline  = {SPIN_LOCK_UNLOCKED};
56/*
57 * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
58 *	with the pagecache_lock held.
59 *
60 * Ordering:
61 *	swap_lock ->
62 *		pagemap_lru_lock ->
63 *			pagecache_lock
64 */
65spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
66
67#define CLUSTER_PAGES		(1 << page_cluster)
68#define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
69
70static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
71static void add_page_to_hash_queue(struct page * page, struct page **p)
72{
73	struct page *next = *p;
74
75	*p = page;
76	page->next_hash = next;
77	page->pprev_hash = p;
78	if (next)
79		next->pprev_hash = &page->next_hash;
80	if (page->buffers)
81		PAGE_BUG(page);
82	atomic_inc(&page_cache_size);
83}
84
85static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
86{
87	struct list_head *head = &mapping->clean_pages;
88
89	mapping->nrpages++;
90	list_add(&page->list, head);
91	page->mapping = mapping;
92}
93
94static inline void remove_page_from_inode_queue(struct page * page)
95{
96	struct address_space * mapping = page->mapping;
97
98	mapping->nrpages--;
99	list_del(&page->list);
100	page->mapping = NULL;
101}
102
103static inline void remove_page_from_hash_queue(struct page * page)
104{
105	struct page *next = page->next_hash;
106	struct page **pprev = page->pprev_hash;
107
108	if (next)
109		next->pprev_hash = pprev;
110	*pprev = next;
111	page->pprev_hash = NULL;
112	atomic_dec(&page_cache_size);
113}
114
115/*
116 * Remove a page from the page cache and free it. Caller has to make
117 * sure the page is locked and that nobody else uses it - or that usage
118 * is safe.
119 */
120void __remove_inode_page(struct page *page)
121{
122	if (PageDirty(page) && !PageSwapCache(page))
123		BUG();
124	remove_page_from_inode_queue(page);
125	remove_page_from_hash_queue(page);
126}
127
128void remove_inode_page(struct page *page)
129{
130	if (!PageLocked(page))
131		PAGE_BUG(page);
132
133	spin_lock(&pagecache_lock);
134	__remove_inode_page(page);
135	spin_unlock(&pagecache_lock);
136}
137
138static inline int sync_page(struct page *page)
139{
140	struct address_space *mapping = page->mapping;
141
142	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
143		return mapping->a_ops->sync_page(page);
144	return 0;
145}
146
147/*
148 * Add a page to the dirty page list.
149 */
150void set_page_dirty(struct page *page)
151{
152	if (!test_and_set_bit(PG_dirty, &page->flags)) {
153		struct address_space *mapping = page->mapping;
154
155		if (mapping) {
156			spin_lock(&pagecache_lock);
157			mapping = page->mapping;
158			if (mapping) {	/* may have been truncated */
159				list_del(&page->list);
160				list_add(&page->list, &mapping->dirty_pages);
161			}
162			spin_unlock(&pagecache_lock);
163
164			if (mapping && mapping->host)
165				mark_inode_dirty_pages(mapping->host);
166		}
167	}
168}
169
170/**
171 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
172 * @inode: the inode which pages we want to invalidate
173 *
174 * This function only removes the unlocked pages, if you want to
175 * remove all the pages of one inode, you must call truncate_inode_pages.
176 */
177
178void invalidate_inode_pages(struct inode * inode)
179{
180	struct list_head *head, *curr;
181	struct page * page;
182
183	head = &inode->i_mapping->clean_pages;
184
185	spin_lock(&pagemap_lru_lock);
186	spin_lock(&pagecache_lock);
187	curr = head->next;
188
189	while (curr != head) {
190		page = list_entry(curr, struct page, list);
191		curr = curr->next;
192
193		/* We cannot invalidate something in dirty.. */
194		if (PageDirty(page))
195			continue;
196
197		/* ..or locked */
198		if (TryLockPage(page))
199			continue;
200
201		if (page->buffers && !try_to_free_buffers(page, 0))
202			goto unlock;
203
204		if (page_count(page) != 1)
205			goto unlock;
206
207		__lru_cache_del(page);
208		__remove_inode_page(page);
209		UnlockPage(page);
210		page_cache_release(page);
211		continue;
212unlock:
213		UnlockPage(page);
214		continue;
215	}
216
217	spin_unlock(&pagecache_lock);
218	spin_unlock(&pagemap_lru_lock);
219}
220
221static int do_flushpage(struct page *page, unsigned long offset)
222{
223	int (*flushpage) (struct page *, unsigned long);
224	flushpage = page->mapping->a_ops->flushpage;
225	if (flushpage)
226		return (*flushpage)(page, offset);
227	return block_flushpage(page, offset);
228}
229
230static inline void truncate_partial_page(struct page *page, unsigned partial)
231{
232	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
233	if (page->buffers)
234		do_flushpage(page, partial);
235}
236
237static void truncate_complete_page(struct page *page)
238{
239	/* Leave it on the LRU if it gets converted into anonymous buffers */
240	if (!page->buffers || do_flushpage(page, 0))
241		lru_cache_del(page);
242
243	/*
244	 * We remove the page from the page cache _after_ we have
245	 * destroyed all buffer-cache references to it. Otherwise some
246	 * other process might think this inode page is not in the
247	 * page cache and creates a buffer-cache alias to it causing
248	 * all sorts of fun problems ...
249	 */
250	ClearPageDirty(page);
251	ClearPageUptodate(page);
252	remove_inode_page(page);
253	page_cache_release(page);
254}
255
256static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
257static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
258{
259	struct list_head *curr;
260	struct page * page;
261	int unlocked = 0;
262
263 restart:
264	curr = head->prev;
265	while (curr != head) {
266		unsigned long offset;
267
268		page = list_entry(curr, struct page, list);
269		offset = page->index;
270
271		/* Is one of the pages to truncate? */
272		if ((offset >= start) || (*partial && (offset + 1) == start)) {
273			int failed;
274
275			page_cache_get(page);
276			failed = TryLockPage(page);
277
278			list_del(head);
279			if (!failed)
280				/* Restart after this page */
281				list_add_tail(head, curr);
282			else
283				/* Restart on this page */
284				list_add(head, curr);
285
286			spin_unlock(&pagecache_lock);
287			unlocked = 1;
288
289 			if (!failed) {
290				if (*partial && (offset + 1) == start) {
291					truncate_partial_page(page, *partial);
292					*partial = 0;
293				} else
294					truncate_complete_page(page);
295
296				UnlockPage(page);
297			} else
298 				wait_on_page(page);
299
300			page_cache_release(page);
301
302			if (current->need_resched) {
303				__set_current_state(TASK_RUNNING);
304				schedule();
305			}
306
307			spin_lock(&pagecache_lock);
308			goto restart;
309		}
310		curr = curr->prev;
311	}
312	return unlocked;
313}
314
315
316/**
317 * truncate_inode_pages - truncate *all* the pages from an offset
318 * @mapping: mapping to truncate
319 * @lstart: offset from with to truncate
320 *
321 * Truncate the page cache at a set offset, removing the pages
322 * that are beyond that offset (and zeroing out partial pages).
323 * If any page is locked we wait for it to become unlocked.
324 */
325void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
326{
327	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
328	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
329	int unlocked;
330
331	spin_lock(&pagecache_lock);
332	do {
333		unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
334		unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
335		unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
336	} while (unlocked);
337	/* Traversed all three lists without dropping the lock */
338	spin_unlock(&pagecache_lock);
339}
340
341static inline int invalidate_this_page2(struct page * page,
342					struct list_head * curr,
343					struct list_head * head)
344{
345	int unlocked = 1;
346
347	/*
348	 * The page is locked and we hold the pagecache_lock as well
349	 * so both page_count(page) and page->buffers stays constant here.
350	 */
351	if (page_count(page) == 1 + !!page->buffers) {
352		/* Restart after this page */
353		list_del(head);
354		list_add_tail(head, curr);
355
356		page_cache_get(page);
357		spin_unlock(&pagecache_lock);
358		truncate_complete_page(page);
359	} else {
360		if (page->buffers) {
361			/* Restart after this page */
362			list_del(head);
363			list_add_tail(head, curr);
364
365			page_cache_get(page);
366			spin_unlock(&pagecache_lock);
367			block_invalidate_page(page);
368		} else
369			unlocked = 0;
370
371		ClearPageDirty(page);
372		ClearPageUptodate(page);
373	}
374
375	return unlocked;
376}
377
378static int FASTCALL(invalidate_list_pages2(struct list_head *));
379static int invalidate_list_pages2(struct list_head *head)
380{
381	struct list_head *curr;
382	struct page * page;
383	int unlocked = 0;
384
385 restart:
386	curr = head->prev;
387	while (curr != head) {
388		page = list_entry(curr, struct page, list);
389
390		if (!TryLockPage(page)) {
391			int __unlocked;
392
393			__unlocked = invalidate_this_page2(page, curr, head);
394			UnlockPage(page);
395			unlocked |= __unlocked;
396			if (!__unlocked) {
397				curr = curr->prev;
398				continue;
399			}
400		} else {
401			/* Restart on this page */
402			list_del(head);
403			list_add(head, curr);
404
405			page_cache_get(page);
406			spin_unlock(&pagecache_lock);
407			unlocked = 1;
408			wait_on_page(page);
409		}
410
411		page_cache_release(page);
412		if (current->need_resched) {
413			__set_current_state(TASK_RUNNING);
414			schedule();
415		}
416
417		spin_lock(&pagecache_lock);
418		goto restart;
419	}
420	return unlocked;
421}
422
423/**
424 * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
425 * free the pages because they're mapped.
426 * @mapping: the address_space which pages we want to invalidate
427 */
428void invalidate_inode_pages2(struct address_space * mapping)
429{
430	int unlocked;
431
432	spin_lock(&pagecache_lock);
433	do {
434		unlocked = invalidate_list_pages2(&mapping->clean_pages);
435		unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
436		unlocked |= invalidate_list_pages2(&mapping->locked_pages);
437	} while (unlocked);
438	spin_unlock(&pagecache_lock);
439}
440
441static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
442{
443	goto inside;
444
445	for (;;) {
446		page = page->next_hash;
447inside:
448		if (!page)
449			goto not_found;
450		if (page->mapping != mapping)
451			continue;
452		if (page->index == offset)
453			break;
454	}
455
456not_found:
457	return page;
458}
459
460static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
461{
462	struct list_head *curr;
463	struct page *page;
464	int retval = 0;
465
466	spin_lock(&pagecache_lock);
467	curr = head->next;
468	while (curr != head) {
469		page = list_entry(curr, struct page, list);
470		curr = curr->next;
471		if (!page->buffers)
472			continue;
473		if (page->index >= end)
474			continue;
475		if (page->index < start)
476			continue;
477
478		page_cache_get(page);
479		spin_unlock(&pagecache_lock);
480		lock_page(page);
481
482		/* The buffers could have been free'd while we waited for the page lock */
483		if (page->buffers)
484			retval |= fn(page);
485
486		UnlockPage(page);
487		spin_lock(&pagecache_lock);
488		curr = page->list.next;
489		page_cache_release(page);
490	}
491	spin_unlock(&pagecache_lock);
492
493	return retval;
494}
495
496/*
497 * Two-stage data sync: first start the IO, then go back and
498 * collect the information..
499 */
500int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
501{
502	int retval;
503
504	/* writeout dirty buffers on pages from both clean and dirty lists */
505	retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
506	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
507	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
508
509	/* now wait for locked buffers on pages from both clean and dirty lists */
510	retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
511	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
512	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
513
514	return retval;
515}
516
517/*
518 * In-memory filesystems have to fail their
519 * writepage function - and this has to be
520 * worked around in the VM layer..
521 *
522 * We
523 *  - mark the page dirty again (but do NOT
524 *    add it back to the inode dirty list, as
525 *    that would livelock in fdatasync)
526 *  - activate the page so that the page stealer
527 *    doesn't try to write it out over and over
528 *    again.
529 */
530int fail_writepage(struct page *page)
531{
532	/* Only activate on memory-pressure, not fsync.. */
533	if (PageLaunder(page)) {
534		activate_page(page);
535		SetPageReferenced(page);
536	}
537
538	/* Set the page dirty again, unlock */
539	SetPageDirty(page);
540	UnlockPage(page);
541	return 0;
542}
543
544EXPORT_SYMBOL(fail_writepage);
545
546/**
547 *      filemap_fdatasync - walk the list of dirty pages of the given address space
548 *     	and writepage() all of them.
549 *
550 *      @mapping: address space structure to write
551 *
552 */
553int filemap_fdatasync(struct address_space * mapping)
554{
555	int ret = 0;
556	int (*writepage)(struct page *) = mapping->a_ops->writepage;
557
558	spin_lock(&pagecache_lock);
559
560        while (!list_empty(&mapping->dirty_pages)) {
561		struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
562
563		list_del(&page->list);
564		list_add(&page->list, &mapping->locked_pages);
565
566		if (!PageDirty(page))
567			continue;
568
569		page_cache_get(page);
570		spin_unlock(&pagecache_lock);
571
572		lock_page(page);
573
574		if (PageDirty(page)) {
575			int err;
576			ClearPageDirty(page);
577			err = writepage(page);
578			if (err && !ret)
579				ret = err;
580		} else
581			UnlockPage(page);
582
583		page_cache_release(page);
584		spin_lock(&pagecache_lock);
585	}
586	spin_unlock(&pagecache_lock);
587	return ret;
588}
589
590/**
591 *      filemap_fdatawait - walk the list of locked pages of the given address space
592 *     	and wait for all of them.
593 *
594 *      @mapping: address space structure to wait for
595 *
596 */
597int filemap_fdatawait(struct address_space * mapping)
598{
599	int ret = 0;
600
601	spin_lock(&pagecache_lock);
602
603        while (!list_empty(&mapping->locked_pages)) {
604		struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
605
606		list_del(&page->list);
607		list_add(&page->list, &mapping->clean_pages);
608
609		if (!PageLocked(page))
610			continue;
611
612		page_cache_get(page);
613		spin_unlock(&pagecache_lock);
614
615		___wait_on_page(page);
616		if (PageError(page))
617			ret = -EIO;
618
619		page_cache_release(page);
620		spin_lock(&pagecache_lock);
621	}
622	spin_unlock(&pagecache_lock);
623	return ret;
624}
625
626/*
627 * Add a page to the inode page cache.
628 *
629 * The caller must have locked the page and
630 * set all the page flags correctly..
631 */
632void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
633{
634	if (!PageLocked(page))
635		BUG();
636
637	page->index = index;
638	page_cache_get(page);
639	spin_lock(&pagecache_lock);
640	add_page_to_inode_queue(mapping, page);
641	add_page_to_hash_queue(page, page_hash(mapping, index));
642	spin_unlock(&pagecache_lock);
643
644	lru_cache_add(page);
645}
646
647/*
648 * This adds a page to the page cache, starting out as locked,
649 * owned by us, but unreferenced, not uptodate and with no errors.
650 */
651static inline void __add_to_page_cache(struct page * page,
652	struct address_space *mapping, unsigned long offset,
653	struct page **hash)
654{
655	unsigned long flags;
656
657	flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
658	page->flags = flags | (1 << PG_locked);
659	page_cache_get(page);
660	page->index = offset;
661	add_page_to_inode_queue(mapping, page);
662	add_page_to_hash_queue(page, hash);
663}
664
665void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
666{
667	spin_lock(&pagecache_lock);
668	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
669	spin_unlock(&pagecache_lock);
670	lru_cache_add(page);
671}
672
673int add_to_page_cache_unique(struct page * page,
674	struct address_space *mapping, unsigned long offset,
675	struct page **hash)
676{
677	int err;
678	struct page *alias;
679
680	spin_lock(&pagecache_lock);
681	alias = __find_page_nolock(mapping, offset, *hash);
682
683	err = 1;
684	if (!alias) {
685		__add_to_page_cache(page,mapping,offset,hash);
686		err = 0;
687	}
688
689	spin_unlock(&pagecache_lock);
690	if (!err)
691		lru_cache_add(page);
692	return err;
693}
694
695/*
696 * This adds the requested page to the page cache if it isn't already there,
697 * and schedules an I/O to read in its contents from disk.
698 */
699static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
700static int page_cache_read(struct file * file, unsigned long offset)
701{
702	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
703	struct page **hash = page_hash(mapping, offset);
704	struct page *page;
705
706	spin_lock(&pagecache_lock);
707	page = __find_page_nolock(mapping, offset, *hash);
708	spin_unlock(&pagecache_lock);
709	if (page)
710		return 0;
711
712	page = page_cache_alloc(mapping);
713	if (!page)
714		return -ENOMEM;
715
716	if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
717		int error = mapping->a_ops->readpage(file, page);
718		page_cache_release(page);
719		return error;
720	}
721	/*
722	 * We arrive here in the unlikely event that someone
723	 * raced with us and added our page to the cache first.
724	 */
725	page_cache_release(page);
726	return 0;
727}
728
729/*
730 * Read in an entire cluster at once.  A cluster is usually a 64k-
731 * aligned block that includes the page requested in "offset."
732 */
733static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
734					     unsigned long filesize));
735static int read_cluster_nonblocking(struct file * file, unsigned long offset,
736	unsigned long filesize)
737{
738	unsigned long pages = CLUSTER_PAGES;
739
740	offset = CLUSTER_OFFSET(offset);
741	while ((pages-- > 0) && (offset < filesize)) {
742		int error = page_cache_read(file, offset);
743		if (error < 0)
744			return error;
745		offset ++;
746	}
747
748	return 0;
749}
750
751/*
752 * Knuth recommends primes in approximately golden ratio to the maximum
753 * integer representable by a machine word for multiplicative hashing.
754 * Chuck Lever verified the effectiveness of this technique:
755 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
756 *
757 * These primes are chosen to be bit-sparse, that is operations on
758 * them can use shifts and additions instead of multiplications for
759 * machines where multiplications are slow.
760 */
761#if BITS_PER_LONG == 32
762/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
763#define GOLDEN_RATIO_PRIME 0x9e370001UL
764#elif BITS_PER_LONG == 64
765/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
766#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
767#else
768#error Define GOLDEN_RATIO_PRIME for your wordsize.
769#endif
770
771/*
772 * In order to wait for pages to become available there must be
773 * waitqueues associated with pages. By using a hash table of
774 * waitqueues where the bucket discipline is to maintain all
775 * waiters on the same queue and wake all when any of the pages
776 * become available, and for the woken contexts to check to be
777 * sure the appropriate page became available, this saves space
778 * at a cost of "thundering herd" phenomena during rare hash
779 * collisions.
780 */
781static inline wait_queue_head_t *page_waitqueue(struct page *page)
782{
783	const zone_t *zone = page_zone(page);
784	wait_queue_head_t *wait = zone->wait_table;
785	unsigned long hash = (unsigned long)page;
786
787#if BITS_PER_LONG == 64
788	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
789	unsigned long n = hash;
790	n <<= 18;
791	hash -= n;
792	n <<= 33;
793	hash -= n;
794	n <<= 3;
795	hash += n;
796	n <<= 3;
797	hash -= n;
798	n <<= 4;
799	hash += n;
800	n <<= 2;
801	hash += n;
802#else
803	/* On some cpus multiply is faster, on others gcc will do shifts */
804	hash *= GOLDEN_RATIO_PRIME;
805#endif
806	hash >>= zone->wait_table_shift;
807
808	return &wait[hash];
809}
810
811/*
812 * Wait for a page to get unlocked.
813 *
814 * This must be called with the caller "holding" the page,
815 * ie with increased "page->count" so that the page won't
816 * go away during the wait..
817 *
818 * The waiting strategy is to get on a waitqueue determined
819 * by hashing. Waiters will then collide, and the newly woken
820 * task must then determine whether it was woken for the page
821 * it really wanted, and go back to sleep on the waitqueue if
822 * that wasn't it. With the waitqueue semantics, it never leaves
823 * the waitqueue unless it calls, so the loop moves forward one
824 * iteration every time there is
825 * (1) a collision
826 * and
827 * (2) one of the colliding pages is woken
828 *
829 * This is the thundering herd problem, but it is expected to
830 * be very rare due to the few pages that are actually being
831 * waited on at any given time and the quality of the hash function.
832 */
833void ___wait_on_page(struct page *page)
834{
835	wait_queue_head_t *waitqueue = page_waitqueue(page);
836	struct task_struct *tsk = current;
837	DECLARE_WAITQUEUE(wait, tsk);
838
839	add_wait_queue(waitqueue, &wait);
840	do {
841		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
842		if (!PageLocked(page))
843			break;
844		sync_page(page);
845		schedule();
846	} while (PageLocked(page));
847	__set_task_state(tsk, TASK_RUNNING);
848	remove_wait_queue(waitqueue, &wait);
849}
850
851/*
852 * unlock_page() is the other half of the story just above
853 * __wait_on_page(). Here a couple of quick checks are done
854 * and a couple of flags are set on the page, and then all
855 * of the waiters for all of the pages in the appropriate
856 * wait queue are woken.
857 */
858void unlock_page(struct page *page)
859{
860	wait_queue_head_t *waitqueue = page_waitqueue(page);
861	ClearPageLaunder(page);
862	smp_mb__before_clear_bit();
863	if (!test_and_clear_bit(PG_locked, &(page)->flags))
864		BUG();
865	smp_mb__after_clear_bit();
866
867	/*
868	 * Although the default semantics of wake_up() are
869	 * to wake all, here the specific function is used
870	 * to make it even more explicit that a number of
871	 * pages are being waited on here.
872	 */
873	if (waitqueue_active(waitqueue))
874		wake_up_all(waitqueue);
875}
876
877/*
878 * Get a lock on the page, assuming we need to sleep
879 * to get it..
880 */
881static void __lock_page(struct page *page)
882{
883	wait_queue_head_t *waitqueue = page_waitqueue(page);
884	struct task_struct *tsk = current;
885	DECLARE_WAITQUEUE(wait, tsk);
886
887	add_wait_queue_exclusive(waitqueue, &wait);
888	for (;;) {
889		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
890		if (PageLocked(page)) {
891			sync_page(page);
892			schedule();
893		}
894		if (!TryLockPage(page))
895			break;
896	}
897	__set_task_state(tsk, TASK_RUNNING);
898	remove_wait_queue(waitqueue, &wait);
899}
900
901/*
902 * Get an exclusive lock on the page, optimistically
903 * assuming it's not locked..
904 */
905void lock_page(struct page *page)
906{
907	if (TryLockPage(page))
908		__lock_page(page);
909}
910
911/*
912 * a rather lightweight function, finding and getting a reference to a
913 * hashed page atomically.
914 */
915struct page * __find_get_page(struct address_space *mapping,
916			      unsigned long offset, struct page **hash)
917{
918	struct page *page;
919
920	/*
921	 * We scan the hash list read-only. Addition to and removal from
922	 * the hash-list needs a held write-lock.
923	 */
924	spin_lock(&pagecache_lock);
925	page = __find_page_nolock(mapping, offset, *hash);
926	if (page)
927		page_cache_get(page);
928	spin_unlock(&pagecache_lock);
929	return page;
930}
931
932/*
933 * Same as above, but trylock it instead of incrementing the count.
934 */
935struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
936{
937	struct page *page;
938	struct page **hash = page_hash(mapping, offset);
939
940	spin_lock(&pagecache_lock);
941	page = __find_page_nolock(mapping, offset, *hash);
942	if (page) {
943		if (TryLockPage(page))
944			page = NULL;
945	}
946	spin_unlock(&pagecache_lock);
947	return page;
948}
949
950/*
951 * Must be called with the pagecache lock held,
952 * will return with it held (but it may be dropped
953 * during blocking operations..
954 */
955static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
956static struct page * __find_lock_page_helper(struct address_space *mapping,
957					unsigned long offset, struct page *hash)
958{
959	struct page *page;
960
961	/*
962	 * We scan the hash list read-only. Addition to and removal from
963	 * the hash-list needs a held write-lock.
964	 */
965repeat:
966	page = __find_page_nolock(mapping, offset, hash);
967	if (page) {
968		page_cache_get(page);
969		if (TryLockPage(page)) {
970			spin_unlock(&pagecache_lock);
971			lock_page(page);
972			spin_lock(&pagecache_lock);
973
974			/* Has the page been re-allocated while we slept? */
975			if (page->mapping != mapping || page->index != offset) {
976				UnlockPage(page);
977				page_cache_release(page);
978				goto repeat;
979			}
980		}
981	}
982	return page;
983}
984
985/*
986 * Same as the above, but lock the page too, verifying that
987 * it's still valid once we own it.
988 */
989struct page * __find_lock_page (struct address_space *mapping,
990				unsigned long offset, struct page **hash)
991{
992	struct page *page;
993
994	spin_lock(&pagecache_lock);
995	page = __find_lock_page_helper(mapping, offset, *hash);
996	spin_unlock(&pagecache_lock);
997	return page;
998}
999
1000/*
1001 * Same as above, but create the page if required..
1002 */
1003struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
1004{
1005	struct page *page;
1006	struct page **hash = page_hash(mapping, index);
1007
1008	spin_lock(&pagecache_lock);
1009	page = __find_lock_page_helper(mapping, index, *hash);
1010	spin_unlock(&pagecache_lock);
1011	if (!page) {
1012		struct page *newpage = alloc_page(gfp_mask);
1013		if (newpage) {
1014			spin_lock(&pagecache_lock);
1015			page = __find_lock_page_helper(mapping, index, *hash);
1016			if (likely(!page)) {
1017				page = newpage;
1018				__add_to_page_cache(page, mapping, index, hash);
1019				newpage = NULL;
1020			}
1021			spin_unlock(&pagecache_lock);
1022			if (newpage == NULL)
1023				lru_cache_add(page);
1024			else
1025				page_cache_release(newpage);
1026		}
1027	}
1028	return page;
1029}
1030
1031/*
1032 * Same as grab_cache_page, but do not wait if the page is unavailable.
1033 * This is intended for speculative data generators, where the data can
1034 * be regenerated if the page couldn't be grabbed.  This routine should
1035 * be safe to call while holding the lock for another page.
1036 */
1037struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
1038{
1039	struct page *page, **hash;
1040
1041	hash = page_hash(mapping, index);
1042	page = __find_get_page(mapping, index, hash);
1043
1044	if ( page ) {
1045		if ( !TryLockPage(page) ) {
1046			/* Page found and locked */
1047			/* This test is overly paranoid, but what the heck... */
1048			if ( unlikely(page->mapping != mapping || page->index != index) ) {
1049				/* Someone reallocated this page under us. */
1050				UnlockPage(page);
1051				page_cache_release(page);
1052				return NULL;
1053			} else {
1054				return page;
1055			}
1056		} else {
1057			/* Page locked by someone else */
1058			page_cache_release(page);
1059			return NULL;
1060		}
1061	}
1062
1063	page = page_cache_alloc(mapping);
1064	if ( unlikely(!page) )
1065		return NULL;	/* Failed to allocate a page */
1066
1067	if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
1068		/* Someone else grabbed the page already. */
1069		page_cache_release(page);
1070		return NULL;
1071	}
1072
1073	return page;
1074}
1075
1076
1077/*
1078 * Read-ahead profiling information
1079 * --------------------------------
1080 * Every PROFILE_MAXREADCOUNT, the following information is written
1081 * to the syslog:
1082 *   Percentage of asynchronous read-ahead.
1083 *   Average of read-ahead fields context value.
1084 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
1085 * to the syslog.
1086 */
1087
1088#ifdef PROFILE_READAHEAD
1089
1090#define PROFILE_MAXREADCOUNT 1000
1091
1092static unsigned long total_reada;
1093static unsigned long total_async;
1094static unsigned long total_ramax;
1095static unsigned long total_ralen;
1096static unsigned long total_rawin;
1097
1098static void profile_readahead(int async, struct file *filp)
1099{
1100	unsigned long flags;
1101
1102	++total_reada;
1103	if (async)
1104		++total_async;
1105
1106	total_ramax	+= filp->f_ramax;
1107	total_ralen	+= filp->f_ralen;
1108	total_rawin	+= filp->f_rawin;
1109
1110	if (total_reada > PROFILE_MAXREADCOUNT) {
1111		save_flags(flags);
1112		cli();
1113		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
1114			restore_flags(flags);
1115			return;
1116		}
1117
1118		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
1119			total_ramax/total_reada,
1120			total_ralen/total_reada,
1121			total_rawin/total_reada,
1122			(total_async*100)/total_reada);
1123#ifdef DEBUG_READAHEAD
1124		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
1125			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
1126#endif
1127
1128		total_reada	= 0;
1129		total_async	= 0;
1130		total_ramax	= 0;
1131		total_ralen	= 0;
1132		total_rawin	= 0;
1133
1134		restore_flags(flags);
1135	}
1136}
1137#endif  /* defined PROFILE_READAHEAD */
1138
1139/*
1140 * Read-ahead context:
1141 * -------------------
1142 * The read ahead context fields of the "struct file" are the following:
1143 * - f_raend : position of the first byte after the last page we tried to
1144 *	       read ahead.
1145 * - f_ramax : current read-ahead maximum size.
1146 * - f_ralen : length of the current IO read block we tried to read-ahead.
1147 * - f_rawin : length of the current read-ahead window.
1148 *		if last read-ahead was synchronous then
1149 *			f_rawin = f_ralen
1150 *		otherwise (was asynchronous)
1151 *			f_rawin = previous value of f_ralen + f_ralen
1152 *
1153 * Read-ahead limits:
1154 * ------------------
1155 * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
1156 * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
1157 *
1158 * Synchronous read-ahead benefits:
1159 * --------------------------------
1160 * Using reasonable IO xfer length from peripheral devices increase system
1161 * performances.
1162 * Reasonable means, in this context, not too large but not too small.
1163 * The actual maximum value is:
1164 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined
1165 *      and 32K if defined (4K page size assumed).
1166 *
1167 * Asynchronous read-ahead benefits:
1168 * ---------------------------------
1169 * Overlapping next read request and user process execution increase system
1170 * performance.
1171 *
1172 * Read-ahead risks:
1173 * -----------------
1174 * We have to guess which further data are needed by the user process.
1175 * If these data are often not really needed, it's bad for system
1176 * performances.
1177 * However, we know that files are often accessed sequentially by
1178 * application programs and it seems that it is possible to have some good
1179 * strategy in that guessing.
1180 * We only try to read-ahead files that seems to be read sequentially.
1181 *
1182 * Asynchronous read-ahead risks:
1183 * ------------------------------
1184 * In order to maximize overlapping, we must start some asynchronous read
1185 * request from the device, as soon as possible.
1186 * We must be very careful about:
1187 * - The number of effective pending IO read requests.
1188 *   ONE seems to be the only reasonable value.
1189 * - The total memory pool usage for the file access stream.
1190 *   This maximum memory usage is implicitly 2 IO read chunks:
1191 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
1192 *   64k if defined (4K page size assumed).
1193 */
1194
1195static inline int get_max_readahead(struct inode * inode)
1196{
1197	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
1198		return vm_max_readahead;
1199	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
1200}
1201
1202static void generic_file_readahead(int reada_ok,
1203	struct file * filp, struct inode * inode,
1204	struct page * page)
1205{
1206	unsigned long end_index;
1207	unsigned long index = page->index;
1208	unsigned long max_ahead, ahead;
1209	unsigned long raend;
1210	int max_readahead = get_max_readahead(inode);
1211
1212	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1213
1214	raend = filp->f_raend;
1215	max_ahead = 0;
1216
1217/*
1218 * The current page is locked.
1219 * If the current position is inside the previous read IO request, do not
1220 * try to reread previously read ahead pages.
1221 * Otherwise decide or not to read ahead some pages synchronously.
1222 * If we are not going to read ahead, set the read ahead context for this
1223 * page only.
1224 */
1225	if (PageLocked(page)) {
1226		if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
1227			raend = index;
1228			if (raend < end_index)
1229				max_ahead = filp->f_ramax;
1230			filp->f_rawin = 0;
1231			filp->f_ralen = 1;
1232			if (!max_ahead) {
1233				filp->f_raend  = index + filp->f_ralen;
1234				filp->f_rawin += filp->f_ralen;
1235			}
1236		}
1237	}
1238/*
1239 * The current page is not locked.
1240 * If we were reading ahead and,
1241 * if the current max read ahead size is not zero and,
1242 * if the current position is inside the last read-ahead IO request,
1243 *   it is the moment to try to read ahead asynchronously.
1244 * We will later force unplug device in order to force asynchronous read IO.
1245 */
1246	else if (reada_ok && filp->f_ramax && raend >= 1 &&
1247		 index <= raend && index + filp->f_ralen >= raend) {
1248/*
1249 * Add ONE page to max_ahead in order to try to have about the same IO max size
1250 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
1251 * Compute the position of the last page we have tried to read in order to
1252 * begin to read ahead just at the next page.
1253 */
1254		raend -= 1;
1255		if (raend < end_index)
1256			max_ahead = filp->f_ramax + 1;
1257
1258		if (max_ahead) {
1259			filp->f_rawin = filp->f_ralen;
1260			filp->f_ralen = 0;
1261			reada_ok      = 2;
1262		}
1263	}
1264/*
1265 * Try to read ahead pages.
1266 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
1267 * scheduler, will work enough for us to avoid too bad actuals IO requests.
1268 */
1269	ahead = 0;
1270	while (ahead < max_ahead) {
1271		ahead ++;
1272		if ((raend + ahead) >= end_index)
1273			break;
1274		if (page_cache_read(filp, raend + ahead) < 0)
1275			break;
1276	}
1277/*
1278 * If we tried to read ahead some pages,
1279 * If we tried to read ahead asynchronously,
1280 *   Try to force unplug of the device in order to start an asynchronous
1281 *   read IO request.
1282 * Update the read-ahead context.
1283 * Store the length of the current read-ahead window.
1284 * Double the current max read ahead size.
1285 *   That heuristic avoid to do some large IO for files that are not really
1286 *   accessed sequentially.
1287 */
1288	if (ahead) {
1289		filp->f_ralen += ahead;
1290		filp->f_rawin += filp->f_ralen;
1291		filp->f_raend = raend + ahead + 1;
1292
1293		filp->f_ramax += filp->f_ramax;
1294
1295		if (filp->f_ramax > max_readahead)
1296			filp->f_ramax = max_readahead;
1297
1298#ifdef PROFILE_READAHEAD
1299		profile_readahead((reada_ok == 2), filp);
1300#endif
1301	}
1302
1303	return;
1304}
1305
1306/*
1307 * Mark a page as having seen activity.
1308 *
1309 * If it was already so marked, move it to the active queue and drop
1310 * the referenced bit.  Otherwise, just mark it for future action..
1311 */
1312void mark_page_accessed(struct page *page)
1313{
1314	if (!PageActive(page) && PageReferenced(page)) {
1315		activate_page(page);
1316		ClearPageReferenced(page);
1317	} else
1318		SetPageReferenced(page);
1319}
1320
1321/*
1322 * This is a generic file read routine, and uses the
1323 * inode->i_op->readpage() function for the actual low-level
1324 * stuff.
1325 *
1326 * This is really ugly. But the goto's actually try to clarify some
1327 * of the logic when it comes to error handling etc.
1328 */
1329void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
1330{
1331	struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1332	struct inode *inode = mapping->host;
1333	unsigned long index, offset;
1334	struct page *cached_page;
1335	int reada_ok;
1336	int error;
1337	int max_readahead = get_max_readahead(inode);
1338
1339	cached_page = NULL;
1340	index = *ppos >> PAGE_CACHE_SHIFT;
1341	offset = *ppos & ~PAGE_CACHE_MASK;
1342
1343/*
1344 * If the current position is outside the previous read-ahead window,
1345 * we reset the current read-ahead context and set read ahead max to zero
1346 * (will be set to just needed value later),
1347 * otherwise, we assume that the file accesses are sequential enough to
1348 * continue read-ahead.
1349 */
1350	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1351		reada_ok = 0;
1352		filp->f_raend = 0;
1353		filp->f_ralen = 0;
1354		filp->f_ramax = 0;
1355		filp->f_rawin = 0;
1356	} else {
1357		reada_ok = 1;
1358	}
1359/*
1360 * Adjust the current value of read-ahead max.
1361 * If the read operation stay in the first half page, force no readahead.
1362 * Otherwise try to increase read ahead max just enough to do the read request.
1363 * Then, at least MIN_READAHEAD if read ahead is ok,
1364 * and at most MAX_READAHEAD in all cases.
1365 */
1366	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1367		filp->f_ramax = 0;
1368	} else {
1369		unsigned long needed;
1370
1371		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1372
1373		if (filp->f_ramax < needed)
1374			filp->f_ramax = needed;
1375
1376		if (reada_ok && filp->f_ramax < vm_min_readahead)
1377				filp->f_ramax = vm_min_readahead;
1378		if (filp->f_ramax > max_readahead)
1379			filp->f_ramax = max_readahead;
1380	}
1381
1382	for (;;) {
1383		struct page *page, **hash;
1384		unsigned long end_index, nr, ret;
1385
1386		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1387
1388		if (index > end_index)
1389			break;
1390		nr = PAGE_CACHE_SIZE;
1391		if (index == end_index) {
1392			nr = inode->i_size & ~PAGE_CACHE_MASK;
1393			if (nr <= offset)
1394				break;
1395		}
1396
1397		nr = nr - offset;
1398
1399		/*
1400		 * Try to find the data in the page cache..
1401		 */
1402		hash = page_hash(mapping, index);
1403
1404		spin_lock(&pagecache_lock);
1405		page = __find_page_nolock(mapping, index, *hash);
1406		if (!page)
1407			goto no_cached_page;
1408found_page:
1409		page_cache_get(page);
1410		spin_unlock(&pagecache_lock);
1411
1412		if (!Page_Uptodate(page))
1413			goto page_not_up_to_date;
1414		generic_file_readahead(reada_ok, filp, inode, page);
1415page_ok:
1416		/* If users can be writing to this page using arbitrary
1417		 * virtual addresses, take care about potential aliasing
1418		 * before reading the page on the kernel side.
1419		 */
1420		if (mapping->i_mmap_shared != NULL)
1421			flush_dcache_page(page);
1422
1423		/*
1424		 * Mark the page accessed if we read the
1425		 * beginning or we just did an lseek.
1426		 */
1427		if (!offset || !filp->f_reada)
1428			mark_page_accessed(page);
1429
1430		/*
1431		 * Ok, we have the page, and it's up-to-date, so
1432		 * now we can copy it to user space...
1433		 *
1434		 * The actor routine returns how many bytes were actually used..
1435		 * NOTE! This may not be the same as how much of a user buffer
1436		 * we filled up (we may be padding etc), so we can only update
1437		 * "pos" here (the actor routine has to update the user buffer
1438		 * pointers and the remaining count).
1439		 */
1440		ret = actor(desc, page, offset, nr);
1441		offset += ret;
1442		index += offset >> PAGE_CACHE_SHIFT;
1443		offset &= ~PAGE_CACHE_MASK;
1444
1445		page_cache_release(page);
1446		if (ret == nr && desc->count)
1447			continue;
1448		break;
1449
1450/*
1451 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1452 */
1453page_not_up_to_date:
1454		generic_file_readahead(reada_ok, filp, inode, page);
1455
1456		if (Page_Uptodate(page))
1457			goto page_ok;
1458
1459		/* Get exclusive access to the page ... */
1460		lock_page(page);
1461
1462		/* Did it get unhashed before we got the lock? */
1463		if (!page->mapping) {
1464			UnlockPage(page);
1465			page_cache_release(page);
1466			continue;
1467		}
1468
1469		/* Did somebody else fill it already? */
1470		if (Page_Uptodate(page)) {
1471			UnlockPage(page);
1472			goto page_ok;
1473		}
1474
1475readpage:
1476		/* ... and start the actual read. The read will unlock the page. */
1477		error = mapping->a_ops->readpage(filp, page);
1478
1479		if (!error) {
1480			if (Page_Uptodate(page))
1481				goto page_ok;
1482
1483			/* Again, try some read-ahead while waiting for the page to finish.. */
1484			generic_file_readahead(reada_ok, filp, inode, page);
1485			wait_on_page(page);
1486			if (Page_Uptodate(page))
1487				goto page_ok;
1488			error = -EIO;
1489		}
1490
1491		/* UHHUH! A synchronous read error occurred. Report it */
1492		desc->error = error;
1493		page_cache_release(page);
1494		break;
1495
1496no_cached_page:
1497		/*
1498		 * Ok, it wasn't cached, so we need to create a new
1499		 * page..
1500		 *
1501		 * We get here with the page cache lock held.
1502		 */
1503		if (!cached_page) {
1504			spin_unlock(&pagecache_lock);
1505			cached_page = page_cache_alloc(mapping);
1506			if (!cached_page) {
1507				desc->error = -ENOMEM;
1508				break;
1509			}
1510
1511			/*
1512			 * Somebody may have added the page while we
1513			 * dropped the page cache lock. Check for that.
1514			 */
1515			spin_lock(&pagecache_lock);
1516			page = __find_page_nolock(mapping, index, *hash);
1517			if (page)
1518				goto found_page;
1519		}
1520
1521		/*
1522		 * Ok, add the new page to the hash-queues...
1523		 */
1524		page = cached_page;
1525		__add_to_page_cache(page, mapping, index, hash);
1526		spin_unlock(&pagecache_lock);
1527		lru_cache_add(page);
1528		cached_page = NULL;
1529
1530		goto readpage;
1531	}
1532
1533	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1534	filp->f_reada = 1;
1535	if (cached_page)
1536		page_cache_release(cached_page);
1537	UPDATE_ATIME(inode);
1538}
1539
1540static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
1541{
1542	ssize_t retval;
1543	int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
1544	struct kiobuf * iobuf;
1545	struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1546	struct inode * inode = mapping->host;
1547	loff_t size = inode->i_size;
1548
1549	new_iobuf = 0;
1550	iobuf = filp->f_iobuf;
1551	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
1552		/*
1553		 * A parallel read/write is using the preallocated iobuf
1554		 * so just run slow and allocate a new one.
1555		 */
1556		retval = alloc_kiovec(1, &iobuf);
1557		if (retval)
1558			goto out;
1559		new_iobuf = 1;
1560	}
1561
1562	blocksize = 1 << inode->i_blkbits;
1563	blocksize_bits = inode->i_blkbits;
1564	blocksize_mask = blocksize - 1;
1565	chunk_size = KIO_MAX_ATOMIC_IO << 10;
1566
1567	retval = -EINVAL;
1568	if ((offset & blocksize_mask) || (count & blocksize_mask))
1569		goto out_free;
1570	if (!mapping->a_ops->direct_IO)
1571		goto out_free;
1572
1573	if ((rw == READ) && (offset + count > size))
1574		count = size - offset;
1575
1576	/*
1577	 * Flush to disk exclusively the _data_, metadata must remain
1578	 * completly asynchronous or performance will go to /dev/null.
1579	 */
1580	retval = filemap_fdatasync(mapping);
1581	if (retval == 0)
1582		retval = fsync_inode_data_buffers(inode);
1583	if (retval == 0)
1584		retval = filemap_fdatawait(mapping);
1585	if (retval < 0)
1586		goto out_free;
1587
1588	progress = retval = 0;
1589	while (count > 0) {
1590		iosize = count;
1591		if (iosize > chunk_size)
1592			iosize = chunk_size;
1593
1594		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
1595		if (retval)
1596			break;
1597
1598		retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
1599
1600		if (rw == READ && retval > 0)
1601			mark_dirty_kiobuf(iobuf, retval);
1602
1603		if (retval >= 0) {
1604			count -= retval;
1605			buf += retval;
1606			/* warning: weird semantics here, we're reporting a read behind the end of the file */
1607			progress += retval;
1608		}
1609
1610		unmap_kiobuf(iobuf);
1611
1612		if (retval != iosize)
1613			break;
1614	}
1615
1616	if (progress)
1617		retval = progress;
1618
1619 out_free:
1620	if (!new_iobuf)
1621		clear_bit(0, &filp->f_iobuf_lock);
1622	else
1623		free_kiovec(1, &iobuf);
1624 out:
1625	return retval;
1626}
1627
1628int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1629{
1630	char *kaddr;
1631	unsigned long left, count = desc->count;
1632
1633	if (size > count)
1634		size = count;
1635
1636	kaddr = kmap(page);
1637	left = __copy_to_user(desc->buf, kaddr + offset, size);
1638	kunmap(page);
1639
1640	if (left) {
1641		size -= left;
1642		desc->error = -EFAULT;
1643	}
1644	desc->count = count - size;
1645	desc->written += size;
1646	desc->buf += size;
1647	return size;
1648}
1649
1650/*
1651 * This is the "read()" routine for all filesystems
1652 * that can use the page cache directly.
1653 */
1654ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1655{
1656	ssize_t retval;
1657
1658	if ((ssize_t) count < 0)
1659		return -EINVAL;
1660
1661	if (filp->f_flags & O_DIRECT)
1662		goto o_direct;
1663
1664	retval = -EFAULT;
1665	if (access_ok(VERIFY_WRITE, buf, count)) {
1666		retval = 0;
1667
1668		if (count) {
1669			read_descriptor_t desc;
1670
1671			desc.written = 0;
1672			desc.count = count;
1673			desc.buf = buf;
1674			desc.error = 0;
1675			do_generic_file_read(filp, ppos, &desc, file_read_actor);
1676
1677			retval = desc.written;
1678			if (!retval)
1679				retval = desc.error;
1680		}
1681	}
1682 out:
1683	return retval;
1684
1685 o_direct:
1686	{
1687		loff_t pos = *ppos, size;
1688		struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1689		struct inode *inode = mapping->host;
1690
1691		retval = 0;
1692		if (!count)
1693			goto out; /* skip atime */
1694		size = inode->i_size;
1695		if (pos < size) {
1696			retval = generic_file_direct_IO(READ, filp, buf, count, pos);
1697			if (retval > 0)
1698				*ppos = pos + retval;
1699		}
1700		UPDATE_ATIME(filp->f_dentry->d_inode);
1701		goto out;
1702	}
1703}
1704
1705static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1706{
1707	ssize_t written;
1708	unsigned long count = desc->count;
1709	struct file *file = (struct file *) desc->buf;
1710
1711	if (size > count)
1712		size = count;
1713
1714 	if (file->f_op->sendpage) {
1715 		written = file->f_op->sendpage(file, page, offset,
1716					       size, &file->f_pos, size<count);
1717	} else {
1718		char *kaddr;
1719		mm_segment_t old_fs;
1720
1721		old_fs = get_fs();
1722		set_fs(KERNEL_DS);
1723
1724		kaddr = kmap(page);
1725		written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1726		kunmap(page);
1727
1728		set_fs(old_fs);
1729	}
1730	if (written < 0) {
1731		desc->error = written;
1732		written = 0;
1733	}
1734	desc->count = count - written;
1735	desc->written += written;
1736	return written;
1737}
1738
1739asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1740{
1741	ssize_t retval;
1742	struct file * in_file, * out_file;
1743	struct inode * in_inode, * out_inode;
1744
1745	/*
1746	 * Get input file, and verify that it is ok..
1747	 */
1748	retval = -EBADF;
1749	in_file = fget(in_fd);
1750	if (!in_file)
1751		goto out;
1752	if (!(in_file->f_mode & FMODE_READ))
1753		goto fput_in;
1754	retval = -EINVAL;
1755	in_inode = in_file->f_dentry->d_inode;
1756	if (!in_inode)
1757		goto fput_in;
1758	if (!in_inode->i_mapping->a_ops->readpage)
1759		goto fput_in;
1760	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
1761	if (retval)
1762		goto fput_in;
1763
1764	/*
1765	 * Get output file, and verify that it is ok..
1766	 */
1767	retval = -EBADF;
1768	out_file = fget(out_fd);
1769	if (!out_file)
1770		goto fput_in;
1771	if (!(out_file->f_mode & FMODE_WRITE))
1772		goto fput_out;
1773	retval = -EINVAL;
1774	if (!out_file->f_op || !out_file->f_op->write)
1775		goto fput_out;
1776	out_inode = out_file->f_dentry->d_inode;
1777	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
1778	if (retval)
1779		goto fput_out;
1780
1781	retval = 0;
1782	if (count) {
1783		read_descriptor_t desc;
1784		loff_t pos = 0, *ppos;
1785
1786		retval = -EFAULT;
1787		ppos = &in_file->f_pos;
1788		if (offset) {
1789			if (get_user(pos, offset))
1790				goto fput_out;
1791			ppos = &pos;
1792		}
1793
1794		desc.written = 0;
1795		desc.count = count;
1796		desc.buf = (char *) out_file;
1797		desc.error = 0;
1798		do_generic_file_read(in_file, ppos, &desc, file_send_actor);
1799
1800		retval = desc.written;
1801		if (!retval)
1802			retval = desc.error;
1803		if (offset)
1804			put_user(pos, offset);
1805	}
1806
1807fput_out:
1808	fput(out_file);
1809fput_in:
1810	fput(in_file);
1811out:
1812	return retval;
1813}
1814
1815static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
1816{
1817	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1818	unsigned long max;
1819
1820	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1821		return -EINVAL;
1822
1823	/* Limit it to the size of the file.. */
1824	max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
1825	if (index > max)
1826		return 0;
1827	max -= index;
1828	if (nr > max)
1829		nr = max;
1830
1831	/* And limit it to a sane percentage of the inactive list.. */
1832	max = nr_inactive_pages / 2;
1833	if (nr > max)
1834		nr = max;
1835
1836	while (nr) {
1837		page_cache_read(file, index);
1838		index++;
1839		nr--;
1840	}
1841	return 0;
1842}
1843
1844asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1845{
1846	ssize_t ret;
1847	struct file *file;
1848
1849	ret = -EBADF;
1850	file = fget(fd);
1851	if (file) {
1852		if (file->f_mode & FMODE_READ) {
1853			unsigned long start = offset >> PAGE_CACHE_SHIFT;
1854			unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
1855			ret = do_readahead(file, start, len);
1856		}
1857		fput(file);
1858	}
1859	return ret;
1860}
1861
1862/*
1863 * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
1864 * sure this is sequential access, we don't need a flexible read-ahead
1865 * window size -- we can always use a large fixed size window.
1866 */
1867static void nopage_sequential_readahead(struct vm_area_struct * vma,
1868	unsigned long pgoff, unsigned long filesize)
1869{
1870	unsigned long ra_window;
1871
1872	ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
1873	ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
1874
1875	/* vm_raend is zero if we haven't read ahead in this area yet.  */
1876	if (vma->vm_raend == 0)
1877		vma->vm_raend = vma->vm_pgoff + ra_window;
1878
1879	/*
1880	 * If we've just faulted the page half-way through our window,
1881	 * then schedule reads for the next window, and release the
1882	 * pages in the previous window.
1883	 */
1884	if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
1885		unsigned long start = vma->vm_pgoff + vma->vm_raend;
1886		unsigned long end = start + ra_window;
1887
1888		if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
1889			end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
1890		if (start > end)
1891			return;
1892
1893		while ((start < end) && (start < filesize)) {
1894			if (read_cluster_nonblocking(vma->vm_file,
1895							start, filesize) < 0)
1896				break;
1897			start += CLUSTER_PAGES;
1898		}
1899		run_task_queue(&tq_disk);
1900
1901		/* if we're far enough past the beginning of this area,
1902		   recycle pages that are in the previous window. */
1903		if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
1904			unsigned long window = ra_window << PAGE_SHIFT;
1905
1906			end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
1907			end -= window + window;
1908			filemap_sync(vma, end - window, window, MS_INVALIDATE);
1909		}
1910
1911		vma->vm_raend += ra_window;
1912	}
1913
1914	return;
1915}
1916
1917/*
1918 * filemap_nopage() is invoked via the vma operations vector for a
1919 * mapped memory region to read in file data during a page fault.
1920 *
1921 * The goto's are kind of ugly, but this streamlines the normal case of having
1922 * it in the page cache, and handles the special cases reasonably without
1923 * having a lot of duplicated code.
1924 */
1925struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
1926{
1927	int error;
1928	struct file *file = area->vm_file;
1929	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1930	struct inode *inode = mapping->host;
1931	struct page *page, **hash;
1932	unsigned long size, pgoff, endoff;
1933
1934	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1935	endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
1936
1937retry_all:
1938	/*
1939	 * An external ptracer can access pages that normally aren't
1940	 * accessible..
1941	 */
1942	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1943	if ((pgoff >= size) && (area->vm_mm == current->mm))
1944		return NULL;
1945
1946	/* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
1947	if (size > endoff)
1948		size = endoff;
1949
1950	/*
1951	 * Do we have something in the page cache already?
1952	 */
1953	hash = page_hash(mapping, pgoff);
1954retry_find:
1955	page = __find_get_page(mapping, pgoff, hash);
1956	if (!page)
1957		goto no_cached_page;
1958
1959	/*
1960	 * Ok, found a page in the page cache, now we need to check
1961	 * that it's up-to-date.
1962	 */
1963	if (!Page_Uptodate(page))
1964		goto page_not_uptodate;
1965
1966success:
1967 	/*
1968	 * Try read-ahead for sequential areas.
1969	 */
1970	if (VM_SequentialReadHint(area))
1971		nopage_sequential_readahead(area, pgoff, size);
1972
1973	/*
1974	 * Found the page and have a reference on it, need to check sharing
1975	 * and possibly copy it over to another page..
1976	 */
1977	mark_page_accessed(page);
1978	flush_page_to_ram(page);
1979	return page;
1980
1981no_cached_page:
1982	/*
1983	 * If the requested offset is within our file, try to read a whole
1984	 * cluster of pages at once.
1985	 *
1986	 * Otherwise, we're off the end of a privately mapped file,
1987	 * so we need to map a zero page.
1988	 */
1989	if ((pgoff < size) && !VM_RandomReadHint(area))
1990		error = read_cluster_nonblocking(file, pgoff, size);
1991	else
1992		error = page_cache_read(file, pgoff);
1993
1994	/*
1995	 * The page we want has now been added to the page cache.
1996	 * In the unlikely event that someone removed it in the
1997	 * meantime, we'll just come back here and read it again.
1998	 */
1999	if (error >= 0)
2000		goto retry_find;
2001
2002	/*
2003	 * An error return from page_cache_read can result if the
2004	 * system is low on memory, or a problem occurs while trying
2005	 * to schedule I/O.
2006	 */
2007	if (error == -ENOMEM)
2008		return NOPAGE_OOM;
2009	return NULL;
2010
2011page_not_uptodate:
2012	lock_page(page);
2013
2014	/* Did it get unhashed while we waited for it? */
2015	if (!page->mapping) {
2016		UnlockPage(page);
2017		page_cache_release(page);
2018		goto retry_all;
2019	}
2020
2021	/* Did somebody else get it up-to-date? */
2022	if (Page_Uptodate(page)) {
2023		UnlockPage(page);
2024		goto success;
2025	}
2026
2027	if (!mapping->a_ops->readpage(file, page)) {
2028		wait_on_page(page);
2029		if (Page_Uptodate(page))
2030			goto success;
2031	}
2032
2033	/*
2034	 * Umm, take care of errors if the page isn't up-to-date.
2035	 * Try to re-read it _once_. We do this synchronously,
2036	 * because there really aren't any performance issues here
2037	 * and we need to check for errors.
2038	 */
2039	lock_page(page);
2040
2041	/* Somebody truncated the page on us? */
2042	if (!page->mapping) {
2043		UnlockPage(page);
2044		page_cache_release(page);
2045		goto retry_all;
2046	}
2047
2048	/* Somebody else successfully read it in? */
2049	if (Page_Uptodate(page)) {
2050		UnlockPage(page);
2051		goto success;
2052	}
2053	ClearPageError(page);
2054	if (!mapping->a_ops->readpage(file, page)) {
2055		wait_on_page(page);
2056		if (Page_Uptodate(page))
2057			goto success;
2058	}
2059
2060	/*
2061	 * Things didn't work out. Return zero to tell the
2062	 * mm layer so, possibly freeing the page cache page first.
2063	 */
2064	page_cache_release(page);
2065	return NULL;
2066}
2067
2068/* Called with mm->page_table_lock held to protect against other
2069 * threads/the swapper from ripping pte's out from under us.
2070 */
2071static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
2072	unsigned long address, unsigned int flags)
2073{
2074	pte_t pte = *ptep;
2075
2076	if (pte_present(pte)) {
2077		struct page *page = pte_page(pte);
2078		if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
2079			flush_tlb_page(vma, address);
2080			set_page_dirty(page);
2081		}
2082	}
2083	return 0;
2084}
2085
2086static inline int filemap_sync_pte_range(pmd_t * pmd,
2087	unsigned long address, unsigned long size,
2088	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
2089{
2090	pte_t * pte;
2091	unsigned long end;
2092	int error;
2093
2094	if (pmd_none(*pmd))
2095		return 0;
2096	if (pmd_bad(*pmd)) {
2097		pmd_ERROR(*pmd);
2098		pmd_clear(pmd);
2099		return 0;
2100	}
2101	pte = pte_offset(pmd, address);
2102	offset += address & PMD_MASK;
2103	address &= ~PMD_MASK;
2104	end = address + size;
2105	if (end > PMD_SIZE)
2106		end = PMD_SIZE;
2107	error = 0;
2108	do {
2109		error |= filemap_sync_pte(pte, vma, address + offset, flags);
2110		address += PAGE_SIZE;
2111		pte++;
2112	} while (address && (address < end));
2113	return error;
2114}
2115
2116static inline int filemap_sync_pmd_range(pgd_t * pgd,
2117	unsigned long address, unsigned long size,
2118	struct vm_area_struct *vma, unsigned int flags)
2119{
2120	pmd_t * pmd;
2121	unsigned long offset, end;
2122	int error;
2123
2124	if (pgd_none(*pgd))
2125		return 0;
2126	if (pgd_bad(*pgd)) {
2127		pgd_ERROR(*pgd);
2128		pgd_clear(pgd);
2129		return 0;
2130	}
2131	pmd = pmd_offset(pgd, address);
2132	offset = address & PGDIR_MASK;
2133	address &= ~PGDIR_MASK;
2134	end = address + size;
2135	if (end > PGDIR_SIZE)
2136		end = PGDIR_SIZE;
2137	error = 0;
2138	do {
2139		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
2140		address = (address + PMD_SIZE) & PMD_MASK;
2141		pmd++;
2142	} while (address && (address < end));
2143	return error;
2144}
2145
2146int filemap_sync(struct vm_area_struct * vma, unsigned long address,
2147	size_t size, unsigned int flags)
2148{
2149	pgd_t * dir;
2150	unsigned long end = address + size;
2151	int error = 0;
2152
2153	/* Aquire the lock early; it may be possible to avoid dropping
2154	 * and reaquiring it repeatedly.
2155	 */
2156	spin_lock(&vma->vm_mm->page_table_lock);
2157
2158	dir = pgd_offset(vma->vm_mm, address);
2159	flush_cache_range(vma->vm_mm, end - size, end);
2160	if (address >= end)
2161		BUG();
2162	do {
2163		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
2164		address = (address + PGDIR_SIZE) & PGDIR_MASK;
2165		dir++;
2166	} while (address && (address < end));
2167	flush_tlb_range(vma->vm_mm, end - size, end);
2168
2169	spin_unlock(&vma->vm_mm->page_table_lock);
2170
2171	return error;
2172}
2173
2174static struct vm_operations_struct generic_file_vm_ops = {
2175	nopage:		filemap_nopage,
2176};
2177
2178/* This is used for a general mmap of a disk file */
2179
2180int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2181{
2182	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2183	struct inode *inode = mapping->host;
2184
2185	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
2186		if (!mapping->a_ops->writepage)
2187			return -EINVAL;
2188	}
2189	if (!mapping->a_ops->readpage)
2190		return -ENOEXEC;
2191	UPDATE_ATIME(inode);
2192	vma->vm_ops = &generic_file_vm_ops;
2193	return 0;
2194}
2195
2196/*
2197 * The msync() system call.
2198 */
2199
2200/*
2201 * MS_SYNC syncs the entire file - including mappings.
2202 *
2203 * MS_ASYNC initiates writeout of just the dirty mapped data.
2204 * This provides no guarantee of file integrity - things like indirect
2205 * blocks may not have started writeout.  MS_ASYNC is primarily useful
2206 * where the application knows that it has finished with the data and
2207 * wishes to intelligently schedule its own I/O traffic.
2208 */
2209static int msync_interval(struct vm_area_struct * vma,
2210	unsigned long start, unsigned long end, int flags)
2211{
2212	int ret = 0;
2213	struct file * file = vma->vm_file;
2214
2215	if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) )
2216		return -EBUSY;
2217
2218	if (file && (vma->vm_flags & VM_SHARED)) {
2219		ret = filemap_sync(vma, start, end-start, flags);
2220
2221		if (!ret && (flags & (MS_SYNC|MS_ASYNC))) {
2222			struct inode * inode = file->f_dentry->d_inode;
2223
2224			down(&inode->i_sem);
2225			ret = filemap_fdatasync(inode->i_mapping);
2226			if (flags & MS_SYNC) {
2227				int err;
2228
2229				if (file->f_op && file->f_op->fsync) {
2230					err = file->f_op->fsync(file, file->f_dentry, 1);
2231					if (err && !ret)
2232						ret = err;
2233				}
2234				err = filemap_fdatawait(inode->i_mapping);
2235				if (err && !ret)
2236					ret = err;
2237			}
2238			up(&inode->i_sem);
2239		}
2240	}
2241	return ret;
2242}
2243
2244asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
2245{
2246	unsigned long end;
2247	struct vm_area_struct * vma;
2248	int unmapped_error, error = -EINVAL;
2249
2250	down_read(&current->mm->mmap_sem);
2251	if (start & ~PAGE_MASK)
2252		goto out;
2253	len = (len + ~PAGE_MASK) & PAGE_MASK;
2254	end = start + len;
2255	if (end < start)
2256		goto out;
2257	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
2258		goto out;
2259	if ((flags & MS_ASYNC) && (flags & MS_SYNC))
2260		goto out;
2261
2262	error = 0;
2263	if (end == start)
2264		goto out;
2265	/*
2266	 * If the interval [start,end) covers some unmapped address ranges,
2267	 * just ignore them, but return -ENOMEM at the end.
2268	 */
2269	vma = find_vma(current->mm, start);
2270	unmapped_error = 0;
2271	for (;;) {
2272		/* Still start < end. */
2273		error = -ENOMEM;
2274		if (!vma)
2275			goto out;
2276		/* Here start < vma->vm_end. */
2277		if (start < vma->vm_start) {
2278			unmapped_error = -ENOMEM;
2279			start = vma->vm_start;
2280		}
2281		/* Here vma->vm_start <= start < vma->vm_end. */
2282		if (end <= vma->vm_end) {
2283			if (start < end) {
2284				error = msync_interval(vma, start, end, flags);
2285				if (error)
2286					goto out;
2287			}
2288			error = unmapped_error;
2289			goto out;
2290		}
2291		/* Here vma->vm_start <= start < vma->vm_end < end. */
2292		error = msync_interval(vma, start, vma->vm_end, flags);
2293		if (error)
2294			goto out;
2295		start = vma->vm_end;
2296		vma = vma->vm_next;
2297	}
2298out:
2299	up_read(&current->mm->mmap_sem);
2300	return error;
2301}
2302
2303static inline void setup_read_behavior(struct vm_area_struct * vma,
2304	int behavior)
2305{
2306	VM_ClearReadHint(vma);
2307	switch(behavior) {
2308		case MADV_SEQUENTIAL:
2309			vma->vm_flags |= VM_SEQ_READ;
2310			break;
2311		case MADV_RANDOM:
2312			vma->vm_flags |= VM_RAND_READ;
2313			break;
2314		default:
2315			break;
2316	}
2317	return;
2318}
2319
2320static long madvise_fixup_start(struct vm_area_struct * vma,
2321	unsigned long end, int behavior)
2322{
2323	struct vm_area_struct * n;
2324	struct mm_struct * mm = vma->vm_mm;
2325
2326	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2327	if (!n)
2328		return -EAGAIN;
2329	*n = *vma;
2330	n->vm_end = end;
2331	setup_read_behavior(n, behavior);
2332	n->vm_raend = 0;
2333	if (n->vm_file)
2334		get_file(n->vm_file);
2335	if (n->vm_ops && n->vm_ops->open)
2336		n->vm_ops->open(n);
2337	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
2338	lock_vma_mappings(vma);
2339	spin_lock(&mm->page_table_lock);
2340	vma->vm_start = end;
2341	__insert_vm_struct(mm, n);
2342	spin_unlock(&mm->page_table_lock);
2343	unlock_vma_mappings(vma);
2344	return 0;
2345}
2346
2347static long madvise_fixup_end(struct vm_area_struct * vma,
2348	unsigned long start, int behavior)
2349{
2350	struct vm_area_struct * n;
2351	struct mm_struct * mm = vma->vm_mm;
2352
2353	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2354	if (!n)
2355		return -EAGAIN;
2356	*n = *vma;
2357	n->vm_start = start;
2358	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
2359	setup_read_behavior(n, behavior);
2360	n->vm_raend = 0;
2361	if (n->vm_file)
2362		get_file(n->vm_file);
2363	if (n->vm_ops && n->vm_ops->open)
2364		n->vm_ops->open(n);
2365	lock_vma_mappings(vma);
2366	spin_lock(&mm->page_table_lock);
2367	vma->vm_end = start;
2368	__insert_vm_struct(mm, n);
2369	spin_unlock(&mm->page_table_lock);
2370	unlock_vma_mappings(vma);
2371	return 0;
2372}
2373
2374static long madvise_fixup_middle(struct vm_area_struct * vma,
2375	unsigned long start, unsigned long end, int behavior)
2376{
2377	struct vm_area_struct * left, * right;
2378	struct mm_struct * mm = vma->vm_mm;
2379
2380	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2381	if (!left)
2382		return -EAGAIN;
2383	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2384	if (!right) {
2385		kmem_cache_free(vm_area_cachep, left);
2386		return -EAGAIN;
2387	}
2388	*left = *vma;
2389	*right = *vma;
2390	left->vm_end = start;
2391	right->vm_start = end;
2392	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
2393	left->vm_raend = 0;
2394	right->vm_raend = 0;
2395	if (vma->vm_file)
2396		atomic_add(2, &vma->vm_file->f_count);
2397
2398	if (vma->vm_ops && vma->vm_ops->open) {
2399		vma->vm_ops->open(left);
2400		vma->vm_ops->open(right);
2401	}
2402	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
2403	vma->vm_raend = 0;
2404	lock_vma_mappings(vma);
2405	spin_lock(&mm->page_table_lock);
2406	vma->vm_start = start;
2407	vma->vm_end = end;
2408	setup_read_behavior(vma, behavior);
2409	__insert_vm_struct(mm, left);
2410	__insert_vm_struct(mm, right);
2411	spin_unlock(&mm->page_table_lock);
2412	unlock_vma_mappings(vma);
2413	return 0;
2414}
2415
2416/*
2417 * We can potentially split a vm area into separate
2418 * areas, each area with its own behavior.
2419 */
2420static long madvise_behavior(struct vm_area_struct * vma,
2421	unsigned long start, unsigned long end, int behavior)
2422{
2423	int error = 0;
2424
2425	/* This caps the number of vma's this process can own */
2426	if (vma->vm_mm->map_count > max_map_count)
2427		return -ENOMEM;
2428
2429	if (start == vma->vm_start) {
2430		if (end == vma->vm_end) {
2431			setup_read_behavior(vma, behavior);
2432			vma->vm_raend = 0;
2433		} else
2434			error = madvise_fixup_start(vma, end, behavior);
2435	} else {
2436		if (end == vma->vm_end)
2437			error = madvise_fixup_end(vma, start, behavior);
2438		else
2439			error = madvise_fixup_middle(vma, start, end, behavior);
2440	}
2441
2442	return error;
2443}
2444
2445/*
2446 * Schedule all required I/O operations, then run the disk queue
2447 * to make sure they are started.  Do not wait for completion.
2448 */
2449static long madvise_willneed(struct vm_area_struct * vma,
2450	unsigned long start, unsigned long end)
2451{
2452	long error = -EBADF;
2453	struct file * file;
2454	unsigned long size, rlim_rss;
2455
2456	/* Doesn't work if there's no mapped file. */
2457	if (!vma->vm_file)
2458		return error;
2459	file = vma->vm_file;
2460	size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
2461							PAGE_CACHE_SHIFT;
2462
2463	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2464	if (end > vma->vm_end)
2465		end = vma->vm_end;
2466	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2467
2468	/* Make sure this doesn't exceed the process's max rss. */
2469	error = -EIO;
2470	rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
2471				LONG_MAX; /* default: see resource.h */
2472	if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
2473		return error;
2474
2475	/* round to cluster boundaries if this isn't a "random" area. */
2476	if (!VM_RandomReadHint(vma)) {
2477		start = CLUSTER_OFFSET(start);
2478		end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
2479
2480		while ((start < end) && (start < size)) {
2481			error = read_cluster_nonblocking(file, start, size);
2482			start += CLUSTER_PAGES;
2483			if (error < 0)
2484				break;
2485		}
2486	} else {
2487		while ((start < end) && (start < size)) {
2488			error = page_cache_read(file, start);
2489			start++;
2490			if (error < 0)
2491				break;
2492		}
2493	}
2494
2495	/* Don't wait for someone else to push these requests. */
2496	run_task_queue(&tq_disk);
2497
2498	return error;
2499}
2500
2501/*
2502 * Application no longer needs these pages.  If the pages are dirty,
2503 * it's OK to just throw them away.  The app will be more careful about
2504 * data it wants to keep.  Be sure to free swap resources too.  The
2505 * zap_page_range call sets things up for refill_inactive to actually free
2506 * these pages later if no one else has touched them in the meantime,
2507 * although we could add these pages to a global reuse list for
2508 * refill_inactive to pick up before reclaiming other pages.
2509 *
2510 * NB: This interface discards data rather than pushes it out to swap,
2511 * as some implementations do.  This has performance implications for
2512 * applications like large transactional databases which want to discard
2513 * pages in anonymous maps after committing to backing store the data
2514 * that was kept in them.  There is no reason to write this data out to
2515 * the swap area if the application is discarding it.
2516 *
2517 * An interface that causes the system to free clean pages and flush
2518 * dirty pages is already available as msync(MS_INVALIDATE).
2519 */
2520static long madvise_dontneed(struct vm_area_struct * vma,
2521	unsigned long start, unsigned long end)
2522{
2523	if (vma->vm_flags & VM_LOCKED)
2524		return -EINVAL;
2525
2526	zap_page_range(vma->vm_mm, start, end - start);
2527	return 0;
2528}
2529
2530static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2531	unsigned long end, int behavior)
2532{
2533	long error = -EBADF;
2534
2535	switch (behavior) {
2536	case MADV_NORMAL:
2537	case MADV_SEQUENTIAL:
2538	case MADV_RANDOM:
2539		error = madvise_behavior(vma, start, end, behavior);
2540		break;
2541
2542	case MADV_WILLNEED:
2543		error = madvise_willneed(vma, start, end);
2544		break;
2545
2546	case MADV_DONTNEED:
2547		error = madvise_dontneed(vma, start, end);
2548		break;
2549
2550	default:
2551		error = -EINVAL;
2552		break;
2553	}
2554
2555	return error;
2556}
2557
2558/*
2559 * The madvise(2) system call.
2560 *
2561 * Applications can use madvise() to advise the kernel how it should
2562 * handle paging I/O in this VM area.  The idea is to help the kernel
2563 * use appropriate read-ahead and caching techniques.  The information
2564 * provided is advisory only, and can be safely disregarded by the
2565 * kernel without affecting the correct operation of the application.
2566 *
2567 * behavior values:
2568 *  MADV_NORMAL - the default behavior is to read clusters.  This
2569 *		results in some read-ahead and read-behind.
2570 *  MADV_RANDOM - the system should read the minimum amount of data
2571 *		on any access, since it is unlikely that the appli-
2572 *		cation will need more than what it asks for.
2573 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2574 *		once, so they can be aggressively read ahead, and
2575 *		can be freed soon after they are accessed.
2576 *  MADV_WILLNEED - the application is notifying the system to read
2577 *		some pages ahead.
2578 *  MADV_DONTNEED - the application is finished with the given range,
2579 *		so the kernel can free resources associated with it.
2580 *
2581 * return values:
2582 *  zero    - success
2583 *  -EINVAL - start + len < 0, start is not page-aligned,
2584 *		"behavior" is not a valid value, or application
2585 *		is attempting to release locked or shared pages.
2586 *  -ENOMEM - addresses in the specified range are not currently
2587 *		mapped, or are outside the AS of the process.
2588 *  -EIO    - an I/O error occurred while paging in data.
2589 *  -EBADF  - map exists, but area maps something that isn't a file.
2590 *  -EAGAIN - a kernel resource was temporarily unavailable.
2591 */
2592asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2593{
2594	unsigned long end;
2595	struct vm_area_struct * vma;
2596	int unmapped_error = 0;
2597	int error = -EINVAL;
2598
2599	down_write(&current->mm->mmap_sem);
2600
2601	if (start & ~PAGE_MASK)
2602		goto out;
2603	len = (len + ~PAGE_MASK) & PAGE_MASK;
2604	end = start + len;
2605	if (end < start)
2606		goto out;
2607
2608	error = 0;
2609	if (end == start)
2610		goto out;
2611
2612	/*
2613	 * If the interval [start,end) covers some unmapped address
2614	 * ranges, just ignore them, but return -ENOMEM at the end.
2615	 */
2616	vma = find_vma(current->mm, start);
2617	for (;;) {
2618		/* Still start < end. */
2619		error = -ENOMEM;
2620		if (!vma)
2621			goto out;
2622
2623		/* Here start < vma->vm_end. */
2624		if (start < vma->vm_start) {
2625			unmapped_error = -ENOMEM;
2626			start = vma->vm_start;
2627		}
2628
2629		/* Here vma->vm_start <= start < vma->vm_end. */
2630		if (end <= vma->vm_end) {
2631			if (start < end) {
2632				error = madvise_vma(vma, start, end,
2633							behavior);
2634				if (error)
2635					goto out;
2636			}
2637			error = unmapped_error;
2638			goto out;
2639		}
2640
2641		/* Here vma->vm_start <= start < vma->vm_end < end. */
2642		error = madvise_vma(vma, start, vma->vm_end, behavior);
2643		if (error)
2644			goto out;
2645		start = vma->vm_end;
2646		vma = vma->vm_next;
2647	}
2648
2649out:
2650	up_write(&current->mm->mmap_sem);
2651	return error;
2652}
2653
2654/*
2655 * Later we can get more picky about what "in core" means precisely.
2656 * For now, simply check to see if the page is in the page cache,
2657 * and is up to date; i.e. that no page-in operation would be required
2658 * at this time if an application were to map and access this page.
2659 */
2660static unsigned char mincore_page(struct vm_area_struct * vma,
2661	unsigned long pgoff)
2662{
2663	unsigned char present = 0;
2664	struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
2665	struct page * page, ** hash = page_hash(as, pgoff);
2666
2667	spin_lock(&pagecache_lock);
2668	page = __find_page_nolock(as, pgoff, *hash);
2669	if ((page) && (Page_Uptodate(page)))
2670		present = 1;
2671	spin_unlock(&pagecache_lock);
2672
2673	return present;
2674}
2675
2676static long mincore_vma(struct vm_area_struct * vma,
2677	unsigned long start, unsigned long end, unsigned char * vec)
2678{
2679	long error, i, remaining;
2680	unsigned char * tmp;
2681
2682	error = -ENOMEM;
2683	if (!vma->vm_file)
2684		return error;
2685
2686	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2687	if (end > vma->vm_end)
2688		end = vma->vm_end;
2689	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2690
2691	error = -EAGAIN;
2692	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
2693	if (!tmp)
2694		return error;
2695
2696	/* (end - start) is # of pages, and also # of bytes in "vec */
2697	remaining = (end - start),
2698
2699	error = 0;
2700	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
2701		int j = 0;
2702		long thispiece = (remaining < PAGE_SIZE) ?
2703						remaining : PAGE_SIZE;
2704
2705		while (j < thispiece)
2706			tmp[j++] = mincore_page(vma, start++);
2707
2708		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
2709			error = -EFAULT;
2710			break;
2711		}
2712	}
2713
2714	free_page((unsigned long) tmp);
2715	return error;
2716}
2717
2718/*
2719 * The mincore(2) system call.
2720 *
2721 * mincore() returns the memory residency status of the pages in the
2722 * current process's address space specified by [addr, addr + len).
2723 * The status is returned in a vector of bytes.  The least significant
2724 * bit of each byte is 1 if the referenced page is in memory, otherwise
2725 * it is zero.
2726 *
2727 * Because the status of a page can change after mincore() checks it
2728 * but before it returns to the application, the returned vector may
2729 * contain stale information.  Only locked pages are guaranteed to
2730 * remain in memory.
2731 *
2732 * return values:
2733 *  zero    - success
2734 *  -EFAULT - vec points to an illegal address
2735 *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
2736 *		or len has a nonpositive value
2737 *  -ENOMEM - Addresses in the range [addr, addr + len] are
2738 *		invalid for the address space of this process, or
2739 *		specify one or more pages which are not currently
2740 *		mapped
2741 *  -EAGAIN - A kernel resource was temporarily unavailable.
2742 */
2743asmlinkage long sys_mincore(unsigned long start, size_t len,
2744	unsigned char * vec)
2745{
2746	int index = 0;
2747	unsigned long end;
2748	struct vm_area_struct * vma;
2749	int unmapped_error = 0;
2750	long error = -EINVAL;
2751
2752	down_read(&current->mm->mmap_sem);
2753
2754	if (start & ~PAGE_CACHE_MASK)
2755		goto out;
2756	len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
2757	end = start + len;
2758	if (end < start)
2759		goto out;
2760
2761	error = 0;
2762	if (end == start)
2763		goto out;
2764
2765	/*
2766	 * If the interval [start,end) covers some unmapped address
2767	 * ranges, just ignore them, but return -ENOMEM at the end.
2768	 */
2769	vma = find_vma(current->mm, start);
2770	for (;;) {
2771		/* Still start < end. */
2772		error = -ENOMEM;
2773		if (!vma)
2774			goto out;
2775
2776		/* Here start < vma->vm_end. */
2777		if (start < vma->vm_start) {
2778			unmapped_error = -ENOMEM;
2779			start = vma->vm_start;
2780		}
2781
2782		/* Here vma->vm_start <= start < vma->vm_end. */
2783		if (end <= vma->vm_end) {
2784			if (start < end) {
2785				error = mincore_vma(vma, start, end,
2786							&vec[index]);
2787				if (error)
2788					goto out;
2789			}
2790			error = unmapped_error;
2791			goto out;
2792		}
2793
2794		/* Here vma->vm_start <= start < vma->vm_end < end. */
2795		error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
2796		if (error)
2797			goto out;
2798		index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
2799		start = vma->vm_end;
2800		vma = vma->vm_next;
2801	}
2802
2803out:
2804	up_read(&current->mm->mmap_sem);
2805	return error;
2806}
2807
2808static inline
2809struct page *__read_cache_page(struct address_space *mapping,
2810				unsigned long index,
2811				int (*filler)(void *,struct page*),
2812				void *data)
2813{
2814	struct page **hash = page_hash(mapping, index);
2815	struct page *page, *cached_page = NULL;
2816	int err;
2817repeat:
2818	page = __find_get_page(mapping, index, hash);
2819	if (!page) {
2820		if (!cached_page) {
2821			cached_page = page_cache_alloc(mapping);
2822			if (!cached_page)
2823				return ERR_PTR(-ENOMEM);
2824		}
2825		page = cached_page;
2826		if (add_to_page_cache_unique(page, mapping, index, hash))
2827			goto repeat;
2828		cached_page = NULL;
2829		err = filler(data, page);
2830		if (err < 0) {
2831			page_cache_release(page);
2832			page = ERR_PTR(err);
2833		}
2834	}
2835	if (cached_page)
2836		page_cache_release(cached_page);
2837	return page;
2838}
2839
2840/*
2841 * Read into the page cache. If a page already exists,
2842 * and Page_Uptodate() is not set, try to fill the page.
2843 */
2844struct page *read_cache_page(struct address_space *mapping,
2845				unsigned long index,
2846				int (*filler)(void *,struct page*),
2847				void *data)
2848{
2849	struct page *page;
2850	int err;
2851
2852retry:
2853	page = __read_cache_page(mapping, index, filler, data);
2854	if (IS_ERR(page))
2855		goto out;
2856	mark_page_accessed(page);
2857	if (Page_Uptodate(page))
2858		goto out;
2859
2860	lock_page(page);
2861	if (!page->mapping) {
2862		UnlockPage(page);
2863		page_cache_release(page);
2864		goto retry;
2865	}
2866	if (Page_Uptodate(page)) {
2867		UnlockPage(page);
2868		goto out;
2869	}
2870	err = filler(data, page);
2871	if (err < 0) {
2872		page_cache_release(page);
2873		page = ERR_PTR(err);
2874	}
2875 out:
2876	return page;
2877}
2878
2879static inline struct page * __grab_cache_page(struct address_space *mapping,
2880				unsigned long index, struct page **cached_page)
2881{
2882	struct page *page, **hash = page_hash(mapping, index);
2883repeat:
2884	page = __find_lock_page(mapping, index, hash);
2885	if (!page) {
2886		if (!*cached_page) {
2887			*cached_page = page_cache_alloc(mapping);
2888			if (!*cached_page)
2889				return NULL;
2890		}
2891		page = *cached_page;
2892		if (add_to_page_cache_unique(page, mapping, index, hash))
2893			goto repeat;
2894		*cached_page = NULL;
2895	}
2896	return page;
2897}
2898
2899inline void remove_suid(struct inode *inode)
2900{
2901	unsigned int mode;
2902
2903	/* set S_IGID if S_IXGRP is set, and always set S_ISUID */
2904	mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
2905
2906	/* was any of the uid bits set? */
2907	mode &= inode->i_mode;
2908	if (mode && !capable(CAP_FSETID)) {
2909		inode->i_mode &= ~mode;
2910		mark_inode_dirty(inode);
2911	}
2912}
2913
2914/*
2915 * Write to a file through the page cache.
2916 *
2917 * We currently put everything into the page cache prior to writing it.
2918 * This is not a problem when writing full pages. With partial pages,
2919 * however, we first have to read the data into the cache, then
2920 * dirty the page, and finally schedule it for writing. Alternatively, we
2921 * could write-through just the portion of data that would go into that
2922 * page, but that would kill performance for applications that write data
2923 * line by line, and it's prone to race conditions.
2924 *
2925 * Note that this routine doesn't try to keep track of dirty pages. Each
2926 * file system has to do this all by itself, unfortunately.
2927 *							okir@monad.swb.de
2928 */
2929ssize_t
2930generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
2931{
2932	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2933	struct inode	*inode = mapping->host;
2934	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
2935	loff_t		pos;
2936	struct page	*page, *cached_page;
2937	ssize_t		written;
2938	long		status = 0;
2939	int		err;
2940	unsigned	bytes;
2941
2942	if ((ssize_t) count < 0)
2943		return -EINVAL;
2944
2945	if (!access_ok(VERIFY_READ, buf, count))
2946		return -EFAULT;
2947
2948	cached_page = NULL;
2949
2950	down(&inode->i_sem);
2951
2952	pos = *ppos;
2953	err = -EINVAL;
2954	if (pos < 0)
2955		goto out;
2956
2957	err = file->f_error;
2958	if (err) {
2959		file->f_error = 0;
2960		goto out;
2961	}
2962
2963	written = 0;
2964
2965	if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
2966		pos = inode->i_size;
2967
2968	/*
2969	 * Check whether we've reached the file size limit.
2970	 */
2971	err = -EFBIG;
2972
2973	if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) {
2974		if (pos >= limit) {
2975			send_sig(SIGXFSZ, current, 0);
2976			goto out;
2977		}
2978		if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) {
2979			/* send_sig(SIGXFSZ, current, 0); */
2980			count = limit - (u32)pos;
2981		}
2982	}
2983
2984	/*
2985	 *	LFS rule
2986	 */
2987	if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
2988		if (pos >= MAX_NON_LFS) {
2989			send_sig(SIGXFSZ, current, 0);
2990			goto out;
2991		}
2992		if (count > MAX_NON_LFS - (u32)pos) {
2993			/* send_sig(SIGXFSZ, current, 0); */
2994			count = MAX_NON_LFS - (u32)pos;
2995		}
2996	}
2997
2998	/*
2999	 *	Are we about to exceed the fs block limit ?
3000	 *
3001	 *	If we have written data it becomes a short write
3002	 *	If we have exceeded without writing data we send
3003	 *	a signal and give them an EFBIG.
3004	 *
3005	 *	Linus frestrict idea will clean these up nicely..
3006	 */
3007
3008	if (!S_ISBLK(inode->i_mode)) {
3009		if (pos >= inode->i_sb->s_maxbytes)
3010		{
3011			if (count || pos > inode->i_sb->s_maxbytes) {
3012				send_sig(SIGXFSZ, current, 0);
3013				err = -EFBIG;
3014				goto out;
3015			}
3016			/* zero-length writes at ->s_maxbytes are OK */
3017		}
3018
3019		if (pos + count > inode->i_sb->s_maxbytes)
3020			count = inode->i_sb->s_maxbytes - pos;
3021	} else {
3022		if (is_read_only(inode->i_rdev)) {
3023			err = -EPERM;
3024			goto out;
3025		}
3026		if (pos >= inode->i_size) {
3027			if (count || pos > inode->i_size) {
3028				err = -ENOSPC;
3029				goto out;
3030			}
3031		}
3032
3033		if (pos + count > inode->i_size)
3034			count = inode->i_size - pos;
3035	}
3036
3037	err = 0;
3038	if (count == 0)
3039		goto out;
3040
3041	remove_suid(inode);
3042	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3043	mark_inode_dirty_sync(inode);
3044
3045	if (file->f_flags & O_DIRECT)
3046		goto o_direct;
3047
3048	do {
3049		unsigned long index, offset;
3050		long page_fault;
3051		char *kaddr;
3052
3053		/*
3054		 * Try to find the page in the cache. If it isn't there,
3055		 * allocate a free page.
3056		 */
3057		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
3058		index = pos >> PAGE_CACHE_SHIFT;
3059		bytes = PAGE_CACHE_SIZE - offset;
3060		if (bytes > count)
3061			bytes = count;
3062
3063		/*
3064		 * Bring in the user page that we will copy from _first_.
3065		 * Otherwise there's a nasty deadlock on copying from the
3066		 * same page as we're writing to, without it being marked
3067		 * up-to-date.
3068		 */
3069		{ volatile unsigned char dummy;
3070			__get_user(dummy, buf);
3071			__get_user(dummy, buf+bytes-1);
3072		}
3073
3074		status = -ENOMEM;	/* we'll assign it later anyway */
3075		page = __grab_cache_page(mapping, index, &cached_page);
3076		if (!page)
3077			break;
3078
3079		/* We have exclusive IO access to the page.. */
3080		if (!PageLocked(page)) {
3081			PAGE_BUG(page);
3082		}
3083
3084		kaddr = kmap(page);
3085		status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
3086		if (status)
3087			goto sync_failure;
3088		page_fault = __copy_from_user(kaddr+offset, buf, bytes);
3089		flush_dcache_page(page);
3090		status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
3091		if (page_fault)
3092			goto fail_write;
3093		if (!status)
3094			status = bytes;
3095
3096		if (status >= 0) {
3097			written += status;
3098			count -= status;
3099			pos += status;
3100			buf += status;
3101		}
3102unlock:
3103		kunmap(page);
3104		/* Mark it unlocked again and drop the page.. */
3105		SetPageReferenced(page);
3106		UnlockPage(page);
3107		page_cache_release(page);
3108
3109		if (status < 0)
3110			break;
3111	} while (count);
3112done:
3113	*ppos = pos;
3114
3115	if (cached_page)
3116		page_cache_release(cached_page);
3117
3118	/* For now, when the user asks for O_SYNC, we'll actually
3119	 * provide O_DSYNC. */
3120	if (status >= 0) {
3121		if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
3122			status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
3123	}
3124
3125out_status:
3126	err = written ? written : status;
3127out:
3128
3129	up(&inode->i_sem);
3130	return err;
3131fail_write:
3132	status = -EFAULT;
3133	goto unlock;
3134
3135sync_failure:
3136	/*
3137	 * If blocksize < pagesize, prepare_write() may have instantiated a
3138	 * few blocks outside i_size.  Trim these off again.
3139	 */
3140	kunmap(page);
3141	UnlockPage(page);
3142	page_cache_release(page);
3143	if (pos + bytes > inode->i_size)
3144		vmtruncate(inode, inode->i_size);
3145	goto done;
3146
3147o_direct:
3148	written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
3149	if (written > 0) {
3150		loff_t end = pos + written;
3151		if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
3152			inode->i_size = end;
3153			mark_inode_dirty(inode);
3154		}
3155		*ppos = end;
3156		invalidate_inode_pages2(mapping);
3157	}
3158	/*
3159	 * Sync the fs metadata but not the minor inode changes and
3160	 * of course not the data as we did direct DMA for the IO.
3161	 */
3162	if (written >= 0 && file->f_flags & O_SYNC)
3163		status = generic_osync_inode(inode, OSYNC_METADATA);
3164	goto out_status;
3165}
3166
3167void __init page_cache_init(unsigned long mempages)
3168{
3169	unsigned long htable_size, order;
3170
3171	htable_size = mempages;
3172	htable_size *= sizeof(struct page *);
3173	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
3174		;
3175
3176	do {
3177		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
3178
3179		page_hash_bits = 0;
3180		while((tmp >>= 1UL) != 0UL)
3181			page_hash_bits++;
3182
3183		page_hash_table = (struct page **)
3184			__get_free_pages(GFP_ATOMIC, order);
3185	} while(page_hash_table == NULL && --order > 0);
3186
3187	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
3188	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
3189	if (!page_hash_table)
3190		panic("Failed to allocate page hash table\n");
3191	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
3192}
3193