/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/iobuf.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mman.h>

#include <linux/highmem.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

atomic_t page_cache_size = ATOMIC_INIT(0);
unsigned int page_hash_bits;
struct page **page_hash_table;

int vm_max_readahead = 31;
int vm_min_readahead = 3;
EXPORT_SYMBOL(vm_max_readahead);
EXPORT_SYMBOL(vm_min_readahead);


spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED};
/*
 * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
 *	with the pagecache_lock held.
 *
 * Ordering:
 *	swap_lock ->
 *		pagemap_lru_lock ->
 *			pagecache_lock
 */
spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
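
/*
 * For illustration only: a path that needs both locks (as
 * invalidate_inode_pages() below does) takes them in the order
 * documented above and releases them in reverse:
 *
 *	spin_lock(&pagemap_lru_lock);
 *	spin_lock(&pagecache_lock);
 *	... pull the page off the LRU and out of the page cache ...
 *	spin_unlock(&pagecache_lock);
 *	spin_unlock(&pagemap_lru_lock);
 */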

#define CLUSTER_PAGES		(1 << page_cluster)
#define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)

static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
static void add_page_to_hash_queue(struct page * page, struct page **p)
{
	struct page *next = *p;

	*p = page;
	page->next_hash = next;
	page->pprev_hash = p;
	if (next)
		next->pprev_hash = &page->next_hash;
	if (page->buffers)
		PAGE_BUG(page);
	atomic_inc(&page_cache_size);
}

static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
{
	struct list_head *head = &mapping->clean_pages;

	mapping->nrpages++;
	list_add(&page->list, head);
	page->mapping = mapping;
}

static inline void remove_page_from_inode_queue(struct page * page)
{
	struct address_space * mapping = page->mapping;

	mapping->nrpages--;
	list_del(&page->list);
	page->mapping = NULL;
}

static inline void remove_page_from_hash_queue(struct page * page)
{
	struct page *next = page->next_hash;
	struct page **pprev = page->pprev_hash;

	if (next)
		next->pprev_hash = pprev;
	*pprev = next;
	page->pprev_hash = NULL;
	atomic_dec(&page_cache_size);
}

/*
 * Remove a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.
 */
void __remove_inode_page(struct page *page)
{
	if (PageDirty(page) && !PageSwapCache(page))
		BUG();
	remove_page_from_inode_queue(page);
	remove_page_from_hash_queue(page);
}

void remove_inode_page(struct page *page)
{
	if (!PageLocked(page))
		PAGE_BUG(page);

	spin_lock(&pagecache_lock);
	__remove_inode_page(page);
	spin_unlock(&pagecache_lock);
}

static inline int sync_page(struct page *page)
{
	struct address_space *mapping = page->mapping;

	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
		return mapping->a_ops->sync_page(page);
	return 0;
}

/*
 * Add a page to the dirty page list.
 */
void set_page_dirty(struct page *page)
{
	if (!test_and_set_bit(PG_dirty, &page->flags)) {
		struct address_space *mapping = page->mapping;

		if (mapping) {
			spin_lock(&pagecache_lock);
			mapping = page->mapping;
			if (mapping) {	/* may have been truncated */
				list_del(&page->list);
				list_add(&page->list, &mapping->dirty_pages);
			}
			spin_unlock(&pagecache_lock);

			if (mapping && mapping->host)
				mark_inode_dirty_pages(mapping->host);
		}
	}
}

/**
 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 * @inode: the inode which pages we want to invalidate
 *
 * This function only removes the unlocked pages, if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 */

void invalidate_inode_pages(struct inode * inode)
{
	struct list_head *head, *curr;
	struct page * page;

	head = &inode->i_mapping->clean_pages;

	spin_lock(&pagemap_lru_lock);
	spin_lock(&pagecache_lock);
	curr = head->next;

	while (curr != head) {
		page = list_entry(curr, struct page, list);
		curr = curr->next;

		/* We cannot invalidate something that is dirty.. */
		if (PageDirty(page))
			continue;

		/* ..or locked */
		if (TryLockPage(page))
			continue;

		if (page->buffers && !try_to_free_buffers(page, 0))
			goto unlock;

		if (page_count(page) != 1)
			goto unlock;

		__lru_cache_del(page);
		__remove_inode_page(page);
		UnlockPage(page);
		page_cache_release(page);
		continue;
unlock:
		UnlockPage(page);
		continue;
	}

	spin_unlock(&pagecache_lock);
	spin_unlock(&pagemap_lru_lock);
}

static int do_flushpage(struct page *page, unsigned long offset)
{
	int (*flushpage) (struct page *, unsigned long);
	flushpage = page->mapping->a_ops->flushpage;
	if (flushpage)
		return (*flushpage)(page, offset);
	return block_flushpage(page, offset);
}

static inline void truncate_partial_page(struct page *page, unsigned partial)
{
	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
	if (page->buffers)
		do_flushpage(page, partial);
}

static void truncate_complete_page(struct page *page)
{
	/* Leave it on the LRU if it gets converted into anonymous buffers */
	if (!page->buffers || do_flushpage(page, 0))
		lru_cache_del(page);

	/*
	 * We remove the page from the page cache _after_ we have
	 * destroyed all buffer-cache references to it. Otherwise some
	 * other process might think this inode page is not in the
	 * page cache and creates a buffer-cache alias to it causing
	 * all sorts of fun problems ...
	 */
	ClearPageDirty(page);
	ClearPageUptodate(page);
	remove_inode_page(page);
	page_cache_release(page);
}

static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
{
	struct list_head *curr;
	struct page * page;
	int unlocked = 0;

 restart:
	curr = head->prev;
	while (curr != head) {
		unsigned long offset;

		page = list_entry(curr, struct page, list);
		offset = page->index;

		/* Is one of the pages to truncate? */
		if ((offset >= start) || (*partial && (offset + 1) == start)) {
			int failed;

			page_cache_get(page);
			failed = TryLockPage(page);

			list_del(head);
			if (!failed)
				/* Restart after this page */
				list_add_tail(head, curr);
			else
				/* Restart on this page */
				list_add(head, curr);

			spin_unlock(&pagecache_lock);
			unlocked = 1;

			if (!failed) {
				if (*partial && (offset + 1) == start) {
					truncate_partial_page(page, *partial);
					*partial = 0;
				} else
					truncate_complete_page(page);

				UnlockPage(page);
			} else
				wait_on_page(page);

			page_cache_release(page);

			if (current->need_resched) {
				__set_current_state(TASK_RUNNING);
				schedule();
			}

			spin_lock(&pagecache_lock);
			goto restart;
		}
		curr = curr->prev;
	}
	return unlocked;
}


/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 * If any page is locked we wait for it to become unlocked.
 */
void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
{
	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	int unlocked;

	spin_lock(&pagecache_lock);
	do {
		unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
		unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
		unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
	} while (unlocked);
	/* Traversed all three lists without dropping the lock */
	spin_unlock(&pagecache_lock);
}

static inline int invalidate_this_page2(struct page * page,
					struct list_head * curr,
					struct list_head * head)
{
	int unlocked = 1;

	/*
	 * The page is locked and we hold the pagecache_lock as well
	 * so both page_count(page) and page->buffers stay constant here.
	 */
	if (page_count(page) == 1 + !!page->buffers) {
		/* Restart after this page */
		list_del(head);
		list_add_tail(head, curr);

		page_cache_get(page);
		spin_unlock(&pagecache_lock);
		truncate_complete_page(page);
	} else {
		if (page->buffers) {
			/* Restart after this page */
			list_del(head);
			list_add_tail(head, curr);

			page_cache_get(page);
			spin_unlock(&pagecache_lock);
			block_invalidate_page(page);
		} else
			unlocked = 0;

		ClearPageDirty(page);
		ClearPageUptodate(page);
	}

	return unlocked;
}

static int FASTCALL(invalidate_list_pages2(struct list_head *));
static int invalidate_list_pages2(struct list_head *head)
{
	struct list_head *curr;
	struct page * page;
	int unlocked = 0;

 restart:
	curr = head->prev;
	while (curr != head) {
		page = list_entry(curr, struct page, list);

		if (!TryLockPage(page)) {
			int __unlocked;

			__unlocked = invalidate_this_page2(page, curr, head);
			UnlockPage(page);
			unlocked |= __unlocked;
			if (!__unlocked) {
				curr = curr->prev;
				continue;
			}
		} else {
			/* Restart on this page */
			list_del(head);
			list_add(head, curr);

			page_cache_get(page);
			spin_unlock(&pagecache_lock);
			unlocked = 1;
			wait_on_page(page);
		}

		page_cache_release(page);
		if (current->need_resched) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		spin_lock(&pagecache_lock);
		goto restart;
	}
	return unlocked;
}

/**
 * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
 * free the pages because they're mapped.
 * @mapping: the address_space which pages we want to invalidate
 */
void invalidate_inode_pages2(struct address_space * mapping)
{
	int unlocked;

	spin_lock(&pagecache_lock);
	do {
		unlocked = invalidate_list_pages2(&mapping->clean_pages);
		unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
		unlocked |= invalidate_list_pages2(&mapping->locked_pages);
	} while (unlocked);
	spin_unlock(&pagecache_lock);
}

static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
	goto inside;

	for (;;) {
		page = page->next_hash;
inside:
		if (!page)
			goto not_found;
		if (page->mapping != mapping)
			continue;
		if (page->index == offset)
			break;
	}

not_found:
	return page;
}

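/*
 * The goto-based loop in __find_page_nolock() above is just a plain
 * hash-chain walk; an equivalent (purely illustrative) spelling is:
 *
 *	for (; page; page = page->next_hash)
 *		if (page->mapping == mapping && page->index == offset)
 *			break;
 *	return page;
 */
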
static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
{
	struct list_head *curr;
	struct page *page;
	int retval = 0;

	spin_lock(&pagecache_lock);
	curr = head->next;
	while (curr != head) {
		page = list_entry(curr, struct page, list);
		curr = curr->next;
		if (!page->buffers)
			continue;
		if (page->index >= end)
			continue;
		if (page->index < start)
			continue;

		page_cache_get(page);
		spin_unlock(&pagecache_lock);
		lock_page(page);

		/* The buffers could have been free'd while we waited for the page lock */
		if (page->buffers)
			retval |= fn(page);

		UnlockPage(page);
		spin_lock(&pagecache_lock);
		curr = page->list.next;
		page_cache_release(page);
	}
	spin_unlock(&pagecache_lock);

	return retval;
}

/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
{
	int retval;

	/* writeout dirty buffers on pages from both clean and dirty lists */
	retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);

	/* now wait for locked buffers on pages from both clean and dirty lists */
	retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);

	return retval;
}

/*
 * In-memory filesystems have to fail their
 * writepage function - and this has to be
 * worked around in the VM layer..
 *
 * We
 *  - mark the page dirty again (but do NOT
 *    add it back to the inode dirty list, as
 *    that would livelock in fdatasync)
 *  - activate the page so that the page stealer
 *    doesn't try to write it out over and over
 *    again.
 */
int fail_writepage(struct page *page)
{
	/* Only activate on memory-pressure, not fsync.. */
	if (PageLaunder(page)) {
		activate_page(page);
		SetPageReferenced(page);
	}

	/* Set the page dirty again, unlock */
	SetPageDirty(page);
	UnlockPage(page);
	return 0;
}

EXPORT_SYMBOL(fail_writepage);

/**
 * filemap_fdatasync - walk the list of dirty pages of the given address space
 * and writepage() all of them.
 *
 * @mapping: address space structure to write
 *
 */
int filemap_fdatasync(struct address_space * mapping)
{
	int ret = 0;
	int (*writepage)(struct page *) = mapping->a_ops->writepage;

	spin_lock(&pagecache_lock);

	while (!list_empty(&mapping->dirty_pages)) {
		struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);

		list_del(&page->list);
		list_add(&page->list, &mapping->locked_pages);

		if (!PageDirty(page))
			continue;

		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		lock_page(page);

		if (PageDirty(page)) {
			int err;
			ClearPageDirty(page);
			err = writepage(page);
			if (err && !ret)
				ret = err;
		} else
			UnlockPage(page);

		page_cache_release(page);
		spin_lock(&pagecache_lock);
	}
	spin_unlock(&pagecache_lock);
	return ret;
}

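/*
 * Illustrative pairing (this is the pattern generic_file_direct_IO()
 * and msync_interval() below follow): filemap_fdatasync() starts the
 * writeout, filemap_fdatawait() waits for it to complete:
 *
 *	err = filemap_fdatasync(mapping);
 *	if (err == 0)
 *		err = filemap_fdatawait(mapping);
 */
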
/**
 * filemap_fdatawait - walk the list of locked pages of the given address space
 * and wait for all of them.
 *
 * @mapping: address space structure to wait for
 *
 */
int filemap_fdatawait(struct address_space * mapping)
{
	int ret = 0;

	spin_lock(&pagecache_lock);

	while (!list_empty(&mapping->locked_pages)) {
		struct page *page = list_entry(mapping->locked_pages.next, struct page, list);

		list_del(&page->list);
		list_add(&page->list, &mapping->clean_pages);

		if (!PageLocked(page))
			continue;

		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		___wait_on_page(page);
		if (PageError(page))
			ret = -EIO;

		page_cache_release(page);
		spin_lock(&pagecache_lock);
	}
	spin_unlock(&pagecache_lock);
	return ret;
}

/*
 * Add a page to the inode page cache.
 *
 * The caller must have locked the page and
 * set all the page flags correctly..
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
{
	if (!PageLocked(page))
		BUG();

	page->index = index;
	page_cache_get(page);
	spin_lock(&pagecache_lock);
	add_page_to_inode_queue(mapping, page);
	add_page_to_hash_queue(page, page_hash(mapping, index));
	spin_unlock(&pagecache_lock);

	lru_cache_add(page);
}

/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, but unreferenced, not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	unsigned long flags;

	flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
	page->flags = flags | (1 << PG_locked);
	page_cache_get(page);
	page->index = offset;
	add_page_to_inode_queue(mapping, page);
	add_page_to_hash_queue(page, hash);
}

void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
{
	spin_lock(&pagecache_lock);
	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
	spin_unlock(&pagecache_lock);
	lru_cache_add(page);
}

int add_to_page_cache_unique(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	int err;
	struct page *alias;

	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(mapping, offset, *hash);

	err = 1;
	if (!alias) {
		__add_to_page_cache(page,mapping,offset,hash);
		err = 0;
	}

	spin_unlock(&pagecache_lock);
	if (!err)
		lru_cache_add(page);
	return err;
}

/*
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
static int page_cache_read(struct file * file, unsigned long offset)
{
	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
	struct page **hash = page_hash(mapping, offset);
	struct page *page;

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	spin_unlock(&pagecache_lock);
	if (page)
		return 0;

	page = page_cache_alloc(mapping);
	if (!page)
		return -ENOMEM;

	if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
		int error = mapping->a_ops->readpage(file, page);
		page_cache_release(page);
		return error;
	}
	/*
	 * We arrive here in the unlikely event that someone
	 * raced with us and added our page to the cache first.
	 */
	page_cache_release(page);
	return 0;
}

/*
 * Read in an entire cluster at once.  A cluster is usually a 64k-
 * aligned block that includes the page requested in "offset."
 */
static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
					     unsigned long filesize));
static int read_cluster_nonblocking(struct file * file, unsigned long offset,
	unsigned long filesize)
{
	unsigned long pages = CLUSTER_PAGES;

	offset = CLUSTER_OFFSET(offset);
	while ((pages-- > 0) && (offset < filesize)) {
		int error = page_cache_read(file, offset);
		if (error < 0)
			return error;
		offset ++;
	}

	return 0;
}

/*
 * Knuth recommends primes in approximately golden ratio to the maximum
 * integer representable by a machine word for multiplicative hashing.
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * These primes are chosen to be bit-sparse, that is operations on
 * them can use shifts and additions instead of multiplications for
 * machines where multiplications are slow.
 */
#if BITS_PER_LONG == 32
/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
#define GOLDEN_RATIO_PRIME 0x9e370001UL
#elif BITS_PER_LONG == 64
/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
#else
#error Define GOLDEN_RATIO_PRIME for your wordsize.
#endif

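/*
 * Illustrative use of the constant (a sketch, not code from this file):
 * a multiplicative hash keeps the high bits of the product, so a table
 * of 2^bits buckets would be indexed roughly as
 *
 *	bucket = (key * GOLDEN_RATIO_PRIME) >> (BITS_PER_LONG - bits);
 *
 * page_waitqueue() below does this with the shift supplied by
 * zone->wait_table_shift, and with the multiplication hand-expanded
 * into shifts and adds for the 64-bit case.
 */
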
/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
static inline wait_queue_head_t *page_waitqueue(struct page *page)
{
	const zone_t *zone = page_zone(page);
	wait_queue_head_t *wait = zone->wait_table;
	unsigned long hash = (unsigned long)page;

#if BITS_PER_LONG == 64
	/* Sigh, gcc can't optimise this alone like it does for 32 bits. */
	unsigned long n = hash;
	n <<= 18;
	hash -= n;
	n <<= 33;
	hash -= n;
	n <<= 3;
	hash += n;
	n <<= 3;
	hash -= n;
	n <<= 4;
	hash += n;
	n <<= 2;
	hash += n;
#else
	/* On some cpus multiply is faster, on others gcc will do shifts */
	hash *= GOLDEN_RATIO_PRIME;
#endif
	hash >>= zone->wait_table_shift;

	return &wait[hash];
}

/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 *
 * The waiting strategy is to get on a waitqueue determined
 * by hashing. Waiters will then collide, and the newly woken
 * task must then determine whether it was woken for the page
 * it really wanted, and go back to sleep on the waitqueue if
 * that wasn't it. With the waitqueue semantics, it never leaves
 * the waitqueue until it is explicitly removed, so the loop moves
 * forward one iteration every time there is
 * (1) a collision
 * and
 * (2) one of the colliding pages is woken
 *
 * This is the thundering herd problem, but it is expected to
 * be very rare due to the few pages that are actually being
 * waited on at any given time and the quality of the hash function.
 */
void ___wait_on_page(struct page *page)
{
	wait_queue_head_t *waitqueue = page_waitqueue(page);
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue(waitqueue, &wait);
	do {
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (!PageLocked(page))
			break;
		sync_page(page);
		schedule();
	} while (PageLocked(page));
	__set_task_state(tsk, TASK_RUNNING);
	remove_wait_queue(waitqueue, &wait);
}

/*
 * unlock_page() is the other half of the story just above
 * ___wait_on_page(). Here a couple of quick checks are done
 * and a couple of flags are set on the page, and then all
 * of the waiters for all of the pages in the appropriate
 * wait queue are woken.
 */
void unlock_page(struct page *page)
{
	wait_queue_head_t *waitqueue = page_waitqueue(page);
	ClearPageLaunder(page);
	smp_mb__before_clear_bit();
	if (!test_and_clear_bit(PG_locked, &(page)->flags))
		BUG();
	smp_mb__after_clear_bit();

	/*
	 * Although the default semantics of wake_up() are
	 * to wake all, here the specific function is used
	 * to make it even more explicit that a number of
	 * pages are being waited on here.
	 */
	if (waitqueue_active(waitqueue))
		wake_up_all(waitqueue);
}

/*
 * Get a lock on the page, assuming we need to sleep
 * to get it..
 */
static void __lock_page(struct page *page)
{
	wait_queue_head_t *waitqueue = page_waitqueue(page);
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	add_wait_queue_exclusive(waitqueue, &wait);
	for (;;) {
		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
		if (PageLocked(page)) {
			sync_page(page);
			schedule();
		}
		if (!TryLockPage(page))
			break;
	}
	__set_task_state(tsk, TASK_RUNNING);
	remove_wait_queue(waitqueue, &wait);
}

/*
 * Get an exclusive lock on the page, optimistically
 * assuming it's not locked..
 */
void lock_page(struct page *page)
{
	if (TryLockPage(page))
		__lock_page(page);
}

/*
 * a rather lightweight function, finding and getting a reference to a
 * hashed page atomically.
 */
struct page * __find_get_page(struct address_space *mapping,
			      unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	if (page)
		page_cache_get(page);
	spin_unlock(&pagecache_lock);
	return page;
}

/*
 * Same as above, but trylock it instead of incrementing the count.
 */
struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
{
	struct page *page;
	struct page **hash = page_hash(mapping, offset);

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);
	if (page) {
		if (TryLockPage(page))
			page = NULL;
	}
	spin_unlock(&pagecache_lock);
	return page;
}

/*
 * Must be called with the pagecache lock held,
 * will return with it held (but it may be dropped
 * during blocking operations)..
 */
static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
static struct page * __find_lock_page_helper(struct address_space *mapping,
					unsigned long offset, struct page *hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	page = __find_page_nolock(mapping, offset, hash);
	if (page) {
		page_cache_get(page);
		if (TryLockPage(page)) {
			spin_unlock(&pagecache_lock);
			lock_page(page);
			spin_lock(&pagecache_lock);

			/* Has the page been re-allocated while we slept? */
			if (page->mapping != mapping || page->index != offset) {
				UnlockPage(page);
				page_cache_release(page);
				goto repeat;
			}
		}
	}
	return page;
}

/*
 * Same as the above, but lock the page too, verifying that
 * it's still valid once we own it.
 */
struct page * __find_lock_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	spin_lock(&pagecache_lock);
	page = __find_lock_page_helper(mapping, offset, *hash);
	spin_unlock(&pagecache_lock);
	return page;
}

/*
 * Same as above, but create the page if required..
 */
struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
{
	struct page *page;
	struct page **hash = page_hash(mapping, index);

	spin_lock(&pagecache_lock);
	page = __find_lock_page_helper(mapping, index, *hash);
	spin_unlock(&pagecache_lock);
	if (!page) {
		struct page *newpage = alloc_page(gfp_mask);
		if (newpage) {
			spin_lock(&pagecache_lock);
			page = __find_lock_page_helper(mapping, index, *hash);
			if (likely(!page)) {
				page = newpage;
				__add_to_page_cache(page, mapping, index, hash);
				newpage = NULL;
			}
			spin_unlock(&pagecache_lock);
			if (newpage == NULL)
				lru_cache_add(page);
			else
				page_cache_release(newpage);
		}
	}
	return page;
}

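/*
 * Illustrative caller of find_or_create_page() (a sketch only): the
 * page comes back locked and with an extra reference, so the typical
 * sequence is
 *
 *	page = find_or_create_page(mapping, index, GFP_KERNEL);
 *	if (page) {
 *		... fill in or examine the page ...
 *		UnlockPage(page);
 *		page_cache_release(page);
 *	}
 */
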
/*
 * Same as grab_cache_page, but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 */
struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
	struct page *page, **hash;

	hash = page_hash(mapping, index);
	page = __find_get_page(mapping, index, hash);

	if ( page ) {
		if ( !TryLockPage(page) ) {
			/* Page found and locked */
			/* This test is overly paranoid, but what the heck... */
			if ( unlikely(page->mapping != mapping || page->index != index) ) {
				/* Someone reallocated this page under us. */
				UnlockPage(page);
				page_cache_release(page);
				return NULL;
			} else {
				return page;
			}
		} else {
			/* Page locked by someone else */
			page_cache_release(page);
			return NULL;
		}
	}

	page = page_cache_alloc(mapping);
	if ( unlikely(!page) )
		return NULL;	/* Failed to allocate a page */

	if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
		/* Someone else grabbed the page already. */
		page_cache_release(page);
		return NULL;
	}

	return page;
}


/*
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
 * to the syslog.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax += filp->f_ramax;
	total_ralen += filp->f_ralen;
	total_rawin += filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

		total_reada = 0;
		total_async = 0;
		total_ramax = 0;
		total_ralen = 0;
		total_rawin = 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */

/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
 * - f_raend : position of the first byte after the last page we tried to
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *             if last read-ahead was synchronous then
 *                 f_rawin = f_ralen
 *             otherwise (was asynchronous)
 *                 f_rawin = previous value of f_ralen + f_ralen
 *
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
 * Using a reasonable IO xfer length from peripheral devices increases system
 * performance.
 * Reasonable means, in this context, not too large but not too small.
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
 *	and 32K if defined (4K page size assumed).
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping the next read request with user process execution increases
 * system performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system
 * performance.
 * However, we know that files are often accessed sequentially by
 * application programs and it seems that it is possible to have some good
 * strategy in that guessing.
 * We only try to read-ahead files that seem to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read
 * request from the device, as soon as possible.
 * We must be very careful about:
 * - The number of effective pending IO read requests.
 *   ONE seems to be the only reasonable value.
 * - The total memory pool usage for the file access stream.
 *   This maximum memory usage is implicitly 2 IO read chunks:
 *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 *   64k if defined (4K page size assumed).
 */

static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return vm_max_readahead;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}

static void generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	struct page * page)
{
	unsigned long end_index;
	unsigned long index = page->index;
	unsigned long max_ahead, ahead;
	unsigned long raend;
	int max_readahead = get_max_readahead(inode);

	end_index = inode->i_size >> PAGE_CACHE_SHIFT;

	raend = filp->f_raend;
	max_ahead = 0;

/*
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide whether or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this
 * page only.
 */
	if (PageLocked(page)) {
		if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
			raend = index;
			if (raend < end_index)
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = 1;
			if (!max_ahead) {
				filp->f_raend = index + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
		}
	}
/*
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 * it is the moment to try to read ahead asynchronously.
 * We will later force an unplug of the device in order to force asynchronous read IO.
 */
	else if (reada_ok && filp->f_ramax && raend >= 1 &&
		 index <= raend && index + filp->f_ralen >= raend) {
/*
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 * Compute the position of the last page we have tried to read in order to
 * begin to read ahead just at the next page.
 */
		raend -= 1;
		if (raend < end_index)
			max_ahead = filp->f_ramax + 1;

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
			reada_ok = 2;
		}
	}
/*
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
 * scheduler will work well enough for us to avoid overly bad actual IO requests.
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead ++;
		if ((raend + ahead) >= end_index)
			break;
		if (page_cache_read(filp, raend + ahead) < 0)
			break;
	}
/*
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force an unplug of the device in order to start an asynchronous
 *   read IO request.
 * Update the read-ahead context.
 * Store the length of the current read-ahead window.
 * Double the current max read ahead size.
 *   That heuristic avoids doing large IO for files that are not really
 *   accessed sequentially.
 */
	if (ahead) {
		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
		filp->f_raend = raend + ahead + 1;

		filp->f_ramax += filp->f_ramax;

		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;

#ifdef PROFILE_READAHEAD
		profile_readahead((reada_ok == 2), filp);
#endif
	}

	return;
}

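/*
 * Illustrative sequence (a sketch, not an exact trace): for a file that
 * is read strictly sequentially, the first fault on a not-yet-read page
 * takes the "current page is locked" branch above and reads f_ramax
 * pages ahead synchronously; as the reader advances back into that
 * window, the "current page is not locked" branch issues the next
 * window asynchronously and f_ramax doubles each time, capped at
 * get_max_readahead().  A seek outside the window (detected in
 * do_generic_file_read() below) resets the whole context and the
 * ramp-up starts over.
 */
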
/*
 * Mark a page as having seen activity.
 *
 * If it was already so marked, move it to the active queue and drop
 * the referenced bit.  Otherwise, just mark it for future action..
 */
void mark_page_accessed(struct page *page)
{
	if (!PageActive(page) && PageReferenced(page)) {
		activate_page(page);
		ClearPageReferenced(page);
	} else
		SetPageReferenced(page);
}

/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
	struct inode *inode = mapping->host;
	unsigned long index, offset;
	struct page *cached_page;
	int reada_ok;
	int error;
	int max_readahead = get_max_readahead(inode);

	cached_page = NULL;
	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stays within the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < vm_min_readahead)
			filp->f_ramax = vm_min_readahead;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;
		unsigned long end_index, nr, ret;

		end_index = inode->i_size >> PAGE_CACHE_SHIFT;

		if (index > end_index)
			break;
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {
			nr = inode->i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)
				break;
		}

		nr = nr - offset;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(mapping, index);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(mapping, index, *hash);
		if (!page)
			goto no_cached_page;
found_page:
		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date;
		generic_file_readahead(reada_ok, filp, inode, page);
page_ok:
		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping->i_mmap_shared != NULL)
			flush_dcache_page(page);

		/*
		 * Mark the page accessed if we read the
		 * beginning or we just did an lseek.
		 */
		if (!offset || !filp->f_reada)
			mark_page_accessed(page);

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (ret == nr && desc->count)
			continue;
		break;

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
		generic_file_readahead(reada_ok, filp, inode, page);

		if (Page_Uptodate(page))
			goto page_ok;

		/* Get exclusive access to the page ... */
		lock_page(page);

		/* Did it get unhashed before we got the lock? */
		if (!page->mapping) {
			UnlockPage(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (Page_Uptodate(page)) {
			UnlockPage(page);
			goto page_ok;
		}

readpage:
		/* ... and start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (!error) {
			if (Page_Uptodate(page))
				goto page_ok;

			/* Again, try some read-ahead while waiting for the page to finish.. */
			generic_file_readahead(reada_ok, filp, inode, page);
			wait_on_page(page);
			if (Page_Uptodate(page))
				goto page_ok;
			error = -EIO;
		}

		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		page_cache_release(page);
		break;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 *
		 * We get here with the page cache lock held.
		 */
		if (!cached_page) {
			spin_unlock(&pagecache_lock);
			cached_page = page_cache_alloc(mapping);
			if (!cached_page) {
				desc->error = -ENOMEM;
				break;
			}

			/*
			 * Somebody may have added the page while we
			 * dropped the page cache lock. Check for that.
			 */
			spin_lock(&pagecache_lock);
			page = __find_page_nolock(mapping, index, *hash);
			if (page)
				goto found_page;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = cached_page;
		__add_to_page_cache(page, mapping, index, hash);
		spin_unlock(&pagecache_lock);
		lru_cache_add(page);
		cached_page = NULL;

		goto readpage;
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	filp->f_reada = 1;
	if (cached_page)
		page_cache_release(cached_page);
	UPDATE_ATIME(inode);
}

static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
{
	ssize_t retval;
	int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
	struct kiobuf * iobuf;
	struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
	struct inode * inode = mapping->host;
	loff_t size = inode->i_size;

	new_iobuf = 0;
	iobuf = filp->f_iobuf;
	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
		/*
		 * A parallel read/write is using the preallocated iobuf
		 * so just run slow and allocate a new one.
		 */
		retval = alloc_kiovec(1, &iobuf);
		if (retval)
			goto out;
		new_iobuf = 1;
	}

	blocksize = 1 << inode->i_blkbits;
	blocksize_bits = inode->i_blkbits;
	blocksize_mask = blocksize - 1;
	chunk_size = KIO_MAX_ATOMIC_IO << 10;

	retval = -EINVAL;
	if ((offset & blocksize_mask) || (count & blocksize_mask))
		goto out_free;
	if (!mapping->a_ops->direct_IO)
		goto out_free;

	if ((rw == READ) && (offset + count > size))
		count = size - offset;

	/*
	 * Flush to disk exclusively the _data_, metadata must remain
	 * completely asynchronous or performance will go to /dev/null.
	 */
	retval = filemap_fdatasync(mapping);
	if (retval == 0)
		retval = fsync_inode_data_buffers(inode);
	if (retval == 0)
		retval = filemap_fdatawait(mapping);
	if (retval < 0)
		goto out_free;

	progress = retval = 0;
	while (count > 0) {
		iosize = count;
		if (iosize > chunk_size)
			iosize = chunk_size;

		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
		if (retval)
			break;

		retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);

		if (rw == READ && retval > 0)
			mark_dirty_kiobuf(iobuf, retval);

		if (retval >= 0) {
			count -= retval;
			buf += retval;
			/* warning: weird semantics here, we're reporting a read behind the end of the file */
			progress += retval;
		}

		unmap_kiobuf(iobuf);

		if (retval != iosize)
			break;
	}

	if (progress)
		retval = progress;

 out_free:
	if (!new_iobuf)
		clear_bit(0, &filp->f_iobuf_lock);
	else
		free_kiovec(1, &iobuf);
 out:
	return retval;
}

int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
	char *kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;

	kaddr = kmap(page);
	left = __copy_to_user(desc->buf, kaddr + offset, size);
	kunmap(page);

	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}

/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	if ((ssize_t) count < 0)
		return -EINVAL;

	if (filp->f_flags & O_DIRECT)
		goto o_direct;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;

		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
 out:
	return retval;

 o_direct:
	{
		loff_t pos = *ppos, size;
		struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
		struct inode *inode = mapping->host;

		retval = 0;
		if (!count)
			goto out; /* skip atime */
		size = inode->i_size;
		if (pos < size) {
			retval = generic_file_direct_IO(READ, filp, buf, count, pos);
			if (retval > 0)
				*ppos = pos + retval;
		}
		UPDATE_ATIME(filp->f_dentry->d_inode);
		goto out;
	}
}

static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
{
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;

	if (size > count)
		size = count;

	if (file->f_op->sendpage) {
		written = file->f_op->sendpage(file, page, offset,
					       size, &file->f_pos, size<count);
	} else {
		char *kaddr;
		mm_segment_t old_fs;

		old_fs = get_fs();
		set_fs(KERNEL_DS);

		kaddr = kmap(page);
		written = file->f_op->write(file, kaddr + offset, size,
					    &file->f_pos);
		kunmap(page);

		set_fs(old_fs);
	}
	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}

asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_mapping->a_ops->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;
		loff_t pos = 0, *ppos;

		retval = -EFAULT;
		ppos = &in_file->f_pos;
		if (offset) {
			if (get_user(pos, offset))
				goto fput_out;
			ppos = &pos;
		}

		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, ppos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
		if (offset)
			put_user(pos, offset);
	}

fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	return retval;
}

static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
{
	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
	unsigned long max;

	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
		return -EINVAL;

	/* Limit it to the size of the file.. */
	max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
	if (index > max)
		return 0;
	max -= index;
	if (nr > max)
		nr = max;

	/* And limit it to a sane percentage of the inactive list.. */
	max = nr_inactive_pages / 2;
	if (nr > max)
		nr = max;

	while (nr) {
		page_cache_read(file, index);
		index++;
		nr--;
	}
	return 0;
}

asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct file *file;

	ret = -EBADF;
	file = fget(fd);
	if (file) {
		if (file->f_mode & FMODE_READ) {
			unsigned long start = offset >> PAGE_CACHE_SHIFT;
			unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
			ret = do_readahead(file, start, len);
		}
		fput(file);
	}
	return ret;
}

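/*
 * Illustrative userspace view of the two system calls above (a sketch,
 * with out_fd/in_fd/fd and nbytes standing in for real values):
 *
 *	off_t pos = 0;
 *	sendfile(out_fd, in_fd, &pos, nbytes);	copies through the page cache
 *	readahead(fd, 0, nbytes);		only populates the page cache
 */
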
/*
 * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
 * sure this is sequential access, we don't need a flexible read-ahead
 * window size -- we can always use a large fixed size window.
 */
static void nopage_sequential_readahead(struct vm_area_struct * vma,
	unsigned long pgoff, unsigned long filesize)
{
	unsigned long ra_window;

	ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
	ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);

	/* vm_raend is zero if we haven't read ahead in this area yet.  */
	if (vma->vm_raend == 0)
		vma->vm_raend = vma->vm_pgoff + ra_window;

	/*
	 * If we've just faulted the page half-way through our window,
	 * then schedule reads for the next window, and release the
	 * pages in the previous window.
	 */
	if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
		unsigned long start = vma->vm_pgoff + vma->vm_raend;
		unsigned long end = start + ra_window;

		if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
			end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
		if (start > end)
			return;

		while ((start < end) && (start < filesize)) {
			if (read_cluster_nonblocking(vma->vm_file,
							start, filesize) < 0)
				break;
			start += CLUSTER_PAGES;
		}
		run_task_queue(&tq_disk);

		/* if we're far enough past the beginning of this area,
		   recycle pages that are in the previous window. */
		if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
			unsigned long window = ra_window << PAGE_SHIFT;

			end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
			end -= window + window;
			filemap_sync(vma, end - window, window, MS_INVALIDATE);
		}

		vma->vm_raend += ra_window;
	}

	return;
}

/*
 * filemap_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
{
	int error;
	struct file *file = area->vm_file;
	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
	struct inode *inode = mapping->host;
	struct page *page, **hash;
	unsigned long size, pgoff, endoff;

	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
	endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;

retry_all:
	/*
	 * An external ptracer can access pages that normally aren't
	 * accessible..
	 */
	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if ((pgoff >= size) && (area->vm_mm == current->mm))
		return NULL;

	/* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
	if (size > endoff)
		size = endoff;

	/*
	 * Do we have something in the page cache already?
	 */
	hash = page_hash(mapping, pgoff);
retry_find:
	page = __find_get_page(mapping, pgoff, hash);
	if (!page)
		goto no_cached_page;

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!Page_Uptodate(page))
		goto page_not_uptodate;

success:
	/*
	 * Try read-ahead for sequential areas.
	 */
	if (VM_SequentialReadHint(area))
		nopage_sequential_readahead(area, pgoff, size);

	/*
	 * Found the page and have a reference on it, need to check sharing
	 * and possibly copy it over to another page..
	 */
	mark_page_accessed(page);
	flush_page_to_ram(page);
	return page;

no_cached_page:
	/*
	 * If the requested offset is within our file, try to read a whole
	 * cluster of pages at once.
	 *
	 * Otherwise, we're off the end of a privately mapped file,
	 * so we need to map a zero page.
	 */
	if ((pgoff < size) && !VM_RandomReadHint(area))
		error = read_cluster_nonblocking(file, pgoff, size);
	else
		error = page_cache_read(file, pgoff);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	if (error == -ENOMEM)
		return NOPAGE_OOM;
	return NULL;

page_not_uptodate:
	lock_page(page);

	/* Did it get unhashed while we waited for it? */
	if (!page->mapping) {
		UnlockPage(page);
		page_cache_release(page);
		goto retry_all;
	}

	/* Did somebody else get it up-to-date? */
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}

	if (!mapping->a_ops->readpage(file, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	lock_page(page);

	/* Somebody truncated the page on us? */
	if (!page->mapping) {
		UnlockPage(page);
		page_cache_release(page);
		goto retry_all;
	}

	/* Somebody else successfully read it in? */
	if (Page_Uptodate(page)) {
		UnlockPage(page);
		goto success;
	}
	ClearPageError(page);
	if (!mapping->a_ops->readpage(file, page)) {
		wait_on_page(page);
		if (Page_Uptodate(page))
			goto success;
	}

	/*
	 * Things didn't work out. Return zero to tell the
	 * mm layer so, possibly freeing the page cache page first.
	 */
	page_cache_release(page);
	return NULL;
}

/* Called with mm->page_table_lock held to protect against other
 * threads/the swapper from ripping pte's out from under us.
2070 */ 2071static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma, 2072 unsigned long address, unsigned int flags) 2073{ 2074 pte_t pte = *ptep; 2075 2076 if (pte_present(pte)) { 2077 struct page *page = pte_page(pte); 2078 if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) { 2079 flush_tlb_page(vma, address); 2080 set_page_dirty(page); 2081 } 2082 } 2083 return 0; 2084} 2085 2086static inline int filemap_sync_pte_range(pmd_t * pmd, 2087 unsigned long address, unsigned long size, 2088 struct vm_area_struct *vma, unsigned long offset, unsigned int flags) 2089{ 2090 pte_t * pte; 2091 unsigned long end; 2092 int error; 2093 2094 if (pmd_none(*pmd)) 2095 return 0; 2096 if (pmd_bad(*pmd)) { 2097 pmd_ERROR(*pmd); 2098 pmd_clear(pmd); 2099 return 0; 2100 } 2101 pte = pte_offset(pmd, address); 2102 offset += address & PMD_MASK; 2103 address &= ~PMD_MASK; 2104 end = address + size; 2105 if (end > PMD_SIZE) 2106 end = PMD_SIZE; 2107 error = 0; 2108 do { 2109 error |= filemap_sync_pte(pte, vma, address + offset, flags); 2110 address += PAGE_SIZE; 2111 pte++; 2112 } while (address && (address < end)); 2113 return error; 2114} 2115 2116static inline int filemap_sync_pmd_range(pgd_t * pgd, 2117 unsigned long address, unsigned long size, 2118 struct vm_area_struct *vma, unsigned int flags) 2119{ 2120 pmd_t * pmd; 2121 unsigned long offset, end; 2122 int error; 2123 2124 if (pgd_none(*pgd)) 2125 return 0; 2126 if (pgd_bad(*pgd)) { 2127 pgd_ERROR(*pgd); 2128 pgd_clear(pgd); 2129 return 0; 2130 } 2131 pmd = pmd_offset(pgd, address); 2132 offset = address & PGDIR_MASK; 2133 address &= ~PGDIR_MASK; 2134 end = address + size; 2135 if (end > PGDIR_SIZE) 2136 end = PGDIR_SIZE; 2137 error = 0; 2138 do { 2139 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags); 2140 address = (address + PMD_SIZE) & PMD_MASK; 2141 pmd++; 2142 } while (address && (address < end)); 2143 return error; 2144} 2145 2146int filemap_sync(struct vm_area_struct * vma, unsigned long address, 2147 size_t size, unsigned int flags) 2148{ 2149 pgd_t * dir; 2150 unsigned long end = address + size; 2151 int error = 0; 2152 2153 /* Aquire the lock early; it may be possible to avoid dropping 2154 * and reaquiring it repeatedly. 2155 */ 2156 spin_lock(&vma->vm_mm->page_table_lock); 2157 2158 dir = pgd_offset(vma->vm_mm, address); 2159 flush_cache_range(vma->vm_mm, end - size, end); 2160 if (address >= end) 2161 BUG(); 2162 do { 2163 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags); 2164 address = (address + PGDIR_SIZE) & PGDIR_MASK; 2165 dir++; 2166 } while (address && (address < end)); 2167 flush_tlb_range(vma->vm_mm, end - size, end); 2168 2169 spin_unlock(&vma->vm_mm->page_table_lock); 2170 2171 return error; 2172} 2173 2174static struct vm_operations_struct generic_file_vm_ops = { 2175 nopage: filemap_nopage, 2176}; 2177 2178/* This is used for a general mmap of a disk file */ 2179 2180int generic_file_mmap(struct file * file, struct vm_area_struct * vma) 2181{ 2182 struct address_space *mapping = file->f_dentry->d_inode->i_mapping; 2183 struct inode *inode = mapping->host; 2184 2185 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { 2186 if (!mapping->a_ops->writepage) 2187 return -EINVAL; 2188 } 2189 if (!mapping->a_ops->readpage) 2190 return -ENOEXEC; 2191 UPDATE_ATIME(inode); 2192 vma->vm_ops = &generic_file_vm_ops; 2193 return 0; 2194} 2195 2196/* 2197 * The msync() system call. 
2198 */ 2199 2200/* 2201 * MS_SYNC syncs the entire file - including mappings. 2202 * 2203 * MS_ASYNC initiates writeout of just the dirty mapped data. 2204 * This provides no guarantee of file integrity - things like indirect 2205 * blocks may not have started writeout. MS_ASYNC is primarily useful 2206 * where the application knows that it has finished with the data and 2207 * wishes to intelligently schedule its own I/O traffic. 2208 */ 2209static int msync_interval(struct vm_area_struct * vma, 2210 unsigned long start, unsigned long end, int flags) 2211{ 2212 int ret = 0; 2213 struct file * file = vma->vm_file; 2214 2215 if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) ) 2216 return -EBUSY; 2217 2218 if (file && (vma->vm_flags & VM_SHARED)) { 2219 ret = filemap_sync(vma, start, end-start, flags); 2220 2221 if (!ret && (flags & (MS_SYNC|MS_ASYNC))) { 2222 struct inode * inode = file->f_dentry->d_inode; 2223 2224 down(&inode->i_sem); 2225 ret = filemap_fdatasync(inode->i_mapping); 2226 if (flags & MS_SYNC) { 2227 int err; 2228 2229 if (file->f_op && file->f_op->fsync) { 2230 err = file->f_op->fsync(file, file->f_dentry, 1); 2231 if (err && !ret) 2232 ret = err; 2233 } 2234 err = filemap_fdatawait(inode->i_mapping); 2235 if (err && !ret) 2236 ret = err; 2237 } 2238 up(&inode->i_sem); 2239 } 2240 } 2241 return ret; 2242} 2243 2244asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 2245{ 2246 unsigned long end; 2247 struct vm_area_struct * vma; 2248 int unmapped_error, error = -EINVAL; 2249 2250 down_read(¤t->mm->mmap_sem); 2251 if (start & ~PAGE_MASK) 2252 goto out; 2253 len = (len + ~PAGE_MASK) & PAGE_MASK; 2254 end = start + len; 2255 if (end < start) 2256 goto out; 2257 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) 2258 goto out; 2259 if ((flags & MS_ASYNC) && (flags & MS_SYNC)) 2260 goto out; 2261 2262 error = 0; 2263 if (end == start) 2264 goto out; 2265 /* 2266 * If the interval [start,end) covers some unmapped address ranges, 2267 * just ignore them, but return -ENOMEM at the end. 2268 */ 2269 vma = find_vma(current->mm, start); 2270 unmapped_error = 0; 2271 for (;;) { 2272 /* Still start < end. */ 2273 error = -ENOMEM; 2274 if (!vma) 2275 goto out; 2276 /* Here start < vma->vm_end. */ 2277 if (start < vma->vm_start) { 2278 unmapped_error = -ENOMEM; 2279 start = vma->vm_start; 2280 } 2281 /* Here vma->vm_start <= start < vma->vm_end. */ 2282 if (end <= vma->vm_end) { 2283 if (start < end) { 2284 error = msync_interval(vma, start, end, flags); 2285 if (error) 2286 goto out; 2287 } 2288 error = unmapped_error; 2289 goto out; 2290 } 2291 /* Here vma->vm_start <= start < vma->vm_end < end. 
 */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	up_read(&current->mm->mmap_sem);
	return error;
}

static inline void setup_read_behavior(struct vm_area_struct * vma,
	int behavior)
{
	VM_ClearReadHint(vma);
	switch(behavior) {
		case MADV_SEQUENTIAL:
			vma->vm_flags |= VM_SEQ_READ;
			break;
		case MADV_RANDOM:
			vma->vm_flags |= VM_RAND_READ;
			break;
		default:
			break;
	}
	return;
}

static long madvise_fixup_start(struct vm_area_struct * vma,
	unsigned long end, int behavior)
{
	struct vm_area_struct * n;
	struct mm_struct * mm = vma->vm_mm;

	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!n)
		return -EAGAIN;
	*n = *vma;
	n->vm_end = end;
	setup_read_behavior(n, behavior);
	n->vm_raend = 0;
	if (n->vm_file)
		get_file(n->vm_file);
	if (n->vm_ops && n->vm_ops->open)
		n->vm_ops->open(n);
	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
	lock_vma_mappings(vma);
	spin_lock(&mm->page_table_lock);
	vma->vm_start = end;
	__insert_vm_struct(mm, n);
	spin_unlock(&mm->page_table_lock);
	unlock_vma_mappings(vma);
	return 0;
}

static long madvise_fixup_end(struct vm_area_struct * vma,
	unsigned long start, int behavior)
{
	struct vm_area_struct * n;
	struct mm_struct * mm = vma->vm_mm;

	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!n)
		return -EAGAIN;
	*n = *vma;
	n->vm_start = start;
	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
	setup_read_behavior(n, behavior);
	n->vm_raend = 0;
	if (n->vm_file)
		get_file(n->vm_file);
	if (n->vm_ops && n->vm_ops->open)
		n->vm_ops->open(n);
	lock_vma_mappings(vma);
	spin_lock(&mm->page_table_lock);
	vma->vm_end = start;
	__insert_vm_struct(mm, n);
	spin_unlock(&mm->page_table_lock);
	unlock_vma_mappings(vma);
	return 0;
}

static long madvise_fixup_middle(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int behavior)
{
	struct vm_area_struct * left, * right;
	struct mm_struct * mm = vma->vm_mm;

	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!left)
		return -EAGAIN;
	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!right) {
		kmem_cache_free(vm_area_cachep, left);
		return -EAGAIN;
	}
	*left = *vma;
	*right = *vma;
	left->vm_end = start;
	right->vm_start = end;
	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
	left->vm_raend = 0;
	right->vm_raend = 0;
	if (vma->vm_file)
		atomic_add(2, &vma->vm_file->f_count);

	if (vma->vm_ops && vma->vm_ops->open) {
		vma->vm_ops->open(left);
		vma->vm_ops->open(right);
	}
	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
	vma->vm_raend = 0;
	lock_vma_mappings(vma);
	spin_lock(&mm->page_table_lock);
	vma->vm_start = start;
	vma->vm_end = end;
	setup_read_behavior(vma, behavior);
	__insert_vm_struct(mm, left);
	__insert_vm_struct(mm, right);
	spin_unlock(&mm->page_table_lock);
	unlock_vma_mappings(vma);
	return 0;
}

/*
 * We can potentially split a vm area into separate
 * areas, each area with its own behavior.
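 *
 * As a worked illustration (addresses are made up): if a vma covers
 * [A,D) and madvise() is applied to [B,C) with A < B < C < D, then
 * madvise_fixup_middle() above leaves three vmas behind:
 *
 *	[A,B)	old behavior		(the new "left" vma)
 *	[B,C)	requested behavior	(the original vma, resized)
 *	[C,D)	old behavior		(the new "right" vma)
 *
 * If the range is flush with either end of the vma, only a two-way
 * split is needed (madvise_fixup_start/madvise_fixup_end).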
2419 */ 2420static long madvise_behavior(struct vm_area_struct * vma, 2421 unsigned long start, unsigned long end, int behavior) 2422{ 2423 int error = 0; 2424 2425 /* This caps the number of vma's this process can own */ 2426 if (vma->vm_mm->map_count > max_map_count) 2427 return -ENOMEM; 2428 2429 if (start == vma->vm_start) { 2430 if (end == vma->vm_end) { 2431 setup_read_behavior(vma, behavior); 2432 vma->vm_raend = 0; 2433 } else 2434 error = madvise_fixup_start(vma, end, behavior); 2435 } else { 2436 if (end == vma->vm_end) 2437 error = madvise_fixup_end(vma, start, behavior); 2438 else 2439 error = madvise_fixup_middle(vma, start, end, behavior); 2440 } 2441 2442 return error; 2443} 2444 2445/* 2446 * Schedule all required I/O operations, then run the disk queue 2447 * to make sure they are started. Do not wait for completion. 2448 */ 2449static long madvise_willneed(struct vm_area_struct * vma, 2450 unsigned long start, unsigned long end) 2451{ 2452 long error = -EBADF; 2453 struct file * file; 2454 unsigned long size, rlim_rss; 2455 2456 /* Doesn't work if there's no mapped file. */ 2457 if (!vma->vm_file) 2458 return error; 2459 file = vma->vm_file; 2460 size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >> 2461 PAGE_CACHE_SHIFT; 2462 2463 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2464 if (end > vma->vm_end) 2465 end = vma->vm_end; 2466 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2467 2468 /* Make sure this doesn't exceed the process's max rss. */ 2469 error = -EIO; 2470 rlim_rss = current->rlim ? current->rlim[RLIMIT_RSS].rlim_cur : 2471 LONG_MAX; /* default: see resource.h */ 2472 if ((vma->vm_mm->rss + (end - start)) > rlim_rss) 2473 return error; 2474 2475 /* round to cluster boundaries if this isn't a "random" area. */ 2476 if (!VM_RandomReadHint(vma)) { 2477 start = CLUSTER_OFFSET(start); 2478 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1); 2479 2480 while ((start < end) && (start < size)) { 2481 error = read_cluster_nonblocking(file, start, size); 2482 start += CLUSTER_PAGES; 2483 if (error < 0) 2484 break; 2485 } 2486 } else { 2487 while ((start < end) && (start < size)) { 2488 error = page_cache_read(file, start); 2489 start++; 2490 if (error < 0) 2491 break; 2492 } 2493 } 2494 2495 /* Don't wait for someone else to push these requests. */ 2496 run_task_queue(&tq_disk); 2497 2498 return error; 2499} 2500 2501/* 2502 * Application no longer needs these pages. If the pages are dirty, 2503 * it's OK to just throw them away. The app will be more careful about 2504 * data it wants to keep. Be sure to free swap resources too. The 2505 * zap_page_range call sets things up for refill_inactive to actually free 2506 * these pages later if no one else has touched them in the meantime, 2507 * although we could add these pages to a global reuse list for 2508 * refill_inactive to pick up before reclaiming other pages. 2509 * 2510 * NB: This interface discards data rather than pushes it out to swap, 2511 * as some implementations do. This has performance implications for 2512 * applications like large transactional databases which want to discard 2513 * pages in anonymous maps after committing to backing store the data 2514 * that was kept in them. There is no reason to write this data out to 2515 * the swap area if the application is discarding it. 2516 * 2517 * An interface that causes the system to free clean pages and flush 2518 * dirty pages is already available as msync(MS_INVALIDATE). 
2519 */ 2520static long madvise_dontneed(struct vm_area_struct * vma, 2521 unsigned long start, unsigned long end) 2522{ 2523 if (vma->vm_flags & VM_LOCKED) 2524 return -EINVAL; 2525 2526 zap_page_range(vma->vm_mm, start, end - start); 2527 return 0; 2528} 2529 2530static long madvise_vma(struct vm_area_struct * vma, unsigned long start, 2531 unsigned long end, int behavior) 2532{ 2533 long error = -EBADF; 2534 2535 switch (behavior) { 2536 case MADV_NORMAL: 2537 case MADV_SEQUENTIAL: 2538 case MADV_RANDOM: 2539 error = madvise_behavior(vma, start, end, behavior); 2540 break; 2541 2542 case MADV_WILLNEED: 2543 error = madvise_willneed(vma, start, end); 2544 break; 2545 2546 case MADV_DONTNEED: 2547 error = madvise_dontneed(vma, start, end); 2548 break; 2549 2550 default: 2551 error = -EINVAL; 2552 break; 2553 } 2554 2555 return error; 2556} 2557 2558/* 2559 * The madvise(2) system call. 2560 * 2561 * Applications can use madvise() to advise the kernel how it should 2562 * handle paging I/O in this VM area. The idea is to help the kernel 2563 * use appropriate read-ahead and caching techniques. The information 2564 * provided is advisory only, and can be safely disregarded by the 2565 * kernel without affecting the correct operation of the application. 2566 * 2567 * behavior values: 2568 * MADV_NORMAL - the default behavior is to read clusters. This 2569 * results in some read-ahead and read-behind. 2570 * MADV_RANDOM - the system should read the minimum amount of data 2571 * on any access, since it is unlikely that the appli- 2572 * cation will need more than what it asks for. 2573 * MADV_SEQUENTIAL - pages in the given range will probably be accessed 2574 * once, so they can be aggressively read ahead, and 2575 * can be freed soon after they are accessed. 2576 * MADV_WILLNEED - the application is notifying the system to read 2577 * some pages ahead. 2578 * MADV_DONTNEED - the application is finished with the given range, 2579 * so the kernel can free resources associated with it. 2580 * 2581 * return values: 2582 * zero - success 2583 * -EINVAL - start + len < 0, start is not page-aligned, 2584 * "behavior" is not a valid value, or application 2585 * is attempting to release locked or shared pages. 2586 * -ENOMEM - addresses in the specified range are not currently 2587 * mapped, or are outside the AS of the process. 2588 * -EIO - an I/O error occurred while paging in data. 2589 * -EBADF - map exists, but area maps something that isn't a file. 2590 * -EAGAIN - a kernel resource was temporarily unavailable. 2591 */ 2592asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior) 2593{ 2594 unsigned long end; 2595 struct vm_area_struct * vma; 2596 int unmapped_error = 0; 2597 int error = -EINVAL; 2598 2599 down_write(¤t->mm->mmap_sem); 2600 2601 if (start & ~PAGE_MASK) 2602 goto out; 2603 len = (len + ~PAGE_MASK) & PAGE_MASK; 2604 end = start + len; 2605 if (end < start) 2606 goto out; 2607 2608 error = 0; 2609 if (end == start) 2610 goto out; 2611 2612 /* 2613 * If the interval [start,end) covers some unmapped address 2614 * ranges, just ignore them, but return -ENOMEM at the end. 2615 */ 2616 vma = find_vma(current->mm, start); 2617 for (;;) { 2618 /* Still start < end. */ 2619 error = -ENOMEM; 2620 if (!vma) 2621 goto out; 2622 2623 /* Here start < vma->vm_end. */ 2624 if (start < vma->vm_start) { 2625 unmapped_error = -ENOMEM; 2626 start = vma->vm_start; 2627 } 2628 2629 /* Here vma->vm_start <= start < vma->vm_end. 
 */
		if (end <= vma->vm_end) {
			if (start < end) {
				error = madvise_vma(vma, start, end,
							behavior);
				if (error)
					goto out;
			}
			error = unmapped_error;
			goto out;
		}

		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = madvise_vma(vma, start, vma->vm_end, behavior);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}

out:
	up_write(&current->mm->mmap_sem);
	return error;
}

/*
 * Later we can get more picky about what "in core" means precisely.
 * For now, simply check to see if the page is in the page cache,
 * and is up to date; i.e. that no page-in operation would be required
 * at this time if an application were to map and access this page.
 */
static unsigned char mincore_page(struct vm_area_struct * vma,
	unsigned long pgoff)
{
	unsigned char present = 0;
	struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
	struct page * page, ** hash = page_hash(as, pgoff);

	spin_lock(&pagecache_lock);
	page = __find_page_nolock(as, pgoff, *hash);
	if ((page) && (Page_Uptodate(page)))
		present = 1;
	spin_unlock(&pagecache_lock);

	return present;
}

static long mincore_vma(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, unsigned char * vec)
{
	long error, i, remaining;
	unsigned char * tmp;

	error = -ENOMEM;
	if (!vma->vm_file)
		return error;

	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	if (end > vma->vm_end)
		end = vma->vm_end;
	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	error = -EAGAIN;
	tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
	if (!tmp)
		return error;

	/* (end - start) is # of pages, and also # of bytes in "vec" */
	remaining = (end - start);

	error = 0;
	for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
		int j = 0;
		long thispiece = (remaining < PAGE_SIZE) ?
						remaining : PAGE_SIZE;

		while (j < thispiece)
			tmp[j++] = mincore_page(vma, start++);

		if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
			error = -EFAULT;
			break;
		}
	}

	free_page((unsigned long) tmp);
	return error;
}

/*
 * The mincore(2) system call.
 *
 * mincore() returns the memory residency status of the pages in the
 * current process's address space specified by [addr, addr + len).
 * The status is returned in a vector of bytes.  The least significant
 * bit of each byte is 1 if the referenced page is in memory, otherwise
 * it is zero.
 *
 * Because the status of a page can change after mincore() checks it
 * but before it returns to the application, the returned vector may
 * contain stale information.  Only locked pages are guaranteed to
 * remain in memory.
 *
 * return values:
 *  zero    - success
 *  -EFAULT - vec points to an illegal address
 *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
 *		or len has a nonpositive value
 *  -ENOMEM - Addresses in the range [addr, addr + len] are
 *		invalid for the address space of this process, or
 *		specify one or more pages which are not currently
 *		mapped
 *  -EAGAIN - A kernel resource was temporarily unavailable.
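 *
 * For illustration only (userspace sketch; addr, len and page_size are
 * hypothetical, with one vector byte needed per page):
 *
 *	unsigned char *vec = malloc((len + page_size - 1) / page_size);
 *	if (mincore(addr, len, vec) == 0 && (vec[0] & 1))
 *		printf("first page is resident\n");
 *	free(vec);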
2742 */ 2743asmlinkage long sys_mincore(unsigned long start, size_t len, 2744 unsigned char * vec) 2745{ 2746 int index = 0; 2747 unsigned long end; 2748 struct vm_area_struct * vma; 2749 int unmapped_error = 0; 2750 long error = -EINVAL; 2751 2752 down_read(¤t->mm->mmap_sem); 2753 2754 if (start & ~PAGE_CACHE_MASK) 2755 goto out; 2756 len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK; 2757 end = start + len; 2758 if (end < start) 2759 goto out; 2760 2761 error = 0; 2762 if (end == start) 2763 goto out; 2764 2765 /* 2766 * If the interval [start,end) covers some unmapped address 2767 * ranges, just ignore them, but return -ENOMEM at the end. 2768 */ 2769 vma = find_vma(current->mm, start); 2770 for (;;) { 2771 /* Still start < end. */ 2772 error = -ENOMEM; 2773 if (!vma) 2774 goto out; 2775 2776 /* Here start < vma->vm_end. */ 2777 if (start < vma->vm_start) { 2778 unmapped_error = -ENOMEM; 2779 start = vma->vm_start; 2780 } 2781 2782 /* Here vma->vm_start <= start < vma->vm_end. */ 2783 if (end <= vma->vm_end) { 2784 if (start < end) { 2785 error = mincore_vma(vma, start, end, 2786 &vec[index]); 2787 if (error) 2788 goto out; 2789 } 2790 error = unmapped_error; 2791 goto out; 2792 } 2793 2794 /* Here vma->vm_start <= start < vma->vm_end < end. */ 2795 error = mincore_vma(vma, start, vma->vm_end, &vec[index]); 2796 if (error) 2797 goto out; 2798 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT; 2799 start = vma->vm_end; 2800 vma = vma->vm_next; 2801 } 2802 2803out: 2804 up_read(¤t->mm->mmap_sem); 2805 return error; 2806} 2807 2808static inline 2809struct page *__read_cache_page(struct address_space *mapping, 2810 unsigned long index, 2811 int (*filler)(void *,struct page*), 2812 void *data) 2813{ 2814 struct page **hash = page_hash(mapping, index); 2815 struct page *page, *cached_page = NULL; 2816 int err; 2817repeat: 2818 page = __find_get_page(mapping, index, hash); 2819 if (!page) { 2820 if (!cached_page) { 2821 cached_page = page_cache_alloc(mapping); 2822 if (!cached_page) 2823 return ERR_PTR(-ENOMEM); 2824 } 2825 page = cached_page; 2826 if (add_to_page_cache_unique(page, mapping, index, hash)) 2827 goto repeat; 2828 cached_page = NULL; 2829 err = filler(data, page); 2830 if (err < 0) { 2831 page_cache_release(page); 2832 page = ERR_PTR(err); 2833 } 2834 } 2835 if (cached_page) 2836 page_cache_release(cached_page); 2837 return page; 2838} 2839 2840/* 2841 * Read into the page cache. If a page already exists, 2842 * and Page_Uptodate() is not set, try to fill the page. 
2843 */ 2844struct page *read_cache_page(struct address_space *mapping, 2845 unsigned long index, 2846 int (*filler)(void *,struct page*), 2847 void *data) 2848{ 2849 struct page *page; 2850 int err; 2851 2852retry: 2853 page = __read_cache_page(mapping, index, filler, data); 2854 if (IS_ERR(page)) 2855 goto out; 2856 mark_page_accessed(page); 2857 if (Page_Uptodate(page)) 2858 goto out; 2859 2860 lock_page(page); 2861 if (!page->mapping) { 2862 UnlockPage(page); 2863 page_cache_release(page); 2864 goto retry; 2865 } 2866 if (Page_Uptodate(page)) { 2867 UnlockPage(page); 2868 goto out; 2869 } 2870 err = filler(data, page); 2871 if (err < 0) { 2872 page_cache_release(page); 2873 page = ERR_PTR(err); 2874 } 2875 out: 2876 return page; 2877} 2878 2879static inline struct page * __grab_cache_page(struct address_space *mapping, 2880 unsigned long index, struct page **cached_page) 2881{ 2882 struct page *page, **hash = page_hash(mapping, index); 2883repeat: 2884 page = __find_lock_page(mapping, index, hash); 2885 if (!page) { 2886 if (!*cached_page) { 2887 *cached_page = page_cache_alloc(mapping); 2888 if (!*cached_page) 2889 return NULL; 2890 } 2891 page = *cached_page; 2892 if (add_to_page_cache_unique(page, mapping, index, hash)) 2893 goto repeat; 2894 *cached_page = NULL; 2895 } 2896 return page; 2897} 2898 2899inline void remove_suid(struct inode *inode) 2900{ 2901 unsigned int mode; 2902 2903 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */ 2904 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID; 2905 2906 /* was any of the uid bits set? */ 2907 mode &= inode->i_mode; 2908 if (mode && !capable(CAP_FSETID)) { 2909 inode->i_mode &= ~mode; 2910 mark_inode_dirty(inode); 2911 } 2912} 2913 2914/* 2915 * Write to a file through the page cache. 2916 * 2917 * We currently put everything into the page cache prior to writing it. 2918 * This is not a problem when writing full pages. With partial pages, 2919 * however, we first have to read the data into the cache, then 2920 * dirty the page, and finally schedule it for writing. Alternatively, we 2921 * could write-through just the portion of data that would go into that 2922 * page, but that would kill performance for applications that write data 2923 * line by line, and it's prone to race conditions. 2924 * 2925 * Note that this routine doesn't try to keep track of dirty pages. Each 2926 * file system has to do this all by itself, unfortunately. 2927 * okir@monad.swb.de 2928 */ 2929ssize_t 2930generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos) 2931{ 2932 struct address_space *mapping = file->f_dentry->d_inode->i_mapping; 2933 struct inode *inode = mapping->host; 2934 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; 2935 loff_t pos; 2936 struct page *page, *cached_page; 2937 ssize_t written; 2938 long status = 0; 2939 int err; 2940 unsigned bytes; 2941 2942 if ((ssize_t) count < 0) 2943 return -EINVAL; 2944 2945 if (!access_ok(VERIFY_READ, buf, count)) 2946 return -EFAULT; 2947 2948 cached_page = NULL; 2949 2950 down(&inode->i_sem); 2951 2952 pos = *ppos; 2953 err = -EINVAL; 2954 if (pos < 0) 2955 goto out; 2956 2957 err = file->f_error; 2958 if (err) { 2959 file->f_error = 0; 2960 goto out; 2961 } 2962 2963 written = 0; 2964 2965 if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) 2966 pos = inode->i_size; 2967 2968 /* 2969 * Check whether we've reached the file size limit. 
2970 */ 2971 err = -EFBIG; 2972 2973 if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) { 2974 if (pos >= limit) { 2975 send_sig(SIGXFSZ, current, 0); 2976 goto out; 2977 } 2978 if (pos > 0xFFFFFFFFULL || count > limit - (u32)pos) { 2979 /* send_sig(SIGXFSZ, current, 0); */ 2980 count = limit - (u32)pos; 2981 } 2982 } 2983 2984 /* 2985 * LFS rule 2986 */ 2987 if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { 2988 if (pos >= MAX_NON_LFS) { 2989 send_sig(SIGXFSZ, current, 0); 2990 goto out; 2991 } 2992 if (count > MAX_NON_LFS - (u32)pos) { 2993 /* send_sig(SIGXFSZ, current, 0); */ 2994 count = MAX_NON_LFS - (u32)pos; 2995 } 2996 } 2997 2998 /* 2999 * Are we about to exceed the fs block limit ? 3000 * 3001 * If we have written data it becomes a short write 3002 * If we have exceeded without writing data we send 3003 * a signal and give them an EFBIG. 3004 * 3005 * Linus frestrict idea will clean these up nicely.. 3006 */ 3007 3008 if (!S_ISBLK(inode->i_mode)) { 3009 if (pos >= inode->i_sb->s_maxbytes) 3010 { 3011 if (count || pos > inode->i_sb->s_maxbytes) { 3012 send_sig(SIGXFSZ, current, 0); 3013 err = -EFBIG; 3014 goto out; 3015 } 3016 /* zero-length writes at ->s_maxbytes are OK */ 3017 } 3018 3019 if (pos + count > inode->i_sb->s_maxbytes) 3020 count = inode->i_sb->s_maxbytes - pos; 3021 } else { 3022 if (is_read_only(inode->i_rdev)) { 3023 err = -EPERM; 3024 goto out; 3025 } 3026 if (pos >= inode->i_size) { 3027 if (count || pos > inode->i_size) { 3028 err = -ENOSPC; 3029 goto out; 3030 } 3031 } 3032 3033 if (pos + count > inode->i_size) 3034 count = inode->i_size - pos; 3035 } 3036 3037 err = 0; 3038 if (count == 0) 3039 goto out; 3040 3041 remove_suid(inode); 3042 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 3043 mark_inode_dirty_sync(inode); 3044 3045 if (file->f_flags & O_DIRECT) 3046 goto o_direct; 3047 3048 do { 3049 unsigned long index, offset; 3050 long page_fault; 3051 char *kaddr; 3052 3053 /* 3054 * Try to find the page in the cache. If it isn't there, 3055 * allocate a free page. 3056 */ 3057 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 3058 index = pos >> PAGE_CACHE_SHIFT; 3059 bytes = PAGE_CACHE_SIZE - offset; 3060 if (bytes > count) 3061 bytes = count; 3062 3063 /* 3064 * Bring in the user page that we will copy from _first_. 3065 * Otherwise there's a nasty deadlock on copying from the 3066 * same page as we're writing to, without it being marked 3067 * up-to-date. 3068 */ 3069 { volatile unsigned char dummy; 3070 __get_user(dummy, buf); 3071 __get_user(dummy, buf+bytes-1); 3072 } 3073 3074 status = -ENOMEM; /* we'll assign it later anyway */ 3075 page = __grab_cache_page(mapping, index, &cached_page); 3076 if (!page) 3077 break; 3078 3079 /* We have exclusive IO access to the page.. */ 3080 if (!PageLocked(page)) { 3081 PAGE_BUG(page); 3082 } 3083 3084 kaddr = kmap(page); 3085 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes); 3086 if (status) 3087 goto sync_failure; 3088 page_fault = __copy_from_user(kaddr+offset, buf, bytes); 3089 flush_dcache_page(page); 3090 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); 3091 if (page_fault) 3092 goto fail_write; 3093 if (!status) 3094 status = bytes; 3095 3096 if (status >= 0) { 3097 written += status; 3098 count -= status; 3099 pos += status; 3100 buf += status; 3101 } 3102unlock: 3103 kunmap(page); 3104 /* Mark it unlocked again and drop the page.. 
*/ 3105 SetPageReferenced(page); 3106 UnlockPage(page); 3107 page_cache_release(page); 3108 3109 if (status < 0) 3110 break; 3111 } while (count); 3112done: 3113 *ppos = pos; 3114 3115 if (cached_page) 3116 page_cache_release(cached_page); 3117 3118 /* For now, when the user asks for O_SYNC, we'll actually 3119 * provide O_DSYNC. */ 3120 if (status >= 0) { 3121 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) 3122 status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA); 3123 } 3124 3125out_status: 3126 err = written ? written : status; 3127out: 3128 3129 up(&inode->i_sem); 3130 return err; 3131fail_write: 3132 status = -EFAULT; 3133 goto unlock; 3134 3135sync_failure: 3136 /* 3137 * If blocksize < pagesize, prepare_write() may have instantiated a 3138 * few blocks outside i_size. Trim these off again. 3139 */ 3140 kunmap(page); 3141 UnlockPage(page); 3142 page_cache_release(page); 3143 if (pos + bytes > inode->i_size) 3144 vmtruncate(inode, inode->i_size); 3145 goto done; 3146 3147o_direct: 3148 written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos); 3149 if (written > 0) { 3150 loff_t end = pos + written; 3151 if (end > inode->i_size && !S_ISBLK(inode->i_mode)) { 3152 inode->i_size = end; 3153 mark_inode_dirty(inode); 3154 } 3155 *ppos = end; 3156 invalidate_inode_pages2(mapping); 3157 } 3158 /* 3159 * Sync the fs metadata but not the minor inode changes and 3160 * of course not the data as we did direct DMA for the IO. 3161 */ 3162 if (written >= 0 && file->f_flags & O_SYNC) 3163 status = generic_osync_inode(inode, OSYNC_METADATA); 3164 goto out_status; 3165} 3166 3167void __init page_cache_init(unsigned long mempages) 3168{ 3169 unsigned long htable_size, order; 3170 3171 htable_size = mempages; 3172 htable_size *= sizeof(struct page *); 3173 for(order = 0; (PAGE_SIZE << order) < htable_size; order++) 3174 ; 3175 3176 do { 3177 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *); 3178 3179 page_hash_bits = 0; 3180 while((tmp >>= 1UL) != 0UL) 3181 page_hash_bits++; 3182 3183 page_hash_table = (struct page **) 3184 __get_free_pages(GFP_ATOMIC, order); 3185 } while(page_hash_table == NULL && --order > 0); 3186 3187 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n", 3188 (1 << page_hash_bits), order, (PAGE_SIZE << order)); 3189 if (!page_hash_table) 3190 panic("Failed to allocate page hash table\n"); 3191 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); 3192} 3193
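
/*
 * A worked example of the sizing logic above, purely for illustration
 * (assuming 4K pages and 32-bit pointers): with mempages = 32768, i.e.
 * 128MB of memory, htable_size is 32768 * 4 = 128KB, so the order loop
 * stops at order 5 (PAGE_SIZE << 5 == 128KB).  That allocation holds
 * 32768 hash bucket pointers, page_hash_bits becomes 15, and the table
 * ends up with 1 << 15 = 32768 entries.  If the allocation fails, the
 * order is decremented and a smaller table is used instead.
 */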