| Kernel v2.4.12-ac6 /mm/vmscan.c |
|---|
 2.4.12-ac6
 mm
 vmscan.c
diff -u --new-file --recursive --exclude-from /usr/src/exclude linux.vanilla/mm/vmscan.c linux.ac/mm/vmscan.c
--- linux.vanilla/mm/vmscan.c Thu Oct 11 13:52:14 2001
+++ linux.ac/mm/vmscan.c Sun Oct 21 18:58:07 2001
@@ -7,7 +7,6 @@
* kswapd added: 7.1.96 sct
* Removed kswapd_ctl limits, and swap out as many pages as needed
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
- * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
* Multiqueue VM started 5.8.00, Rik van Riel.
*/
@@ -21,17 +20,107 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>
-#include <linux/compiler.h>
#include <asm/pgalloc.h>
+int vm_static_inactive_target;
+
+/* Raise a page's age by PAGE_AGE_ADV, capping the result at PAGE_AGE_MAX. */
+static inline void age_page_up(struct page *page)
+{
+	int aged = (int) (page->age + PAGE_AGE_ADV);
+
+	page->age = min(aged, PAGE_AGE_MAX);
+}
+
+/*
+ * Lower a page's age by PAGE_AGE_DECL, or by the current age when that
+ * is the smaller of the two.
+ */
+static inline void age_page_down(struct page *page)
+{
+	int step = min(PAGE_AGE_DECL, (int)page->age);
+
+	page->age -= step;
+}
+
/*
- * The "priority" of VM scanning is how much of the queues we
- * will scan in one go. A value of 6 for DEF_PRIORITY implies
- * that we'll scan 1/64th of the queues ("queue_length >> 6")
- * during a normal aging round.
+ * Estimate whether a zone has enough inactive or free pages..
*/
-#define DEF_PRIORITY (6)
+static unsigned int zone_inactive_plenty(zone_t *zone)
+{
+	unsigned int reclaimable = 0;
+
+	/* A zone without pages can never have plenty of them. */
+	if (zone->size == 0)
+		return 0;
+
+	/* Free, inactive_clean and inactive_dirty pages all count. */
+	reclaimable += zone->free_pages;
+	reclaimable += zone->inactive_clean_pages;
+	reclaimable += zone->inactive_dirty_pages;
+
+	/* "Plenty" means more than two fifths of the zone size. */
+	if (reclaimable > (zone->size * 2 / 5))
+		return 1;
+	return 0;
+}
+
+#define FREE_PLENTY_FACTOR 2
+/*
+ * Non-zero when the zone's free plus inactive_clean pages exceed
+ * FREE_PLENTY_FACTOR times its pages_high watermark.
+ */
+static unsigned int zone_free_plenty(zone_t *zone)
+{
+	unsigned int available = zone->free_pages;
+
+	available += zone->inactive_clean_pages;
+
+	if (available > zone->pages_high * FREE_PLENTY_FACTOR)
+		return 1;
+	return 0;
+}
+
+/*
+ * Global counterpart of zone_free_plenty(): non-zero when free plus
+ * inactive_clean pages exceed FREE_PLENTY_FACTOR times freepages.high.
+ */
+static unsigned int free_plenty(void)
+{
+	unsigned int available = nr_free_pages();
+
+	available += nr_inactive_clean_pages();
+
+	return (available > freepages.high * FREE_PLENTY_FACTOR) ? 1 : 0;
+}
+
+/*
+ * Page aging is only applied when the object in question is in use or
+ * when the cache has become small.  A small cache shows up when the
+ * working set of the processes grows very large, and then we have to
+ * pick eviction victims carefully.
+ *
+ * Returns non-zero when buffer pages plus page cache pages (not counting
+ * swapper_space) fall below page_cache.borrow_percent percent of
+ * num_physpages.
+ */
+static inline int cache_is_small(void)
+{
+	int threshold = num_physpages * page_cache.borrow_percent / 100;
+	int buffer_pages = atomic_read(&buffermem_pages);
+	int file_pages = atomic_read(&page_cache_size) - swapper_space.nrpages;
+
+	if (buffer_pages + file_pages < threshold)
+		return 1;
+	return 0;
+}
+
+/*
+ * Decide whether a page can be treated as "not in use" regardless of
+ * its age.
+ *
+ * Returns 1 when the page looks unused, 0 when ordinary page aging
+ * should make the decision instead.
+ */
+static inline int page_mapping_notused(struct page * page)
+{
+	struct address_space * mapping = page->mapping;
+
+	/*
+	 * If a swap cache page is in the RSS of a process, we age it.
+	 * Otherwise, we don't.
+	 */
+	if (PageSwapCache(page)) {
+		if (page_count(page) > (1 + !!page->buffers))
+			return 0;
+
+		return 1;
+	}
+
+	/* If the cache is small, always use page aging. */
+	if (cache_is_small())
+		return 0;
+
+	if (!mapping)
+		return 1;
+
+	/*
+	 * This mapping is really large and would monopolise the pagecache.
+	 * Note: the condition must not be terminated by ';' - that would
+	 * make the return unconditional and the mmap test below dead code.
+	 */
+	if (mapping->nrpages > atomic_read(&page_cache_size) / 20)
+		return 1;
+
+	/* File is mmaped by somebody. */
+	if (mapping->i_mmap || mapping->i_mmap_shared)
+		return 0;
+
+	return 1;
+}
/*
* The swap-out function returns 1 if it successfully
@@ -43,24 +132,43 @@
*/
/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, str+
uct page *page, zone_t * classzone)
+static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct pa+
ge *page)
{
pte_t pte;
swp_entry_t entry;
- int right_classzone;
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
- flush_tlb_page(vma, address);
- return 0;
+ age_page_up(page);
+ return;
}
- if (TryLockPage(page))
- return 0;
+ /*
+ * We age down really anonymous pages here, pages which
+ * already have a mapping are aged down on the active
+ * list instead.
+ * This is done so heavily shared pages (think libc.so)
+ * are only aged down once and won't be swapped out when
+ * still in active use.
+ */
+ if (!page->mapping)
+ age_page_down(page);
+
+ /*
+ * If we have plenty inactive pages on this
+ * zone, skip it.
+ */
+ if (zone_inactive_plenty(page->zone))
+ return;
+
+ /*
+ * Don't swap out a page which is still in use.
+ */
+ if (page->age > 0)
+ return;
- right_classzone = 1;
- if (!memclass(page->zone, classzone))
- right_classzone = 0;
+ if (TryLockPage(page))
+ return;
/* From this point on, the odds are that we're going to
* nuke this pte, so read and clear the pte. This hook
@@ -85,12 +193,11 @@
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
mm->rss--;
+ if (!page->age)
+ deactivate_page(page);
UnlockPage(page);
- {
- int freeable = page_count(page) - !!page->buffers <= 2;
- page_cache_release(page);
- return freeable & right_classzone;
- }
+ page_cache_release(page);
+ return;
}
/*
@@ -141,11 +248,11 @@
/* No swap space left */
set_pte(page_table, pte);
UnlockPage(page);
- return 0;
+ return;
}
/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long+
end, int count, zone_t * classzone)
+static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, i+
nt count)
{
pte_t * pte;
unsigned long pmd_end;
@@ -169,22 +276,20 @@
struct page *page = pte_page(*pte);
if (VALID_PAGE(page) && !PageReserved(page)) {
- count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
- if (!count) {
- address += PAGE_SIZE;
+ try_to_swap_out(mm, vma, address, pte, page);
+ if (!--count)
break;
- }
}
}
address += PAGE_SIZE;
pte++;
} while (address && (address < end));
- mm->swap_address = address;
+ mm->swap_address = address + PAGE_SIZE;
return count;
}
/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long+
end, int count, zone_t * classzone)
+static inline int swap_out_pgd( struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned lon+
g end, int count)
{
pmd_t * pmd;
unsigned long pgd_end;
@@ -204,7 +309,7 @@
end = pgd_end;
do {
- count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
+ count = swap_out_pmd(mm, vma, pmd, address, end, count);
if (!count)
break;
address = (address + PMD_SIZE) & PMD_MASK;
@@ -214,7 +319,7 @@
}
/* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * class+
zone)
+static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
{
pgd_t *pgdir;
unsigned long end;
@@ -229,7 +334,7 @@
if (address >= end)
BUG();
do {
- count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
+ count = swap_out_pgd(mm, vma, pgdir, address, end, count);
if (!count)
break;
address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -238,424 +343,719 @@
return count;
}
-/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
-struct mm_struct *swap_mm = &init_mm;
-
/*
- * Returns remaining count of pages to be swapped out by followup call.
+ * Returns non-zero if we scanned all `count' pages
*/
-static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
+static int swap_out_mm(struct mm_struct * mm, int count)
{
unsigned long address;
struct vm_area_struct* vma;
+ if (!count)
+ return 1;
+ /*
+ * Go through process' page directory.
+ */
+
/*
* Find the proper vm-area after freezing the vma chain
* and ptes.
*/
spin_lock(&mm->page_table_lock);
address = mm->swap_address;
- if (address == TASK_SIZE || swap_mm != mm) {
- /* We raced: don't count this mm but try again */
- ++*mmcounter;
- goto out_unlock;
- }
vma = find_vma(mm, address);
if (vma) {
if (address < vma->vm_start)
address = vma->vm_start;
for (;;) {
- count = swap_out_vma(mm, vma, address, count, classzone);
+ count = swap_out_vma(mm, vma, address, count);
+ if (!count)
+ goto out_unlock;
vma = vma->vm_next;
if (!vma)
break;
- if (!count)
- goto out_unlock;
address = vma->vm_start;
}
}
- /* Indicate that we reached the end of address space */
- mm->swap_address = TASK_SIZE;
+ /* Reset to 0 when we reach the end of address space */
+ mm->swap_address = 0;
out_unlock:
spin_unlock(&mm->page_table_lock);
- return count;
+ return !count;
+}
+
+#define SWAP_SHIFT 5
+#define SWAP_MIN 8
+
+/*
+ * Number of pages to scan in one mm: rss >> SWAP_SHIFT, raised to
+ * SWAP_MIN when smaller, but never more than the rss itself.
+ */
+static inline int swap_amount(struct mm_struct *mm)
+{
+	int nr = mm->rss >> SWAP_SHIFT;
+
+	if (nr >= SWAP_MIN)
+		return nr;
+
+	/* Small mm: take SWAP_MIN pages, or every page if it has fewer. */
+	nr = SWAP_MIN;
+	if (mm->rss < nr)
+		nr = mm->rss;
+	return nr;
+}
-static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
-static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
+/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
+struct mm_struct *swap_mm = &init_mm;
+
+static void swap_out(unsigned int priority, int gfp_mask)
{
int counter;
- struct mm_struct *mm;
+ int retval = 0;
+ struct mm_struct *mm = current->mm;
- /* Then, look at the other mm's */
- counter = mmlist_nr / priority;
+ /* Scan part of the process virtual memory. */
+ counter = (mmlist_nr << SWAP_SHIFT) >> priority;
do {
- if (unlikely(current->need_resched)) {
- __set_current_state(TASK_RUNNING);
- schedule();
- }
-
spin_lock(&mmlist_lock);
mm = swap_mm;
- while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
- mm->swap_address = 0;
+ if (mm == &init_mm) {
mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
- if (mm == swap_mm)
+ if (mm == &init_mm)
goto empty;
- swap_mm = mm;
}
+ /* Set pointer for next call to next in the list */
+ swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
/* Make sure the mm doesn't disappear when we drop the lock.. */
atomic_inc(&mm->mm_users);
spin_unlock(&mmlist_lock);
- nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
-
+ /* Walk about 6% of the address space each time */
+ retval |= swap_out_mm(mm, swap_amount(mm));
mmput(mm);
-
- if (!nr_pages)
- return 1;
} while (--counter >= 0);
-
- return 0;
+ return;
empty:
spin_unlock(&mmlist_lock);
- return 0;
}
-static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask));
-static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)
+
+/**
+ * reclaim_page - reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
+ */
+struct page * reclaim_page(zone_t * zone)
{
- struct list_head * entry;
+ struct page * page = NULL;
+ struct list_head * page_lru;
+ swp_entry_t entry = {0};
+ int maxscan;
+ /*
+ * We only need the pagemap_lru_lock if we don't reclaim the page,
+ * but we have to grab the pagecache_lock before the pagemap_lru_lock
+ * to avoid deadlocks and most of the time we'll succeed anyway.
+ */
+ spin_lock(&pagecache_lock);
spin_lock(&pagemap_lru_lock);
- while (max_scan && (entry = inactive_list.prev) != &inactive_list) {
- struct page * page;
+ maxscan = zone->inactive_clean_pages;
+ while ((page_lru = zone->inactive_clean_list.prev) !=
+ &zone->inactive_clean_list && maxscan--) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageInactiveClean(page)) {
+ printk("VM: reclaim_page, wrong page on list.\n");
+ list_del(page_lru);
+ page->zone->inactive_clean_pages--;
+ continue;
+ }
- if (unlikely(current->need_resched)) {
- spin_unlock(&pagemap_lru_lock);
- __set_current_state(TASK_RUNNING);
- schedule();
- spin_lock(&pagemap_lru_lock);
+ /* Page is or was in use? Move it to the active list. */
+ if (PageReferenced(page) || page->age > 0 ||
+ page_count(page) > (1 + !!page->buffers)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_active_list(page);
+ page->age = max((int)page->age, PAGE_AGE_START);
continue;
}
- page = list_entry(entry, struct page, lru);
+ /* The page is dirty, or locked, move to inactive_dirty list. */
+ if (page->buffers || PageDirty(page) || TryLockPage(page)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_inactive_dirty_list(page);
+ continue;
+ }
- if (unlikely(!PageInactive(page) && !PageActive(page)))
- BUG();
+ /* OK, remove the page from the caches. */
+ if (PageSwapCache(page)) {
+ entry.val = page->index;
+ __delete_from_swap_cache(page);
+ goto found_page;
+ }
- list_del(entry);
- list_add(entry, &inactive_list);
- if (PageTestandClearReferenced(page))
- continue;
+ if (page->mapping) {
+ __remove_inode_page(page);
+ goto found_page;
+ }
+
+ /* We should never ever get here. */
+ printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+ list_del(page_lru);
+ zone->inactive_clean_pages--;
+ UnlockPage(page);
+ }
+ spin_unlock(&pagemap_lru_lock);
+ spin_unlock(&pagecache_lock);
+ return NULL;
- max_scan--;
+found_page:
+ del_page_from_inactive_clean_list(page);
+ spin_unlock(&pagemap_lru_lock);
+ spin_unlock(&pagecache_lock);
+ if (entry.val)
+ swap_free(entry);
+ UnlockPage(page);
+ page->age = PAGE_AGE_START;
+ if (page_count(page) != 1)
+ printk("VM: reclaim_page, found page with count %d!\n",
+ page_count(page));
+ return page;
+}
- if (unlikely(!memclass(page->zone, classzone)))
- continue;
+/*
+ * Report whether a page still has something to write out: either the
+ * page itself is marked dirty or one of its buffers has BH_Dirty or
+ * BH_Lock set.  Returns 1 if so, 0 otherwise.
+ *
+ * NOTE(review): a page with neither a mapping nor buffers reaches the
+ * buffer walk with a NULL list head; the visible caller only gets here
+ * when page->mapping is set - confirm for other callers.
+ */
+static inline int page_dirty(struct page *page)
+{
+	struct buffer_head *head, *cur;
+
+	if (PageDirty(page))
+		return 1;
+
+	/* A mapped page without buffers has nothing else to check. */
+	if (page->mapping && !page->buffers)
+		return 0;
+
+	/* Walk the circular b_this_page ring exactly once. */
+	head = page->buffers;
+	cur = head;
+	do {
+		if (cur->b_state & ((1<<BH_Dirty) | (1<<BH_Lock)))
+			return 1;
+		cur = cur->b_this_page;
+	} while (cur != head);
+
+	return 0;
+}
+
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ * @sync: are we allowed to do synchronous IO in emergencies ?
+ *
+ * This function is called when we are low on free / inactive_clean
+ * pages, its purpose is to refill the free/clean list as efficiently
+ * as possible.
+ *
+ * This means we do writes asynchronously as long as possible and will
+ * only sleep on IO when we don't have another option. Since writeouts
+ * cause disk seeks and make read IO slower, we skip writes altogether
+ * when the amount of dirty pages is small.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+#define CAN_DO_FS ((gfp_mask & __GFP_FS) && should_write)
+#define WRITE_LOW_WATER 5
+#define WRITE_HIGH_WATER 10
+int page_launder(int gfp_mask, int sync)
+{
+ int maxscan, cleaned_pages, failed_pages, total;
+ struct list_head * page_lru;
+ struct page * page;
+
+ /*
+ * This flag determines if we should do writeouts of dirty data
+ * or not. When more than WRITE_HIGH_WATER percentage of the
+ * pages we process would need to be written out we set this flag
+ * and will do writeout, the flag is cleared once we go below
+ * WRITE_LOW_WATER. Note that only pages we actually process
+ * get counted, ie. pages where we make it beyond the TryLock.
+ *
+ * XXX: These flags still need tuning.
+ */
+ static int should_write = 0;
+
+ cleaned_pages = 0;
+ failed_pages = 0;
+
+ /*
+ * The gfp_mask tells the buffer-freeing code below whether it may
+ * do IO at all and whether it may wait on IO synchronously.
+ *
+ * Note that synchronous IO only happens when a page has not been
+ * written out yet when we see it for a second time, this is done
+ * through magic in try_to_free_buffers().
+ */
+ if (!should_write)
+ gfp_mask &= ~(__GFP_WAIT | __GFP_IO);
+ else if (!sync)
+ gfp_mask &= ~__GFP_WAIT;
- /* Racy check to avoid trylocking when not worthwhile */
- if (!page->buffers && page_count(page) != 1)
+ /* The main launder loop. */
+ spin_lock(&pagemap_lru_lock);
+ maxscan = nr_inactive_dirty_pages;
+ while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
+ maxscan-- > 0) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageInactiveDirty(page)) {
+ printk("VM: page_launder, wrong page on list.\n");
+ list_del(page_lru);
+ nr_inactive_dirty_pages--;
+ page->zone->inactive_dirty_pages--;
continue;
+ }
/*
- * The page is locked. IO in progress?
- * Move it to the back of the list.
+ * The page is in active use or really unfreeable. Move to
+ * the active list and adjust the page age if needed.
*/
- if (unlikely(TryLockPage(page)))
+ if (PageReferenced(page) || page->age || page_ramdisk(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ page->age = max((int)page->age, PAGE_AGE_START);
continue;
+ }
- if (PageDirty(page) && is_page_cache_freeable(page)) {
- /*
- * It is not critical here to write it only if
- * the page is unmapped beause any direct writer
- * like O_DIRECT would set the PG_dirty bitflag
- * on the phisical page after having successfully
- * pinned it and after the I/O to the page is finished,
- * so the direct writes to the page cannot get lost.
- */
- int (*writepage)(struct page *);
-
- writepage = page->mapping->a_ops->writepage;
- if ((gfp_mask & __GFP_FS) && writepage) {
- ClearPageDirty(page);
- page_cache_get(page);
- spin_unlock(&pagemap_lru_lock);
-
- writepage(page);
- page_cache_release(page);
+ /*
+ * The page is still in the page tables of some process,
+ * move it to the active list but leave page age at 0;
+ * either swap_out() will make it freeable soon or it is
+ * mlock()ed...
+ *
+ * The !PageLocked() test is to protect us from ourselves,
+ * see the code around the writepage() call.
+ */
+ if ((page_count(page) > (1 + !!page->buffers)) &&
+ !PageLocked(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ continue;
+ }
- spin_lock(&pagemap_lru_lock);
+ /*
+ * If this zone has plenty of pages free, don't spend time
+ * on cleaning it but only move clean pages out of the way
+ * so we won't have to scan those again.
+ */
+ if (zone_free_plenty(page->zone)) {
+ if (!page->mapping || page_dirty(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &inactive_dirty_list);
continue;
}
}
/*
- * If the page has buffers, try to free the buffer mappings
- * associated with this page. If we succeed we try to free
- * the page as well.
+ * The page is locked. IO in progress?
+ * Move it to the back of the list.
*/
- if (page->buffers) {
- spin_unlock(&pagemap_lru_lock);
-
- /* avoid to free a locked page */
- page_cache_get(page);
-
- if (try_to_free_buffers(page, gfp_mask)) {
- if (!page->mapping) {
- /*
- * We must not allow an anon page
- * with no buffers to be visible on
- * the LRU, so we unlock the page after
- * taking the lru lock
- */
- spin_lock(&pagemap_lru_lock);
- UnlockPage(page);
- __lru_cache_del(page);
+ if (TryLockPage(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &inactive_dirty_list);
+ continue;
+ }
- /* effectively free the page here */
- page_cache_release(page);
+ /*
+ * Dirty swap-cache page? Write it out if
+ * last copy..
+ */
+ if (PageDirty(page)) {
+ int (*writepage)(struct page *);
- if (--nr_pages)
- continue;
- break;
- } else {
- /*
- * The page is still in pagecache so undo the stuff
- * before the try_to_free_buffers since we've not
- * finished and we can now try the next step.
- */
- page_cache_release(page);
+ /* Can a page get here without page->mapping? */
+ if (!page->mapping)
+ goto page_active;
+ writepage = page->mapping->a_ops->writepage;
+ if (!writepage)
+ goto page_active;
- spin_lock(&pagemap_lru_lock);
- }
- } else {
- /* failed to drop the buffers so stop here */
+ /* Can't do it? Move it to the back of the list. */
+ if (!CAN_DO_FS) {
+ list_del(page_lru);
+ list_add(page_lru, &inactive_dirty_list);
UnlockPage(page);
- page_cache_release(page);
-
- spin_lock(&pagemap_lru_lock);
+ failed_pages++;
continue;
}
- }
- if (unlikely(!page->mapping))
- BUG();
-
- if (unlikely(!spin_trylock(&pagecache_lock))) {
- /* we hold the page lock so the page cannot go away from under us */
+ /* OK, do a physical asynchronous write to swap. */
+ ClearPageDirty(page);
+ page_cache_get(page);
spin_unlock(&pagemap_lru_lock);
- spin_lock(&pagecache_lock);
+ writepage(page);
+ page_cache_release(page);
+
+ /* And re-start the thing.. */
spin_lock(&pagemap_lru_lock);
+ continue;
}
/*
- * this is the non-racy check, it is critical to check
- * PageDirty _after_ we made sure the page is freeable
- * so not in use by anybody.
+ * If the page has buffers, try to free the buffer mappings
+ * associated with this page. If we succeed we either free
+ * the page (in case it was a buffercache only page) or we
+ * move the page to the inactive_clean list.
+ *
+ * On the first round, we should free all previously cleaned
+ * buffer pages
*/
- if (!is_page_cache_freeable(page) || PageDirty(page)) {
- spin_unlock(&pagecache_lock);
- UnlockPage(page);
- continue;
- }
+ if (page->buffers) {
+ int clearedbuf;
+ /*
+ * Since we might be doing disk IO, we have to
+ * drop the spinlock and take an extra reference
+ * on the page so it doesn't go away from under us.
+ */
+ del_page_from_inactive_dirty_list(page);
+ page_cache_get(page);
+ spin_unlock(&pagemap_lru_lock);
- /* point of no return */
- if (likely(!PageSwapCache(page))) {
- __remove_inode_page(page);
- spin_unlock(&pagecache_lock);
- } else {
- swp_entry_t swap;
- swap.val = page->index;
- __delete_from_swap_cache(page);
- spin_unlock(&pagecache_lock);
- swap_free(swap);
- }
+ /* Try to free the page buffers. */
+ clearedbuf = try_to_release_page(page, gfp_mask);
- __lru_cache_del(page);
- UnlockPage(page);
+ /*
+ * Re-take the spinlock. Note that we cannot
+ * unlock the page yet since we're still
+ * accessing the page_struct here...
+ */
+ spin_lock(&pagemap_lru_lock);
- /* effectively free the page here */
- page_cache_release(page);
+ /* The buffers were not freed. */
+ if (!clearedbuf) {
+ add_page_to_inactive_dirty_list(page);
+ failed_pages++;
+
+ /* The page was only in the buffer cache. */
+ } else if (!page->mapping) {
+ atomic_dec(&buffermem_pages);
+ cleaned_pages++;
+
+ /* The page has more users besides the cache and us. */
+ } else if (page_count(page) > 2) {
+ add_page_to_active_list(page);
+
+ /* OK, we "created" a freeable page. */
+ } else /* page->mapping && page_count(page) == 2 */ {
+ add_page_to_inactive_clean_list(page);
+ cleaned_pages++;
+ }
+
+ /*
+ * Unlock the page and drop the extra reference.
+ * We can only do it here because we are accessing
+ * the page struct above.
+ */
+ UnlockPage(page);
+ page_cache_release(page);
- if (--nr_pages)
continue;
- break;
+ } else if (page->mapping && !PageDirty(page)) {
+ /*
+ * If a page had an extra reference in
+ * deactivate_page(), we will find it here.
+ * Now the page is really freeable, so we
+ * move it to the inactive_clean list.
+ */
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_inactive_clean_list(page);
+ UnlockPage(page);
+ cleaned_pages++;
+ } else {
+page_active:
+ /*
+ * OK, we don't know what to do with the page.
+ * It's no use keeping it here, so we move it to
+ * the active list.
+ */
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ page->age = max((int)page->age, PAGE_AGE_START);
+ UnlockPage(page);
+ }
}
spin_unlock(&pagemap_lru_lock);
- return nr_pages;
+ /*
+ * Set the should_write flag, for the next callers of page_launder.
+ * If we go below the low watermark we stop the writeout of dirty
+ * pages, writeout is started when we get above the high watermark.
+ */
+ total = failed_pages + cleaned_pages;
+ if (should_write && failed_pages * 100 < WRITE_LOW_WATER * total)
+ should_write = 0;
+ else if (!should_write && failed_pages * 100 > WRITE_HIGH_WATER * total)
+ should_write = 1;
+
+ /* Return the number of pages moved to the inactive_clean list. */
+ return cleaned_pages;
}
-/*
- * This moves pages from the active list to
- * the inactive list.
+/**
+ * refill_inactive_scan - scan the active list and find pages to deactivate
+ * @priority: the priority at which to scan
+ * Returns the number of pages that were moved off the active list.
*
- * We move them the other way when we see the
- * reference bit on the page.
+ * This function will scan a portion of the active list to find
+ * unused pages, those pages will then be moved to the inactive list.
*/
-static void refill_inactive(int nr_pages)
+int refill_inactive_scan(unsigned int priority)
{
- struct list_head * entry;
+ struct list_head * page_lru;
+ struct page * page;
+ int maxscan = nr_active_pages >> priority;
+ int nr_deactivated = 0;
+ /* Take the lock while messing with the list... */
spin_lock(&pagemap_lru_lock);
- entry = active_list.prev;
- while (nr_pages-- && entry != &active_list) {
- struct page * page;
+ while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
+ page = list_entry(page_lru, struct page, lru);
- page = list_entry(entry, struct page, lru);
- entry = entry->prev;
- if (PageTestandClearReferenced(page)) {
- list_del(&page->lru);
- list_add(&page->lru, &active_list);
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageActive(page)) {
+ printk("VM: refill_inactive, wrong page on list.\n");
+ list_del(page_lru);
+ nr_active_pages--;
continue;
}
- del_page_from_active_list(page);
- add_page_to_inactive_list(page);
+ /*
+ * Do aging on the pages. Every time a page is referenced,
+ * page->age gets incremented. If it wasn't referenced, we
+ * decrement page->age. The page gets moved to the inactive
+ * list when one of the following is true:
+ * - the page age reaches 0
+ * - the object the page belongs to isn't in active use
+ * - the object the page belongs to is hogging the cache
+ */
+ if (PageTestandClearReferenced(page)) {
+ age_page_up(page);
+ } else {
+ age_page_down(page);
+ }
+
+ /*
+ * Don't deactivate pages from zones which have
+ * plenty inactive pages.
+ */
+ if (zone_inactive_plenty(page->zone)) {
+ goto skip_page;
+ }
+
+ /* Deactivate a page once page->age reaches 0. */
+ if (!page->age)
+ deactivate_page_nolock(page);
+
+ /*
+ * Deactivate pages from files which aren't in use, busy
+ * pages will be referenced while on the inactive list.
+ */
+ if (page_mapping_notused(page))
+ deactivate_page_nolock(page);
+
+ /*
+ * If the page is still on the active list, move it
+ * to the other end of the list. Otherwise we exit if
+ * we have done enough work.
+ */
+skip_page:
+ if (PageActive(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &active_list);
+ } else {
+ nr_deactivated++;
+ }
}
spin_unlock(&pagemap_lru_lock);
+
+ return nr_deactivated;
}
-static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
-static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
+long count_ramdisk_pages(void)
{
- int max_scan;
- int chunk_size = nr_pages;
- unsigned long ratio;
-
- nr_pages -= kmem_cache_reap(gfp_mask);
- if (nr_pages <= 0)
- return 0;
+ struct list_head *page_lru;
+ struct page *page;
+ long nr_ramdisk = 0;
- nr_pages = chunk_size;
- /* try to keep the active list 2/3 of the size of the cache */
- ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
- refill_inactive(ratio);
-
- max_scan = nr_inactive_pages / priority;
- nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask);
- if (nr_pages <= 0)
- return 0;
-
- shrink_dcache_memory(priority, gfp_mask);
- shrink_icache_memory(priority, gfp_mask);
-#ifdef CONFIG_QUOTA
- shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
-#endif
+ spin_lock(&pagemap_lru_lock);
+ for (page_lru = active_list.next; page_lru != &active_list;
+ page_lru = page_lru->next) {
+ page = list_entry(page_lru, struct page, lru);
+ if (page_ramdisk(page))
+ nr_ramdisk ++;
+ }
+ spin_unlock(&pagemap_lru_lock);
- return nr_pages;
+ return nr_ramdisk;
}
-int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order)
+/*
+ * Check if there are zones with a severe shortage of free pages,
+ * or if all zones have a minor shortage.
+ */
+int free_shortage(void)
{
- int ret = 0;
- int priority = DEF_PRIORITY;
- int nr_pages = SWAP_CLUSTER_MAX;
+ pg_data_t *pgdat;
+ unsigned int global_free = 0;
+ unsigned int global_target = freepages.high;
+ /* Are we low on free pages anywhere? */
+ pgdat = pgdat_list;
do {
- nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages);
- if (nr_pages <= 0)
- return 1;
+ int i;
+ for(i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones+ i;
+ unsigned int free;
- ret |= swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX << 2);
- } while (--priority);
+ if (!zone->size)
+ continue;
- return ret;
-}
+ free = zone->free_pages;
+ free += zone->inactive_clean_pages;
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+ /* Local shortage? */
+ if (free < zone->pages_low)
+ return 1;
-static int check_classzone_need_balance(zone_t * classzone)
-{
- zone_t * first_classzone;
+ global_free += free;
+ }
+ pgdat = pgdat->node_next;
+ } while (pgdat);
- first_classzone = classzone->zone_pgdat->node_zones;
- while (classzone >= first_classzone) {
- if (classzone->free_pages > classzone->pages_high)
- return 0;
- classzone--;
- }
- return 1;
+ /* Global shortage? */
+ return global_free < global_target;
}
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
+/*
+ * Are we low on inactive pages globally or in any zone?
+ */
+int inactive_shortage(void)
{
- int need_more_balance = 0, i;
- zone_t * zone;
+ pg_data_t *pgdat;
+ unsigned int global_target = freepages.high + inactive_target();
+ unsigned int global_inactive = 0;
- for (i = pgdat->nr_zones-1; i >= 0; i--) {
- zone = pgdat->node_zones + i;
- if (unlikely(current->need_resched))
- schedule();
- if (!zone->need_balance)
- continue;
- if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
- zone->need_balance = 0;
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(HZ);
- continue;
+ pgdat = pgdat_list;
+ do {
+ int i;
+ for(i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones + i;
+ unsigned int inactive;
+
+ if (!zone->size)
+ continue;
+
+ inactive = zone->inactive_dirty_pages;
+ inactive += zone->inactive_clean_pages;
+ inactive += zone->free_pages;
+
+ /* Local shortage? */
+ if (inactive < zone->pages_high)
+ return 1;
+
+ global_inactive += inactive;
}
- if (check_classzone_need_balance(zone))
- need_more_balance = 1;
- else
- zone->need_balance = 0;
- }
+ pgdat = pgdat->node_next;
+ } while (pgdat);
- return need_more_balance;
+ /* Global shortage? */
+ return global_inactive < global_target;
}
-static void kswapd_balance(void)
+#define DEF_PRIORITY (6)
+
+/*
+ * Refill_inactive is the function used to scan and age the pages on
+ * the active list and in the working set of processes, moving the
+ * little-used pages to the inactive list.
+ *
+ * When called by kswapd, we try to deactivate as many pages as needed
+ * to recover from the inactive page shortage. This makes it possible
+ * for kswapd to keep up with memory demand so user processes can get
+ * low latency on memory allocations.
+ *
+ * However, when the system starts to get overloaded we can get called
+ * by user processes. For user processes we want to both reduce the
+ * latency and make sure that multiple user processes together don't
+ * deactivate too many pages. To achieve this we simply do less work
+ * when called from a user process.
+ */
+static int refill_inactive(unsigned int gfp_mask)
{
- int need_more_balance;
- pg_data_t * pgdat;
+ int progress = 0, maxtry;
+
+ maxtry = 1 << DEF_PRIORITY;
do {
- need_more_balance = 0;
- pgdat = pgdat_list;
- do
- need_more_balance |= kswapd_balance_pgdat(pgdat);
- while ((pgdat = pgdat->node_next));
- if (need_more_balance && out_of_memory()) {
- oom_kill();
+ if (current->need_resched) {
+ __set_current_state(TASK_RUNNING);
+ schedule();
+ if (!inactive_shortage())
+ return 1;
}
- } while (need_more_balance);
-}
-static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
-{
- zone_t * zone;
- int i;
+ /* Walk the VM space for a bit.. */
+ swap_out(DEF_PRIORITY, gfp_mask);
- for (i = pgdat->nr_zones-1; i >= 0; i--) {
- zone = pgdat->node_zones + i;
- if (!zone->need_balance)
- continue;
- return 0;
- }
+ /* ..and refill the inactive list */
+ progress += refill_inactive_scan(DEF_PRIORITY);
- return 1;
+ if (--maxtry <= 0)
+ break;
+ } while (inactive_shortage());
+
+ return progress;
}
-static int kswapd_can_sleep(void)
+/*
+ * Worker function for kswapd and try_to_free_pages, we get
+ * called whenever there is a shortage of free/inactive_clean
+ * pages.
+ *
+ * This function will also move pages to the inactive list,
+ * if needed.
+ */
+static int do_try_to_free_pages(unsigned int gfp_mask, int user)
{
- pg_data_t * pgdat;
+ int ret = 0;
- pgdat = pgdat_list;
- do {
- if (kswapd_can_sleep_pgdat(pgdat))
- continue;
- return 0;
- } while ((pgdat = pgdat->node_next));
+ /*
+ * Eat memory from filesystem page cache, buffer cache,
+ * dentry, inode and filesystem quota caches.
+ */
+ ret += page_launder(gfp_mask, user);
+ shrink_dcache_memory(0, gfp_mask);
+ shrink_icache_memory(0, gfp_mask);
+ shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
- return 1;
+ /*
+ * If needed, we move pages from the active list
+ * to the inactive list.
+ */
+ if (inactive_shortage())
+ ret += refill_inactive(gfp_mask);
+
+ /*
+ * Reclaim unused slab cache memory.
+ */
+ kmem_cache_reap(gfp_mask);
+
+ return ret;
}
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+
/*
* The background pageout daemon, started as a kernel thread
* from the init process.
@@ -672,7 +1072,6 @@
int kswapd(void *unused)
{
struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
daemonize();
strcpy(tsk->comm, "kswapd");
@@ -696,31 +1095,132 @@
* Kswapd main loop.
*/
for (;;) {
- __set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kswapd_wait, &wait);
+ static long recalc = 0;
- mb();
- if (kswapd_can_sleep())
- schedule();
+ /*
+ * We try to rebalance the VM either when we are short
+ * on free pages or when we have a shortage of inactive
+ * pages and are getting low on free pages.
+ */
+ if (free_shortage() || (inactive_shortage() && !free_plenty()))
+ do_try_to_free_pages(GFP_KSWAPD, 0);
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kswapd_wait, &wait);
+ /* Once a second ... */
+ if (time_after(jiffies, recalc + HZ)) {
+ recalc = jiffies;
+ /* Do background page aging. */
+ refill_inactive_scan(DEF_PRIORITY);
+ }
+
+ /*
+ * We go to sleep if either the free page shortage
+ * or the inactive page shortage is gone. We do this
+ * because:
+ * 1) we need no more free pages or
+ * 2) the inactive pages need to be flushed to disk,
+ * it wouldn't help to eat CPU time now ...
+ *
+ * We go to sleep for one second, but if it's needed
+ * we'll be woken up earlier...
+ */
+ if (!free_shortage() || !inactive_shortage()) {
+ interruptible_sleep_on_timeout(&kswapd_wait, HZ);
/*
- * If we actually get into a low-memory situation,
- * the processes needing more memory will wake us
- * up on a more timely basis.
+ * If we couldn't free enough memory, we see if it was
+ * due to the system just not having enough memory.
+ * If that is the case, the only solution is to kill
+ * a process (the alternative is eternal deadlock).
+ *
+ * If there still is enough memory around, we just loop
+ * and try to free some more memory...
*/
- kswapd_balance();
- run_task_queue(&tq_disk);
+ } else if (out_of_memory()) {
+ oom_kill();
+ }
+ }
+}
+
+void wakeup_kswapd(void)
+{
+ if (waitqueue_active(&kswapd_wait))
+ wake_up_interruptible(&kswapd_wait);
+}
+
+/*
+ * Called by non-kswapd processes when they want more
+ * memory but are unable to sleep on kswapd because
+ * they might be holding some IO locks ...
+ */
+int try_to_free_pages(unsigned int gfp_mask)
+{
+ int ret = 1;
+
+ if (gfp_mask & __GFP_WAIT) {
+ current->flags |= PF_MEMALLOC;
+ ret = do_try_to_free_pages(gfp_mask, 1);
+ current->flags &= ~PF_MEMALLOC;
}
+
+ return ret;
}
+DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
+/*
+ * Kreclaimd will move pages from the inactive_clean list to the
+ * free list, in order to keep atomic allocations possible under
+ * all circumstances.
+ */
+int kreclaimd(void *unused)
+{
+ struct task_struct *tsk = current;
+ pg_data_t *pgdat;
+
+ daemonize();
+ strcpy(tsk->comm, "kreclaimd");
+ sigfillset(&tsk->blocked);
+ current->flags |= PF_MEMALLOC;
+
+ while (1) {
+
+ /*
+ * We sleep until someone wakes us up from
+ * page_alloc.c::__alloc_pages().
+ */
+ interruptible_sleep_on(&kreclaimd_wait);
+
+ /*
+ * Move some pages from the inactive_clean lists to
+ * the free lists, if it is needed.
+ */
+ pgdat = pgdat_list;
+ do {
+ int i;
+ for(i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones + i;
+ if (!zone->size)
+ continue;
+
+ while (zone->free_pages < zone->pages_low) {
+ struct page * page;
+ page = reclaim_page(zone);
+ if (!page)
+ break;
+ __free_page(page);
+ }
+ }
+ pgdat = pgdat->node_next;
+ } while (pgdat);
+ }
+}
+
+
static int __init kswapd_init(void)
{
- printk("Starting kswapd\n");
+ printk("Starting kswapd v1.8\n");
swap_setup();
kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+ kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
return 0;
}
|