Linux Headquarters
[ Register ]
[ About us ] [ Home Page ]

Advertisement
[ Kernel ] [ Documentation ] [ Links ] [ Books ]

Advertisement

Kernel v2.4.12-ac6 /mm/vmscan.c

Filename:/mm/vmscan.c
Lines Added:817
Lines Deleted:317
Also changed in: (Previous) 2.4.12-ac4  2.4.12-ac5  2.4.12-ac3  2.4.12-ac2  2.4.12-ac1  2.4.11 
(Following) 2.4.13  2.4.13-ac1  2.4.13-ac2  2.4.13-ac3  2.4.13-ac4  2.4.13-ac5 

Location
[  2.4.12-ac6
  [  mm
     o  vmscan.c

Patch

diff -u --new-file --recursive --exclude-from /usr/src/exclude linux.vanilla/mm/vmscan.c linux.ac/mm/vmscan.c
--- linux.vanilla/mm/vmscan.c   Thu Oct 11 13:52:14 2001
+++ linux.ac/mm/vmscan.c   Sun Oct 21 18:58:07 2001
@@ -7,7 +7,6 @@
  *  kswapd added: 7.1.96  sct
  *  Removed kswapd_ctl limits, and swap out as many pages as needed
  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
- *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  *  Multiqueue VM started 5.8.00, Rik van Riel.
  */
@@ -21,17 +20,107 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/file.h>
-#include <linux/compiler.h>
 
 #include <asm/pgalloc.h>
 
+int vm_static_inactive_target;
+
+static inline void age_page_up(struct page *page)
+{
+   page->age = min((int) (page->age + PAGE_AGE_ADV), PAGE_AGE_MAX); 
+}
+
+static inline void age_page_down(struct page *page)
+{
+   page->age -= min(PAGE_AGE_DECL, (int)page->age);
+}
+
 /*
- * The "priority" of VM scanning is how much of the queues we
- * will scan in one go. A value of 6 for DEF_PRIORITY implies
- * that we'll scan 1/64th of the queues ("queue_length >> 6")
- * during a normal aging round.
+ * Estimate whether a zone has enough inactive or free pages..
  */
-#define DEF_PRIORITY (6)
+static unsigned int zone_inactive_plenty(zone_t *zone)
+{
+   unsigned int inactive;
+
+   if (!zone->size)
+      return 0;
+      
+   inactive = zone->inactive_dirty_pages;
+   inactive += zone->inactive_clean_pages;
+   inactive += zone->free_pages;
+
+   return (inactive > (zone->size * 2 / 5));
+}
+
+#define FREE_PLENTY_FACTOR 2
+static unsigned int zone_free_plenty(zone_t *zone)
+{
+   unsigned int free;
+
+   free = zone->free_pages;
+   free += zone->inactive_clean_pages;
+
+   return free > zone->pages_high * FREE_PLENTY_FACTOR;
+}
+
+static unsigned int free_plenty(void)
+{
+   unsigned int free;
+
+   free = nr_free_pages();
+   free += nr_inactive_clean_pages();
+
+   return free > freepages.high * FREE_PLENTY_FACTOR;
+}
+
+/*
+ * We only do page aging if the object in question is in use or
+ * if the cache is getting small. The "small cache" thing happens
+ * when the working set of processes is getting very large and we
+ * need to be careful which pages we evict...
+ */
+static inline int cache_is_small(void)
+{
+   int bufferpages = atomic_read(&buffermem_pages);
+   int pagecache = atomic_read(&page_cache_size) - swapper_space.nrpages;
+
+   int limit = num_physpages * page_cache.borrow_percent / 100;
+
+   return bufferpages + pagecache < limit;
+}
+
+static inline int page_mapping_notused(struct page * page)
+{
+   struct address_space * mapping = page->mapping;
+
+   /*
+    * If a swap cache page is in the RSS of a process, we age it.
+    * Otherwise, we don't.
+    */
+   if (PageSwapCache(page)) {
+      if (page_count(page) > (1 + !!page->buffers))
+         return 0;
+
+      return 1;
+   }
+
+   /* If the cache is small, always use page aging. */
+   if (cache_is_small())
+      return 0;
+
+   if (!mapping)
+      return 1;
+
+   /* Mapping holds over 1/20th of the pagecache and would monopolise it. */
+   if (mapping->nrpages > atomic_read(&page_cache_size) / 20)
+      return 1;
+
+   /* File is mmaped by somebody. */
+   if (mapping->i_mmap || mapping->i_mmap_shared)
+      return 0;
+
+   return 1;
+}
 
 /*
  * The swap-out function returns 1 if it successfully
@@ -43,24 +132,43 @@
  */
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, str+
uct page *page, zone_t * classzone)
+static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct pa+
ge *page)
 {
    pte_t pte;
    swp_entry_t entry;
-   int right_classzone;
 
    /* Don't look at this pte if it's been accessed recently. */
    if (ptep_test_and_clear_young(page_table)) {
-      flush_tlb_page(vma, address);
-      return 0;
+      age_page_up(page);
+      return;
    }
 
-   if (TryLockPage(page))
-      return 0;
+   /*
+    * We age down really anonymous pages here, pages which
+    * already have a mapping are aged down on the active
+    * list instead.
+    * This is done so heavily shared pages (think libc.so)
+    * are only aged down once and won't be swapped out when
+    * still in active use.
+    */
+   if (!page->mapping)
+      age_page_down(page);
+
+   /* 
+    * If we have plenty inactive pages on this 
+    * zone, skip it.
+    */
+   if (zone_inactive_plenty(page->zone))
+      return;
+
+   /*
+    * Don't swap out a page which is still in use.
+    */
+   if (page->age > 0)
+      return;
 
-   right_classzone = 1;
-   if (!memclass(page->zone, classzone))
-      right_classzone = 0;
+   if (TryLockPage(page))
+      return;
 
    /* From this point on, the odds are that we're going to
     * nuke this pte, so read and clear the pte.  This hook
@@ -85,12 +193,11 @@
       set_pte(page_table, swp_entry_to_pte(entry));
 drop_pte:
       mm->rss--;
+      if (!page->age)
+         deactivate_page(page);
       UnlockPage(page);
-      {
-         int freeable = page_count(page) - !!page->buffers <= 2;
-         page_cache_release(page);
-         return freeable & right_classzone;
-      }
+      page_cache_release(page);
+      return;
    }
 
    /*
@@ -141,11 +248,11 @@
    /* No swap space left */
    set_pte(page_table, pte);
    UnlockPage(page);
-   return 0;
+   return;
 }
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long+
 end, int count, zone_t * classzone)
+static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, i+
nt count)
 {
    pte_t * pte;
    unsigned long pmd_end;
@@ -169,22 +276,20 @@
          struct page *page = pte_page(*pte);
 
          if (VALID_PAGE(page) && !PageReserved(page)) {
-            count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
-            if (!count) {
-               address += PAGE_SIZE;
+            try_to_swap_out(mm, vma, address, pte, page);
+            if (!--count)
                break;
-            }
          }
       }
       address += PAGE_SIZE;
       pte++;
    } while (address && (address < end));
-   mm->swap_address = address;
+   mm->swap_address = address + PAGE_SIZE;
    return count;
 }
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long+
 end, int count, zone_t * classzone)
+static inline int swap_out_pgd( struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned lon+
g end, int count)
 {
    pmd_t * pmd;
    unsigned long pgd_end;
@@ -204,7 +309,7 @@
       end = pgd_end;
    
    do {
-      count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
+      count = swap_out_pmd(mm, vma, pmd, address, end, count);
       if (!count)
          break;
       address = (address + PMD_SIZE) & PMD_MASK;
@@ -214,7 +319,7 @@
 }
 
 /* mm->page_table_lock is held. mmap_sem is not held */
-static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * class+
zone)
+static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count)
 {
    pgd_t *pgdir;
    unsigned long end;
@@ -229,7 +334,7 @@
    if (address >= end)
       BUG();
    do {
-      count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
+      count = swap_out_pgd(mm, vma, pgdir, address, end, count);
       if (!count)
          break;
       address = (address + PGDIR_SIZE) & PGDIR_MASK;
@@ -238,424 +343,719 @@
    return count;
 }
 
-/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
-struct mm_struct *swap_mm = &init_mm;
-
 /*
- * Returns remaining count of pages to be swapped out by followup call.
+ * Returns non-zero if we scanned all `count' pages
  */
-static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
+static int swap_out_mm(struct mm_struct * mm, int count)
 {
    unsigned long address;
    struct vm_area_struct* vma;
 
+   if (!count)
+      return 1;
+   /*
+    * Go through process' page directory.
+    */
+
    /*
     * Find the proper vm-area after freezing the vma chain 
     * and ptes.
     */
    spin_lock(&mm->page_table_lock);
    address = mm->swap_address;
-   if (address == TASK_SIZE || swap_mm != mm) {
-      /* We raced: don't count this mm but try again */
-      ++*mmcounter;
-      goto out_unlock;
-   }
    vma = find_vma(mm, address);
    if (vma) {
       if (address < vma->vm_start)
          address = vma->vm_start;
 
       for (;;) {
-         count = swap_out_vma(mm, vma, address, count, classzone);
+         count = swap_out_vma(mm, vma, address, count);
+         if (!count)
+            goto out_unlock;
          vma = vma->vm_next;
          if (!vma)
             break;
-         if (!count)
-            goto out_unlock;
          address = vma->vm_start;
       }
    }
-   /* Indicate that we reached the end of address space */
-   mm->swap_address = TASK_SIZE;
+   /* Reset to 0 when we reach the end of address space */
+   mm->swap_address = 0;
 
 out_unlock:
    spin_unlock(&mm->page_table_lock);
-   return count;
+   return !count;
+}
+
+#define SWAP_SHIFT   5
+#define SWAP_MIN   8
+
+static inline int swap_amount(struct mm_struct *mm)
+{
+   int nr = mm->rss >> SWAP_SHIFT;
+   if (nr < SWAP_MIN) {
+      nr = SWAP_MIN;
+      if (nr > mm->rss)
+         nr = mm->rss;
+   }
+   return nr;
 }
 
-static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
-static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
+/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
+struct mm_struct *swap_mm = &init_mm;
+
+static void swap_out(unsigned int priority, int gfp_mask)
 {
    int counter;
-   struct mm_struct *mm;
+   int retval = 0;
+   struct mm_struct *mm = current->mm;
 
-   /* Then, look at the other mm's */
-   counter = mmlist_nr / priority;
+   /* Scan part of the process virtual memory. */
+   counter = (mmlist_nr << SWAP_SHIFT) >> priority;
    do {
-      if (unlikely(current->need_resched)) {
-         __set_current_state(TASK_RUNNING);
-         schedule();
-      }
-
       spin_lock(&mmlist_lock);
       mm = swap_mm;
-      while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
-         mm->swap_address = 0;
+      if (mm == &init_mm) {
          mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
-         if (mm == swap_mm)
+         if (mm == &init_mm)
             goto empty;
-         swap_mm = mm;
       }
+      /* Set pointer for next call to next in the list */
+      swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
 
       /* Make sure the mm doesn't disappear when we drop the lock.. */
       atomic_inc(&mm->mm_users);
       spin_unlock(&mmlist_lock);
 
-      nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
-
+      /* Walk about 6% of the address space each time */
+      retval |= swap_out_mm(mm, swap_amount(mm));
       mmput(mm);
-
-      if (!nr_pages)
-         return 1;
    } while (--counter >= 0);
-
-   return 0;
+   return;
 
 empty:
    spin_unlock(&mmlist_lock);
-   return 0;
 }
 
-static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask));
-static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)
+
+/**
+ * reclaim_page -   reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
+ */
+struct page * reclaim_page(zone_t * zone)
 {
-   struct list_head * entry;
+   struct page * page = NULL;
+   struct list_head * page_lru;
+   swp_entry_t entry = {0};
+   int maxscan;
 
+   /*
+    * We only need the pagemap_lru_lock if we don't reclaim the page,
+    * but we have to grab the pagecache_lock before the pagemap_lru_lock
+    * to avoid deadlocks and most of the time we'll succeed anyway.
+    */
+   spin_lock(&pagecache_lock);
    spin_lock(&pagemap_lru_lock);
-   while (max_scan && (entry = inactive_list.prev) != &inactive_list) {
-      struct page * page;
+   maxscan = zone->inactive_clean_pages;
+   while ((page_lru = zone->inactive_clean_list.prev) !=
+         &zone->inactive_clean_list && maxscan--) {
+      page = list_entry(page_lru, struct page, lru);
+
+      /* Wrong page on list?! (list corruption, should not happen) */
+      if (!PageInactiveClean(page)) {
+         printk("VM: reclaim_page, wrong page on list.\n");
+         list_del(page_lru);
+         page->zone->inactive_clean_pages--;
+         continue;
+      }
 
-      if (unlikely(current->need_resched)) {
-         spin_unlock(&pagemap_lru_lock);
-         __set_current_state(TASK_RUNNING);
-         schedule();
-         spin_lock(&pagemap_lru_lock);
+      /* Page is or was in use?  Move it to the active list. */
+      if (PageReferenced(page) || page->age > 0 ||
+            page_count(page) > (1 + !!page->buffers)) {
+         del_page_from_inactive_clean_list(page);
+         add_page_to_active_list(page);
+         page->age = max((int)page->age, PAGE_AGE_START);
          continue;
       }
 
-      page = list_entry(entry, struct page, lru);
+      /* The page is dirty, or locked, move to inactive_dirty list. */
+      if (page->buffers || PageDirty(page) || TryLockPage(page)) {
+         del_page_from_inactive_clean_list(page);
+         add_page_to_inactive_dirty_list(page);
+         continue;
+      }
 
-      if (unlikely(!PageInactive(page) && !PageActive(page)))
-         BUG();
+      /* OK, remove the page from the caches. */
+                if (PageSwapCache(page)) {
+         entry.val = page->index;
+         __delete_from_swap_cache(page);
+         goto found_page;
+      }
 
-      list_del(entry);
-      list_add(entry, &inactive_list);
-      if (PageTestandClearReferenced(page))
-         continue;
+      if (page->mapping) {
+         __remove_inode_page(page);
+         goto found_page;
+      }
+
+      /* We should never ever get here. */
+      printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+      list_del(page_lru);
+      zone->inactive_clean_pages--;
+      UnlockPage(page);
+   }
+   spin_unlock(&pagemap_lru_lock);
+   spin_unlock(&pagecache_lock);
+   return NULL;
 
-      max_scan--;
+found_page:
+   del_page_from_inactive_clean_list(page);
+   spin_unlock(&pagemap_lru_lock);
+   spin_unlock(&pagecache_lock);
+   if (entry.val)
+      swap_free(entry);
+   UnlockPage(page);
+   page->age = PAGE_AGE_START;
+   if (page_count(page) != 1)
+      printk("VM: reclaim_page, found page with count %d!\n",
+            page_count(page));
+   return page;
+}
 
-      if (unlikely(!memclass(page->zone, classzone)))
-         continue;
+static inline int page_dirty(struct page *page)
+{
+   struct buffer_head *tmp, *bh;
+
+   if (PageDirty(page))
+      return 1;
+
+   if (page->mapping && !page->buffers)
+      return 0;
+
+   tmp = bh = page->buffers;
+
+   do {
+      if (tmp->b_state & ((1<<BH_Dirty) | (1<<BH_Lock)))
+         return 1;
+      tmp = tmp->b_this_page;
+   } while (tmp != bh);
+
+   return 0;
+}
+
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ * @sync: are we allowed to do synchronous IO in emergencies ?
+ *
+ * This function is called when we are low on free / inactive_clean
+ * pages, its purpose is to refill the free/clean list as efficiently
+ * as possible.
+ *
+ * This means we do writes asynchronously as long as possible and will
+ * only sleep on IO when we don't have another option. Since writeouts
+ * cause disk seeks and make read IO slower, we skip writes altogether
+ * when the amount of dirty pages is small.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+#define   CAN_DO_FS   ((gfp_mask & __GFP_FS) && should_write)
+#define   WRITE_LOW_WATER      5
+#define   WRITE_HIGH_WATER   10
+int page_launder(int gfp_mask, int sync)
+{
+   int maxscan, cleaned_pages, failed_pages, total;
+   struct list_head * page_lru;
+   struct page * page;
+
+   /*
+    * This flag determines if we should do writeouts of dirty data
+    * or not.  When more than WRITE_HIGH_WATER percentage of the
+    * pages we process would need to be written out we set this flag
+    * and will do writeout, the flag is cleared once we go below
+    * WRITE_LOW_WATER.  Note that only pages we actually process
+    * get counted, ie. pages where we make it beyond the TryLock.
+    *
+    * XXX: These flags still need tuning.
+    */
+   static int should_write = 0;
+
+   cleaned_pages = 0;
+   failed_pages = 0;
+   
+   /*
+    * The gfp_mask tells try_to_free_buffers() below if it may
+    * start IO and whether it may wait on IO synchronously.
+    *
+    * Note that synchronous IO only happens when a page has not been
+    * written out yet when we see it for a second time, this is done
+    * through magic in try_to_free_buffers().
+    */
+   if (!should_write)
+      gfp_mask &= ~(__GFP_WAIT | __GFP_IO);
+   else if (!sync)
+      gfp_mask &= ~__GFP_WAIT;
 
-      /* Racy check to avoid trylocking when not worthwhile */
-      if (!page->buffers && page_count(page) != 1)
+   /* The main launder loop. */
+   spin_lock(&pagemap_lru_lock);
+   maxscan = nr_inactive_dirty_pages;
+   while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
+            maxscan-- > 0) {
+      page = list_entry(page_lru, struct page, lru);
+
+      /* Wrong page on list?! (list corruption, should not happen) */
+      if (!PageInactiveDirty(page)) {
+         printk("VM: page_launder, wrong page on list.\n");
+         list_del(page_lru);
+         nr_inactive_dirty_pages--;
+         page->zone->inactive_dirty_pages--;
          continue;
+      }
 
       /*
-       * The page is locked. IO in progress?
-       * Move it to the back of the list.
+       * The page is in active use or really unfreeable. Move to
+       * the active list and adjust the page age if needed.
        */
-      if (unlikely(TryLockPage(page)))
+      if (PageReferenced(page) || page->age || page_ramdisk(page)) {
+         del_page_from_inactive_dirty_list(page);
+         add_page_to_active_list(page);
+         page->age = max((int)page->age, PAGE_AGE_START);
          continue;
+      }
 
-      if (PageDirty(page) && is_page_cache_freeable(page)) {
-         /*
-          * It is not critical here to write it only if
-          * the page is unmapped beause any direct writer
-          * like O_DIRECT would set the PG_dirty bitflag
-          * on the phisical page after having successfully
-          * pinned it and after the I/O to the page is finished,
-          * so the direct writes to the page cannot get lost.
-          */
-         int (*writepage)(struct page *);
-
-         writepage = page->mapping->a_ops->writepage;
-         if ((gfp_mask & __GFP_FS) && writepage) {
-            ClearPageDirty(page);
-            page_cache_get(page);
-            spin_unlock(&pagemap_lru_lock);
-
-            writepage(page);
-            page_cache_release(page);
+      /*
+       * The page is still in the page tables of some process,
+       * move it to the active list but leave page age at 0;
+       * either swap_out() will make it freeable soon or it is
+       * mlock()ed...
+       *
+       * The !PageLocked() test is to protect us from ourselves,
+       * see the code around the writepage() call.
+       */
+      if ((page_count(page) > (1 + !!page->buffers)) &&
+                  !PageLocked(page)) {
+         del_page_from_inactive_dirty_list(page);
+         add_page_to_active_list(page);
+         continue;
+      }
 
-            spin_lock(&pagemap_lru_lock);
+      /*
+       * If this zone has plenty of pages free, don't spend time
+       * on cleaning it but only move clean pages out of the way
+       * so we won't have to scan those again.
+       */
+      if (zone_free_plenty(page->zone)) {
+         if (!page->mapping || page_dirty(page)) {
+            list_del(page_lru);
+            list_add(page_lru, &inactive_dirty_list);
             continue;
          }
       }
 
       /*
-       * If the page has buffers, try to free the buffer mappings
-       * associated with this page. If we succeed we try to free
-       * the page as well.
+       * The page is locked. IO in progress?
+       * Move it to the back of the list.
        */
-      if (page->buffers) {
-         spin_unlock(&pagemap_lru_lock);
-
-         /* avoid to free a locked page */
-         page_cache_get(page);
-
-         if (try_to_free_buffers(page, gfp_mask)) {
-            if (!page->mapping) {
-               /*
-                * We must not allow an anon page
-                * with no buffers to be visible on
-                * the LRU, so we unlock the page after
-                * taking the lru lock
-                */
-               spin_lock(&pagemap_lru_lock);
-               UnlockPage(page);
-               __lru_cache_del(page);
+      if (TryLockPage(page)) {
+         list_del(page_lru);
+         list_add(page_lru, &inactive_dirty_list);
+         continue;
+      }
 
-               /* effectively free the page here */
-               page_cache_release(page);
+      /*
+       * Dirty swap-cache page? Write it out if
+       * last copy..
+       */
+      if (PageDirty(page)) {
+         int (*writepage)(struct page *);
 
-               if (--nr_pages)
-                  continue;
-               break;
-            } else {
-               /*
-                * The page is still in pagecache so undo the stuff
-                * before the try_to_free_buffers since we've not
-                * finished and we can now try the next step.
-                */
-               page_cache_release(page);
+         /* Can a page get here without page->mapping? */
+         if (!page->mapping)
+            goto page_active;
+         writepage = page->mapping->a_ops->writepage;
+         if (!writepage)
+            goto page_active;
 
-               spin_lock(&pagemap_lru_lock);
-            }
-         } else {
-            /* failed to drop the buffers so stop here */
+         /* Can't do it? Move it to the back of the list. */
+         if (!CAN_DO_FS) {
+            list_del(page_lru);
+            list_add(page_lru, &inactive_dirty_list);
             UnlockPage(page);
-            page_cache_release(page);
-
-            spin_lock(&pagemap_lru_lock);
+            failed_pages++;
             continue;
          }
-      }
 
-      if (unlikely(!page->mapping))
-         BUG();
-
-      if (unlikely(!spin_trylock(&pagecache_lock))) {
-         /* we hold the page lock so the page cannot go away from under us */
+         /* OK, do a physical asynchronous write to swap.  */
+         ClearPageDirty(page);
+         page_cache_get(page);
          spin_unlock(&pagemap_lru_lock);
 
-         spin_lock(&pagecache_lock);
+         writepage(page);
+         page_cache_release(page);
+
+         /* And re-start the thing.. */
          spin_lock(&pagemap_lru_lock);
+         continue;
       }
 
       /*
-       * this is the non-racy check, it is critical to check
-       * PageDirty _after_ we made sure the page is freeable
-       * so not in use by anybody.
+       * If the page has buffers, try to free the buffer mappings
+       * associated with this page. If we succeed we either free
+       * the page (in case it was a buffercache only page) or we
+       * move the page to the inactive_clean list.
+       *
+       * On the first round, we should free all previously cleaned
+       * buffer pages
        */
-      if (!is_page_cache_freeable(page) || PageDirty(page)) {
-         spin_unlock(&pagecache_lock);
-         UnlockPage(page);
-         continue;
-      }
+      if (page->buffers) {
+         int clearedbuf;
+         /*
+          * Since we might be doing disk IO, we have to
+          * drop the spinlock and take an extra reference
+          * on the page so it doesn't go away from under us.
+          */
+         del_page_from_inactive_dirty_list(page);
+         page_cache_get(page);
+         spin_unlock(&pagemap_lru_lock);
 
-      /* point of no return */
-      if (likely(!PageSwapCache(page))) {
-         __remove_inode_page(page);
-         spin_unlock(&pagecache_lock);
-      } else {
-         swp_entry_t swap;
-         swap.val = page->index;
-         __delete_from_swap_cache(page);
-         spin_unlock(&pagecache_lock);
-         swap_free(swap);
-      }
+         /* Try to free the page buffers. */
+         clearedbuf = try_to_release_page(page, gfp_mask);
 
-      __lru_cache_del(page);
-      UnlockPage(page);
+         /*
+          * Re-take the spinlock. Note that we cannot
+          * unlock the page yet since we're still
+          * accessing the page_struct here...
+          */
+         spin_lock(&pagemap_lru_lock);
 
-      /* effectively free the page here */
-      page_cache_release(page);
+         /* The buffers were not freed. */
+         if (!clearedbuf) {
+            add_page_to_inactive_dirty_list(page);
+            failed_pages++;
+
+         /* The page was only in the buffer cache. */
+         } else if (!page->mapping) {
+            atomic_dec(&buffermem_pages);
+            cleaned_pages++;
+
+         /* The page has more users besides the cache and us. */
+         } else if (page_count(page) > 2) {
+            add_page_to_active_list(page);
+
+         /* OK, we "created" a freeable page. */
+         } else /* page->mapping && page_count(page) == 2 */ {
+            add_page_to_inactive_clean_list(page);
+            cleaned_pages++;
+         }
+
+         /*
+          * Unlock the page and drop the extra reference.
+          * We can only do it here because we are accessing
+          * the page struct above.
+          */
+         UnlockPage(page);
+         page_cache_release(page);
 
-      if (--nr_pages)
          continue;
-      break;
+      } else if (page->mapping && !PageDirty(page)) {
+         /*
+          * If a page had an extra reference in
+          * deactivate_page(), we will find it here.
+          * Now the page is really freeable, so we
+          * move it to the inactive_clean list.
+          */
+         del_page_from_inactive_dirty_list(page);
+         add_page_to_inactive_clean_list(page);
+         UnlockPage(page);
+         cleaned_pages++;
+      } else {
+page_active:
+         /*
+          * OK, we don't know what to do with the page.
+          * It's no use keeping it here, so we move it to
+          * the active list.
+          */
+         del_page_from_inactive_dirty_list(page);
+         add_page_to_active_list(page);
+         page->age = max((int)page->age, PAGE_AGE_START);
+         UnlockPage(page);
+      }
    }
    spin_unlock(&pagemap_lru_lock);
 
-   return nr_pages;
+   /*
+    * Set the should_write flag, for the next callers of page_launder.
+    * If we go below the low watermark we stop the writeout of dirty
+    * pages, writeout is started when we get above the high watermark.
+    */
+   total = failed_pages + cleaned_pages;
+   if (should_write && failed_pages * 100 < WRITE_LOW_WATER * total)
+      should_write = 0;
+   else if (!should_write && failed_pages * 100 > WRITE_HIGH_WATER * total)
+      should_write = 1;
+
+   /* Return the number of pages moved to the inactive_clean list. */
+   return cleaned_pages;
 }
 
-/*
- * This moves pages from the active list to
- * the inactive list.
+/**
+ * refill_inactive_scan - scan the active list and find pages to deactivate
+ * @priority: the priority at which to scan
+ * @target: number of pages to deactivate, zero for background aging
  *
- * We move them the other way when we see the
- * reference bit on the page.
+ * This function will scan a portion of the active list to find
+ * unused pages, those pages will then be moved to the inactive list.
  */
-static void refill_inactive(int nr_pages)
+int refill_inactive_scan(unsigned int priority)
 {
-   struct list_head * entry;
+   struct list_head * page_lru;
+   struct page * page;
+   int maxscan = nr_active_pages >> priority;
+   int nr_deactivated = 0;
 
+   /* Take the lock while messing with the list... */
    spin_lock(&pagemap_lru_lock);
-   entry = active_list.prev;
-   while (nr_pages-- && entry != &active_list) {
-      struct page * page;
+   while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
+      page = list_entry(page_lru, struct page, lru);
 
-      page = list_entry(entry, struct page, lru);
-      entry = entry->prev;
-      if (PageTestandClearReferenced(page)) {
-         list_del(&page->lru);
-         list_add(&page->lru, &active_list);
+      /* Wrong page on list?! (list corruption, should not happen) */
+      if (!PageActive(page)) {
+         printk("VM: refill_inactive, wrong page on list.\n");
+         list_del(page_lru);
+         nr_active_pages--;
          continue;
       }
 
-      del_page_from_active_list(page);
-      add_page_to_inactive_list(page);
+      /*
+       * Do aging on the pages.  Every time a page is referenced,
+       * page->age gets incremented.  If it wasn't referenced, we
+       * decrement page->age.  The page gets moved to the inactive
+       * list when one of the following is true:
+       * - the page age reaches 0
+       * - the object the page belongs to isn't in active use
+       * - the object the page belongs to is hogging the cache
+       */
+      if (PageTestandClearReferenced(page)) {
+         age_page_up(page);
+      } else {
+         age_page_down(page);
+      }
+
+      /*
+       * Don't deactivate pages from zones which have
+       * plenty inactive pages.
+       */
+      if (zone_inactive_plenty(page->zone)) {
+         goto skip_page;
+      }
+
+      /* Deactivate a page once page->age reaches 0. */
+      if (!page->age)
+         deactivate_page_nolock(page);
+
+      /*
+       * Deactivate pages from files which aren't in use, busy
+       * pages will be referenced while on the inactive list.
+       */
+      if (page_mapping_notused(page))
+         deactivate_page_nolock(page);
+
+      /*
+       * If the page is still on the active list, move it
+       * to the other end of the list. Otherwise we exit if
+       * we have done enough work.
+       */
+skip_page:
+      if (PageActive(page)) {
+         list_del(page_lru);
+         list_add(page_lru, &active_list);
+      } else {
+         nr_deactivated++;
+      }
    }
    spin_unlock(&pagemap_lru_lock);
+
+   return nr_deactivated;
 }
 
-static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
-static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
+long count_ramdisk_pages(void)
 {
-   int max_scan;
-   int chunk_size = nr_pages;
-   unsigned long ratio;
-
-   nr_pages -= kmem_cache_reap(gfp_mask);
-   if (nr_pages <= 0)
-      return 0;
+   struct list_head *page_lru;
+   struct page *page;
+   long nr_ramdisk = 0;
 
-   nr_pages = chunk_size;
-   /* try to keep the active list 2/3 of the size of the cache */
-   ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
-   refill_inactive(ratio);
-  
-   max_scan = nr_inactive_pages / priority;
-   nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask);
-   if (nr_pages <= 0)
-      return 0;
-
-   shrink_dcache_memory(priority, gfp_mask);
-   shrink_icache_memory(priority, gfp_mask);
-#ifdef CONFIG_QUOTA
-   shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
-#endif
+   spin_lock(&pagemap_lru_lock);
+   for (page_lru = active_list.next; page_lru != &active_list;
+        page_lru = page_lru->next) {
+      page = list_entry(page_lru, struct page, lru);
+      if (page_ramdisk(page))
+         nr_ramdisk ++;
+   }
+   spin_unlock(&pagemap_lru_lock);
 
-   return nr_pages;
+   return nr_ramdisk;
 }
 
-int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order)
+/*
+ * Check if there are zones with a severe shortage of free pages,
+ * or if all zones have a minor shortage.
+ */
+int free_shortage(void)
 {
-   int ret = 0;
-   int priority = DEF_PRIORITY;
-   int nr_pages = SWAP_CLUSTER_MAX;
+   pg_data_t *pgdat;
+   unsigned int global_free = 0;
+   unsigned int global_target = freepages.high;
 
+   /* Are we low on free pages anywhere? */
+   pgdat = pgdat_list;
    do {
-      nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages);
-      if (nr_pages <= 0)
-         return 1;
+      int i;
+      for(i = 0; i < MAX_NR_ZONES; i++) {
+         zone_t *zone = pgdat->node_zones+ i;
+         unsigned int free;
 
-      ret |= swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX << 2);
-   } while (--priority);
+         if (!zone->size)
+            continue;
 
-   return ret;
-}
+         free = zone->free_pages;
+         free += zone->inactive_clean_pages;
 
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+         /* Local shortage? */
+         if (free < zone->pages_low)
+            return 1;
 
-static int check_classzone_need_balance(zone_t * classzone)
-{
-   zone_t * first_classzone;
+         global_free += free;
+      }
+      pgdat = pgdat->node_next;
+   } while (pgdat);
 
-   first_classzone = classzone->zone_pgdat->node_zones;
-   while (classzone >= first_classzone) {
-      if (classzone->free_pages > classzone->pages_high)
-         return 0;
-      classzone--;
-   }
-   return 1;
+   /* Global shortage? */
+   return global_free < global_target;
 }
 
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
+/*
+ * Are we low on inactive pages globally or in any zone?
+ */
+int inactive_shortage(void)
 {
-   int need_more_balance = 0, i;
-   zone_t * zone;
+   pg_data_t *pgdat;
+   unsigned int global_target = freepages.high + inactive_target();
+   unsigned int global_inactive = 0;
 
-   for (i = pgdat->nr_zones-1; i >= 0; i--) {
-      zone = pgdat->node_zones + i;
-      if (unlikely(current->need_resched))
-         schedule();
-      if (!zone->need_balance)
-         continue;
-      if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
-         zone->need_balance = 0;
-         __set_current_state(TASK_INTERRUPTIBLE);
-         schedule_timeout(HZ);
-         continue;
+   pgdat = pgdat_list;
+   do {
+      int i;
+      for(i = 0; i < MAX_NR_ZONES; i++) {
+         zone_t *zone = pgdat->node_zones + i;
+         unsigned int inactive;
+
+         if (!zone->size)
+            continue;
+
+         inactive  = zone->inactive_dirty_pages;
+         inactive += zone->inactive_clean_pages;
+         inactive += zone->free_pages;
+
+         /* Local shortage? */
+         if (inactive < zone->pages_high)
+            return 1;
+
+         global_inactive += inactive;
       }
-      if (check_classzone_need_balance(zone))
-         need_more_balance = 1;
-      else
-         zone->need_balance = 0;
-   }
+      pgdat = pgdat->node_next;
+   } while (pgdat);
 
-   return need_more_balance;
+   /* Global shortage? */
+   return global_inactive < global_target;
 }
 
-static void kswapd_balance(void)
+#define DEF_PRIORITY (6)
+
+/*
+ * Refill_inactive is the function used to scan and age the pages on
+ * the active list and in the working set of processes, moving the
+ * little-used pages to the inactive list.
+ *
+ * When called by kswapd, we try to deactivate as many pages as needed
+ * to recover from the inactive page shortage. This makes it possible
+ * for kswapd to keep up with memory demand so user processes can get
+ * low latency on memory allocations.
+ *
+ * However, when the system starts to get overloaded we can get called
+ * by user processes. For user processes we want to both reduce the
+ * latency and make sure that multiple user processes together don't
+ * deactivate too many pages. To achieve this we simply do less work
+ * when called from a user process.
+ */
+static int refill_inactive(unsigned int gfp_mask)
 {
-   int need_more_balance;
-   pg_data_t * pgdat;
+   int progress = 0, maxtry;
+
+   maxtry = 1 << DEF_PRIORITY;
 
    do {
-      need_more_balance = 0;
-      pgdat = pgdat_list;
-      do
-         need_more_balance |= kswapd_balance_pgdat(pgdat);
-      while ((pgdat = pgdat->node_next));
-      if (need_more_balance && out_of_memory()) {
-         oom_kill();   
+      if (current->need_resched) {
+          __set_current_state(TASK_RUNNING);
+         schedule();
+         if (!inactive_shortage())
+            return 1;
       }
-   } while (need_more_balance);
-}
 
-static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
-{
-   zone_t * zone;
-   int i;
+      /* Walk the VM space for a bit.. */
+      swap_out(DEF_PRIORITY, gfp_mask);
 
-   for (i = pgdat->nr_zones-1; i >= 0; i--) {
-      zone = pgdat->node_zones + i;
-      if (!zone->need_balance)
-         continue;
-      return 0;
-   }
+      /* ..and refill the inactive list */
+      progress += refill_inactive_scan(DEF_PRIORITY);
 
-   return 1;
+      if (--maxtry <= 0)
+         break;
+   } while (inactive_shortage());
+
+   return progress;
 }
 
-static int kswapd_can_sleep(void)
+/*
+ * Worker function for kswapd and try_to_free_pages, we get
+ * called whenever there is a shortage of free/inactive_clean
+ * pages.
+ *
+ * This function will also move pages to the inactive list,
+ * if needed.
+ */
+static int do_try_to_free_pages(unsigned int gfp_mask, int user)
 {
-   pg_data_t * pgdat;
+   int ret = 0;
 
-   pgdat = pgdat_list;
-   do {
-      if (kswapd_can_sleep_pgdat(pgdat))
-         continue;
-      return 0;
-   } while ((pgdat = pgdat->node_next));
+   /*
+    * Eat memory from filesystem page cache, buffer cache,
+    * dentry, inode and filesystem quota caches.
+    */
+   ret += page_launder(gfp_mask, user);
+   shrink_dcache_memory(0, gfp_mask);
+   shrink_icache_memory(0, gfp_mask);
+   shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
 
-   return 1;
+   /*
+    * If needed, we move pages from the active list
+    * to the inactive list.
+    */
+   if (inactive_shortage())
+      ret += refill_inactive(gfp_mask);
+
+   /*    
+    * Reclaim unused slab cache memory.
+    */
+   kmem_cache_reap(gfp_mask);
+
+   return ret;
 }
 
+DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process. 
@@ -672,7 +1072,6 @@
 int kswapd(void *unused)
 {
    struct task_struct *tsk = current;
-   DECLARE_WAITQUEUE(wait, tsk);
 
    daemonize();
    strcpy(tsk->comm, "kswapd");
@@ -696,31 +1095,132 @@
     * Kswapd main loop.
     */
    for (;;) {
-      __set_current_state(TASK_INTERRUPTIBLE);
-      add_wait_queue(&kswapd_wait, &wait);
+      static long recalc = 0;
 
-      mb();
-      if (kswapd_can_sleep())
-         schedule();
+      /*
+       * We try to rebalance the VM either when we are short
+       * on free pages or when we have a shortage of inactive
+       * pages and are getting low on free pages.
+       */
+      if (free_shortage() || (inactive_shortage() && !free_plenty()))
+         do_try_to_free_pages(GFP_KSWAPD, 0);
 
-      __set_current_state(TASK_RUNNING);
-      remove_wait_queue(&kswapd_wait, &wait);
+      /* Once a second ... */
+      if (time_after(jiffies, recalc + HZ)) {
+         recalc = jiffies;
 
+         /* Do background page aging. */
+         refill_inactive_scan(DEF_PRIORITY);
+      }
+
+      /* 
+       * We go to sleep if either the free page shortage
+       * or the inactive page shortage is gone. We do this
+       * because:
+       * 1) we need no more free pages   or
+       * 2) the inactive pages need to be flushed to disk,
+       *    it wouldn't help to eat CPU time now ...
+       *
+       * We go to sleep for one second, but if it's needed
+       * we'll be woken up earlier...
+       */
+      if (!free_shortage() || !inactive_shortage()) {
+         interruptible_sleep_on_timeout(&kswapd_wait, HZ);
       /*
-       * If we actually get into a low-memory situation,
-       * the processes needing more memory will wake us
-       * up on a more timely basis.
+       * If we couldn't free enough memory, we see if it was
+       * due to the system just not having enough memory.
+       * If that is the case, the only solution is to kill
+       * a process (the alternative is eternal deadlock).
+       *
+       * If there still is enough memory around, we just loop
+       * and try to free some more memory...
        */
-      kswapd_balance();
-      run_task_queue(&tq_disk);
+      } else if (out_of_memory()) {
+         oom_kill();
+      }
+   }
+}
+
+void wakeup_kswapd(void)
+{
+   if (waitqueue_active(&kswapd_wait))
+      wake_up_interruptible(&kswapd_wait);
+}
+
+/*
+ * Called by non-kswapd processes when they want more
+ * memory but are unable to sleep on kswapd because
+ * they might be holding some IO locks ...
+ */
+int try_to_free_pages(unsigned int gfp_mask)
+{
+   int ret = 1;
+
+   if (gfp_mask & __GFP_WAIT) {
+      current->flags |= PF_MEMALLOC;
+      ret = do_try_to_free_pages(gfp_mask, 1);
+      current->flags &= ~PF_MEMALLOC;
    }
+
+   return ret;
 }
 
+DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
+/*
+ * Kreclaimd will move pages from the inactive_clean list to the
+ * free list, in order to keep atomic allocations possible under
+ * all circumstances.
+ */
+int kreclaimd(void *unused)
+{
+   struct task_struct *tsk = current;
+   pg_data_t *pgdat;
+
+   daemonize();
+   strcpy(tsk->comm, "kreclaimd");
+   sigfillset(&tsk->blocked);
+   current->flags |= PF_MEMALLOC;
+
+   while (1) {
+
+      /*
+       * We sleep until someone wakes us up from
+       * page_alloc.c::__alloc_pages().
+       */
+      interruptible_sleep_on(&kreclaimd_wait);
+
+      /*
+       * Move some pages from the inactive_clean lists to
+       * the free lists, if it is needed.
+       */
+      pgdat = pgdat_list;
+      do {
+         int i;
+         for(i = 0; i < MAX_NR_ZONES; i++) {
+            zone_t *zone = pgdat->node_zones + i;
+            if (!zone->size)
+               continue;
+
+            while (zone->free_pages < zone->pages_low) {
+               struct page * page;
+               page = reclaim_page(zone);
+               if (!page)
+                  break;
+               __free_page(page);
+            }
+         }
+         pgdat = pgdat->node_next;
+      } while (pgdat);
+   }
+}
+
+
 static int __init kswapd_init(void)
 {
-   printk("Starting kswapd\n");
+   printk("Starting kswapd v1.8\n");
    swap_setup();
    kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+   kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
    return 0;
 }
 


Comments: webmaster (at) linuxhq.com.
Advertising: banners (at) linuxhq.com.
Compilation ©1998-2008 Linux Headquarters, Inc.