Linux Headquarters
[ Register ]
[ About us ] [ Home Page ]

Advertisement
[ Kernel ] [ Documentation ] [ Links ] [ Books ]

Advertisement

Kernel v2.6.26-rc1 /mm/memory.c

Filename:/mm/memory.c
Lines Added:149
Lines Deleted:79
Also changed in: (Previous) 2.6.25-git20  2.6.25-git19  2.6.25-git18  2.6.25-git17  2.6.25-git16  2.6.25-git15 
(Following) 2.6.26-rc1-git5  2.6.26-rc1-git6  2.6.26-rc1-git7  2.6.26-rc1-git8  2.6.26-rc1-git9  2.6.26-rc2 

Location
[  2.6.26-rc1
  [  mm
     o  memory.c

Patch

diff --git a/mm/memory.c b/mm/memory.c
index 0d14d1e..bbab1e3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,57 +371,93 @@ static inline int is_cow_mapping(unsigned int flags)
 }
 
 /*
- * This function gets the "struct page" associated with a pte.
+ * vm_normal_page -- This function gets the "struct page" associated with a pte.
  *
- * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
- * will have each page table entry just pointing to a raw page frame
- * number, and as far as the VM layer is concerned, those do not have
- * pages associated with them - even if the PFN might point to memory
- * that otherwise is perfectly fine and has a "struct page".
+ * "Special" mappings do not wish to be associated with a "struct page" (either
+ * it doesn't exist, or it exists but they don't want to touch it). In this
+ * case, NULL is returned here. "Normal" mappings do have a struct page.
  *
- * The way we recognize those mappings is through the rules set up
- * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
- * and the vm_pgoff will point to the first PFN mapped: thus every
- * page that is a raw mapping will always honor the rule
+ * There are 2 broad cases. Firstly, an architecture may define a pte_special()
+ * pte bit, in which case this function is trivial. Secondly, an architecture
+ * may not have a spare pte bit, which requires a more complicated scheme,
+ * described below.
+ *
+ * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
+ * special mapping (even if there are underlying and valid "struct pages").
+ * COWed pages of a VM_PFNMAP are always normal.
+ *
+ * The way we recognize COWed pages within VM_PFNMAP mappings is through the
+ * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every special
+ * mapping will always honor the rule
  *
  *   pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
  *
- * and if that isn't true, the page has been COW'ed (in which case it
- * _does_ have a "struct page" associated with it even if it is in a
- * VM_PFNMAP range).
+ * And for normal mappings this is false.
+ *
+ * This restricts such mappings to be a linear translation from virtual address
+ * to pfn. To get around this restriction, we allow arbitrary mappings so long
+ * as the vma is not a COW mapping; in that case, we know that all ptes are
+ * special (because none can have been COWed).
+ *
+ *
+ * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
+ *
+ * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
+ * page" backing, however the difference is that _all_ pages with a struct
+ * page (that is, those where pfn_valid is true) are refcounted and considered
+ * normal pages by the VM. The disadvantage is that pages are refcounted
+ * (which can be slower and simply not an option for some PFNMAP users). The
+ * advantage is that we don't have to follow the strict linearity rule of
+ * PFNMAP mappings in order to support COWable mappings.
+ *
  */
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+# define HAVE_PTE_SPECIAL 1
+#else
+# define HAVE_PTE_SPECIAL 0
+#endif
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+            pte_t pte)
 {
-   unsigned long pfn = pte_pfn(pte);
+   unsigned long pfn;
 
-   if (unlikely(vma->vm_flags & VM_PFNMAP)) {
-      unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
-      if (pfn == vma->vm_pgoff + off)
-         return NULL;
-      if (!is_cow_mapping(vma->vm_flags))
-         return NULL;
+   if (HAVE_PTE_SPECIAL) {
+      if (likely(!pte_special(pte))) {
+         VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+         return pte_page(pte);
+      }
+      VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+      return NULL;
    }
 
-#ifdef CONFIG_DEBUG_VM
-   /*
-    * Add some anal sanity checks for now. Eventually,
-    * we should just do "return pfn_to_page(pfn)", but
-    * in the meantime we check that we get a valid pfn,
-    * and that the resulting page looks ok.
-    */
-   if (unlikely(!pfn_valid(pfn))) {
-      print_bad_pte(vma, pte, addr);
-      return NULL;
+   /* !HAVE_PTE_SPECIAL case follows: */
+
+   pfn = pte_pfn(pte);
+
+   if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+      if (vma->vm_flags & VM_MIXEDMAP) {
+         if (!pfn_valid(pfn))
+            return NULL;
+         goto out;
+      } else {
+         unsigned long off;
+         off = (addr - vma->vm_start) >> PAGE_SHIFT;
+         if (pfn == vma->vm_pgoff + off)
+            return NULL;
+         if (!is_cow_mapping(vma->vm_flags))
+            return NULL;
+      }
    }
-#endif
+
+   VM_BUG_ON(!pfn_valid(pfn));
 
    /*
-    * NOTE! We still have PageReserved() pages in the page 
-    * tables. 
+    * NOTE! We still have PageReserved() pages in the page tables.
     *
-    * The PAGE_ZERO() pages and various VDSO mappings can
-    * cause them to exist.
+    * eg. VDSO mappings can cause them to exist.
     */
+out:
    return pfn_to_page(pfn);
 }
 
@@ -1057,8 +1093,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
       if (pages)
          foll_flags |= FOLL_GET;
       if (!write && !(vma->vm_flags & VM_LOCKED) &&
-          (!vma->vm_ops || (!vma->vm_ops->nopage &&
-               !vma->vm_ops->fault)))
+          (!vma->vm_ops || !vma->vm_ops->fault))
          foll_flags |= FOLL_ANON;
 
       do {
@@ -1141,8 +1176,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
  * old drivers should use this, and they needed to mark their
  * pages reserved for the old functions anyway.
  */
-static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+static int insert_page(struct vm_area_struct *vma, unsigned long addr,
+         struct page *page, pgprot_t prot)
 {
+   struct mm_struct *mm = vma->vm_mm;
    int retval;
    pte_t *pte;
    spinlock_t *ptl;
@@ -1202,40 +1239,26 @@ out:
  *
  * The page does not need to be reserved.
  */
-int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+         struct page *page)
 {
    if (addr < vma->vm_start || addr >= vma->vm_end)
       return -EFAULT;
    if (!page_count(page))
       return -EINVAL;
    vma->vm_flags |= VM_INSERTPAGE;
-   return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+   return insert_page(vma, addr, page, vma->vm_page_prot);
 }
 EXPORT_SYMBOL(vm_insert_page);
 
-/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_inert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-      unsigned long pfn)
+static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+         unsigned long pfn, pgprot_t prot)
 {
    struct mm_struct *mm = vma->vm_mm;
    int retval;
    pte_t *pte, entry;
    spinlock_t *ptl;
 
-   BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-   BUG_ON(is_cow_mapping(vma->vm_flags));
-
    retval = -ENOMEM;
    pte = get_locked_pte(mm, addr, &ptl);
    if (!pte)
@@ -1245,19 +1268,74 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
       goto out_unlock;
 
    /* Ok, finally just insert the thing.. */
-   entry = pfn_pte(pfn, vma->vm_page_prot);
+   entry = pte_mkspecial(pfn_pte(pfn, prot));
    set_pte_at(mm, addr, pte, entry);
-   update_mmu_cache(vma, addr, entry);
+   update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
 
    retval = 0;
 out_unlock:
    pte_unmap_unlock(pte, ptl);
-
 out:
    return retval;
 }
+
+/**
+ * vm_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return NULL.
+ */
+int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+         unsigned long pfn)
+{
+   /*
+    * Technically, architectures with pte_special can avoid all these
+    * restrictions (same for remap_pfn_range).  However we would like
+    * consistency in testing and feature parity among all, so we should
+    * try to keep these invariants in place for everybody.
+    */
+   BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+   BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+                  (VM_PFNMAP|VM_MIXEDMAP));
+   BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+   BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+   if (addr < vma->vm_start || addr >= vma->vm_end)
+      return -EFAULT;
+   return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
 EXPORT_SYMBOL(vm_insert_pfn);
 
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+         unsigned long pfn)
+{
+   BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+
+   if (addr < vma->vm_start || addr >= vma->vm_end)
+      return -EFAULT;
+
+   /*
+    * If we don't have pte special, then we have to use the pfn_valid()
+    * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
+    * refcount the page if pfn_valid is true (hence insert_page rather
+    * than insert_pfn).
+    */
+   if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+      struct page *page;
+
+      page = pfn_to_page(pfn);
+      return insert_page(vma, addr, page, vma->vm_page_prot);
+   }
+   return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_mixed);
+
 /*
  * maps a range of physical memory into the requested pages. the old
  * mappings are removed. any references to nonexistent pages results
@@ -1276,7 +1354,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
    arch_enter_lazy_mmu_mode();
    do {
       BUG_ON(!pte_none(*pte));
-      set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+      set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
       pfn++;
    } while (pte++, addr += PAGE_SIZE, addr != end);
    arch_leave_lazy_mmu_mode();
@@ -2199,20 +2277,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
    BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-   if (likely(vma->vm_ops->fault)) {
-      ret = vma->vm_ops->fault(vma, &vmf);
-      if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
-         return ret;
-   } else {
-      /* Legacy ->nopage path */
-      ret = 0;
-      vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
-      /* no page was available -- either SIGBUS or OOM */
-      if (unlikely(vmf.page == NOPAGE_SIGBUS))
-         return VM_FAULT_SIGBUS;
-      else if (unlikely(vmf.page == NOPAGE_OOM))
-         return VM_FAULT_OOM;
-   }
+   ret = vma->vm_ops->fault(vma, &vmf);
+   if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+      return ret;
 
    /*
     * For consistency in subsequent calls, make the faulted page always
@@ -2377,10 +2444,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
    unsigned long pfn;
 
    pte_unmap(page_table);
-   BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-   BUG_ON(is_cow_mapping(vma->vm_flags));
+   BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+   BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 
    pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+
+   BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
    if (unlikely(pfn == NOPFN_OOM))
       return VM_FAULT_OOM;
    else if (unlikely(pfn == NOPFN_SIGBUS))
@@ -2458,7 +2528,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
    if (!pte_present(entry)) {
       if (pte_none(entry)) {
          if (vma->vm_ops) {
-            if (vma->vm_ops->fault || vma->vm_ops->nopage)
+            if (likely(vma->vm_ops->fault))
                return do_linear_fault(mm, vma, address,
                   pte, pmd, write_access, entry);
             if (unlikely(vma->vm_ops->nopfn))


Comments: webmaster (at) linuxhq.com.
Advertising: banners (at) linuxhq.com.
Compilation ©1998-2008 Linux Headquarters, Inc.