about | summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
author Linus Torvalds 2016-12-14 17:25:18 -0800
committer Linus Torvalds 2016-12-14 17:25:18 -0800
commit a57cb1c1d7974c62a5c80f7869e35b492ace12cd (patch)
tree 5a42ee9a668f171143464bc86013954c1bbe94ad /mm
parent cf1b3341afab9d3ad02a76b3a619ea027dcf4e28 (diff)
parent e1e14ab8411df344a17687821f8f78f0a1e73cbb (diff)
Merge branch 'akpm' (patches from Andrew)

Merge more updates from Andrew Morton:

 - a few misc things

 - kexec updates

 - DMA-mapping updates to better support networking DMA operations

 - IPC updates

 - various MM changes to improve DAX fault handling

 - lots of radix-tree changes, mainly to the test suite. All leading up
   to reimplementing the IDA/IDR code to be a wrapper layer over the
   radix-tree. However the final trigger-pulling patch is held off for
   4.11.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (114 commits)
  radix tree test suite: delete unused rcupdate.c
  radix tree test suite: add new tag check
  radix-tree: ensure counts are initialised
  radix tree test suite: cache recently freed objects
  radix tree test suite: add some more functionality
  idr: reduce the number of bits per level from 8 to 6
  rxrpc: abstract away knowledge of IDR internals
  tpm: use idr_find(), not idr_find_slowpath()
  idr: add ida_is_empty
  radix tree test suite: check multiorder iteration
  radix-tree: fix replacement for multiorder entries
  radix-tree: add radix_tree_split_preload()
  radix-tree: add radix_tree_split
  radix-tree: add radix_tree_join
  radix-tree: delete radix_tree_range_tag_if_tagged()
  radix-tree: delete radix_tree_locate_item()
  radix-tree: improve multiorder iterators
  btrfs: fix race in btrfs_free_dummy_fs_info()
  radix-tree: improve dump output
  radix-tree: make radix_tree_find_next_bit more useful
  ...

Diffstat (limited to 'mm')
-rw-r--r-- mm/compaction.c 17
-rw-r--r-- mm/filemap.c 14
-rw-r--r-- mm/gup.c 20
-rw-r--r-- mm/huge_memory.c 173
-rw-r--r-- mm/internal.h 2
-rw-r--r-- mm/khugepaged.c 31
-rw-r--r-- mm/memory.c 859
-rw-r--r-- mm/nommu.c 10
-rw-r--r-- mm/page-writeback.c 28
-rw-r--r-- mm/page_alloc.c 14
-rw-r--r-- mm/process_vm_access.c 12
-rw-r--r-- mm/shmem.c 32
12 files changed, 643 insertions, 569 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 223464227299..949198d01260 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
page_count(page) > page_mapcount(page))
goto isolate_fail;
+ /*
+ * Only allow to migrate anonymous pages in GFP_NOFS context
+ * because those do not depend on fs locks.
+ */
+ if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
+ goto isolate_fail;
+
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
locked = compact_trylock_irqsave(zone_lru_lock(zone),
@@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio)
{
- int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
- /* Check if the GFP flags allow compaction */
- if (!may_enter_fs || !may_perform_io)
+ /*
+ * Check if the GFP flags allow compaction - GFP_NOIO is really
+ * tricky context because the migration might require IO
+ */
+ if (!may_perform_io)
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
@@ -1751,6 +1760,7 @@ static void compact_node(int nid)
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
+ .gfp_mask = GFP_KERNEL,
};
@@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = true,
+ .gfp_mask = GFP_KERNEL,
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
diff --git a/mm/filemap.c b/mm/filemap.c
index b06517b7f97f..32be3c8f3a11 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2164,12 +2164,12 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
struct radix_tree_iter iter;
void **slot;
- struct file *file = fe->vma->vm_file;
+ struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
loff_t size;
@@ -2225,11 +2225,11 @@ repeat:
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
- if (fe->pte)
- fe->pte += iter.index - last_pgoff;
+ vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ if (vmf->pte)
+ vmf->pte += iter.index - last_pgoff;
last_pgoff = iter.index;
- if (alloc_set_pte(fe, NULL, page))
+ if (alloc_set_pte(vmf, NULL, page))
goto unlock;
unlock_page(page);
goto next;
@@ -2239,7 +2239,7 @@ skip:
put_page(page);
next:
/* Huge page is mapped? No need to proceed. */
- if (pmd_trans_huge(*fe->pmd))
+ if (pmd_trans_huge(*vmf->pmd))
break;
if (iter.index == end_pgoff)
break;
diff --git a/mm/gup.c b/mm/gup.c
index e50178c58b97..55315555489d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked);
* caller if required (just like with __get_user_pages). "FOLL_GET"
* is set implicitly if "pages" is non-NULL.
*/
-__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
int locked = 1;
@@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
/*
* get_user_pages_unlocked() is suitable to replace the form:
@@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
@@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
+ struct vm_area_struct **vmas, int *locked)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- NULL, false,
+ locked, true,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
@@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote);
/*
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's. We also
- * obviously don't pass FOLL_REMOTE in here.
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cee42cf05477..10eedbf14421 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
+static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
gfp_t gfp)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
*/
__SetPageUptodate(page);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
@@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
if (userfaultfd_missing(vma)) {
int ret;
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
@@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
page_add_new_anon_rmap(page, vma, haddr, true);
mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
- pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
atomic_long_inc(&vma->vm_mm->nr_ptes);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
@@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
-int do_huge_pmd_anonymous_page(struct fault_env *fe)
+int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return VM_FAULT_FALLBACK;
@@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
@@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
ret = 0;
set = false;
- if (pmd_none(*fe->pmd)) {
+ if (pmd_none(*vmf->pmd)) {
if (userfaultfd_missing(vma)) {
- spin_unlock(fe->ptl);
- ret = handle_userfault(fe, VM_UFFD_MISSING);
+ spin_unlock(vmf->ptl);
+ ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
- haddr, fe->pmd, zero_page);
- spin_unlock(fe->ptl);
+ haddr, vmf->pmd, zero_page);
+ spin_unlock(vmf->ptl);
set = true;
}
} else
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
if (!set)
pte_free(vma->vm_mm, pgtable);
return ret;
@@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
return VM_FAULT_FALLBACK;
}
prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(fe, page, gfp);
+ return __do_huge_pmd_anonymous_page(vmf, page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -879,30 +879,30 @@ out:
return ret;
}
-void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
{
pmd_t entry;
unsigned long haddr;
- fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
- haddr = fe->address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
- fe->flags & FAULT_FLAG_WRITE))
- update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
+ haddr = vmf->address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry,
+ vmf->flags & FAULT_FLAG_WRITE))
+ update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
}
-static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
+static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
pgtable_t pgtable;
pmd_t _pmd;
@@ -921,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
for (i = 0; i < HPAGE_PMD_NR; i++) {
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
__GFP_OTHER_NODE, vma,
- fe->address, page_to_nid(page));
+ vmf->address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
@@ -952,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
+ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
pmd_populate(vma->vm_mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -969,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
+ page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
- fe->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*fe->pte));
- set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
- pte_unmap(fe->pte);
+ vmf->pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*vmf->pte));
+ set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
+ pte_unmap(vmf->pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, fe->pmd, pgtable);
+ pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
page_remove_rmap(page, true);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
@@ -993,7 +993,7 @@ out:
return ret;
out_free_pages:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
@@ -1005,23 +1005,23 @@ out_free_pages:
goto out;
}
-int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
+int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
int ret = 0;
- fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
+ vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
@@ -1034,13 +1034,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
@@ -1053,12 +1053,12 @@ alloc:
prep_transhuge_page(new_page);
} else {
if (!page) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
} else {
- ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
+ ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
}
put_page(page);
@@ -1070,7 +1070,7 @@ alloc:
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
put_page(new_page);
- split_huge_pmd(vma, fe->pmd, fe->address);
+ split_huge_pmd(vma, vmf->pmd, vmf->address);
if (page)
put_page(page);
ret |= VM_FAULT_FALLBACK;
@@ -1090,11 +1090,11 @@ alloc:
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
- spin_lock(fe->ptl);
+ spin_lock(vmf->ptl);
if (page)
put_page(page);
- if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
- spin_unlock(fe->ptl);
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
@@ -1102,12 +1102,12 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
if (!page) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
} else {
@@ -1117,13 +1117,13 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out_mn:
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
return ret;
}
@@ -1196,12 +1196,12 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
+int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct anon_vma *anon_vma = NULL;
struct page *page;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
@@ -1209,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
bool was_writable;
int flags = 0;
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_same(pmd, *fe->pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd)))
goto out_unlock;
/*
@@ -1218,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
- if (unlikely(pmd_trans_migrating(*fe->pmd))) {
- page = pmd_page(*fe->pmd);
- spin_unlock(fe->ptl);
+ if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
+ page = pmd_page(*vmf->pmd);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
goto out;
}
@@ -1253,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
@@ -1264,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* to serialises splits
*/
get_page(page);
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(fe->ptl);
- if (unlikely(!pmd_same(pmd, *fe->pmd))) {
+ spin_lock(vmf->ptl);
+ if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
page_nid = -1;
@@ -1287,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
- fe->pmd, pmd, fe->address, page, target_nid);
+ vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
@@ -1304,18 +1304,19 @@ clear_pmdnuma:
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
- update_mmu_cache_pmd(vma, fe->address, fe->pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
unlock_page(page);
out_unlock:
- spin_unlock(fe->ptl);
+ spin_unlock(vmf->ptl);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
+ vmf->flags);
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 537ac9951f5f..44d68895a9b9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,7 +36,7 @@
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
-int do_swap_page(struct fault_env *fe, pte_t orig_pte);
+int do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 09460955e818..e32389a97030 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
unsigned long address, pmd_t *pmd,
int referenced)
{
- pte_t pteval;
int swapped_in = 0, ret = 0;
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
.address = address,
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
+ .pgoff = linear_page_index(vma, address),
};
/* we only decide to swapin, if there is enough young ptes */
@@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
- fe.pte = pte_offset_map(pmd, address);
- for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
- fe.pte++, fe.address += PAGE_SIZE) {
- pteval = *fe.pte;
- if (!is_swap_pte(pteval))
+ vmf.pte = pte_offset_map(pmd, address);
+ for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
+ vmf.pte++, vmf.address += PAGE_SIZE) {
+ vmf.orig_pte = *vmf.pte;
+ if (!is_swap_pte(vmf.orig_pte))
continue;
swapped_in++;
- ret = do_swap_page(&fe, pteval);
+ ret = do_swap_page(&vmf);
/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem);
- if (hugepage_vma_revalidate(mm, address, &fe.vma)) {
+ if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
@@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
return false;
}
/* pte is unmapped now, we need to map it */
- fe.pte = pte_offset_map(pmd, fe.address);
+ vmf.pte = pte_offset_map(pmd, vmf.address);
}
- fe.pte--;
- pte_unmap(fe.pte);
+ vmf.pte--;
+ pte_unmap(vmf.pte);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return true;
}
@@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm,
radix_tree_replace_slot(&mapping->page_tree, slot,
new_page + (index % HPAGE_PMD_NR));
- slot = radix_tree_iter_next(&iter);
+ slot = radix_tree_iter_resume(slot, &iter);
index++;
continue;
out_lru:
@@ -1546,7 +1546,6 @@ tree_unlocked:
/* Put holes back where they were */
radix_tree_delete(&mapping->page_tree,
iter.index);
- slot = radix_tree_iter_next(&iter);
continue;
}
@@ -1557,11 +1556,11 @@ tree_unlocked:
page_ref_unfreeze(page, 2);
radix_tree_replace_slot(&mapping->page_tree,
slot, page);
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock_irq(&mapping->tree_lock);
putback_lru_page(page);
unlock_page(page);
spin_lock_irq(&mapping->tree_lock);
- slot = radix_tree_iter_next(&iter);
}
VM_BUG_ON(nr_none);
spin_unlock_irq(&mapping->tree_lock);
@@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
present++;
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
diff --git a/mm/memory.c b/mm/memory.c
index 08d8da39de28..455c3e628d52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static int do_page_mkwrite(struct vm_fault *vmf)
{
- struct vm_fault vmf;
int ret;
+ struct page *page = vmf->page;
+ unsigned int old_flags = vmf->flags;
- vmf.virtual_address = (void __user *)(address & PAGE_MASK);
- vmf.pgoff = page->index;
- vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.page = page;
- vmf.cow_page = NULL;
+ vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
- ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+ ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
+ /* Restore original flags so that caller is not surprised */
+ vmf->flags = old_flags;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
@@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
+ * Handle dirtying of a page in shared file mapping on a write fault.
+ *
+ * The function expects the page to be locked and unlocks it.
+ */
+static void fault_dirty_shared_page(struct vm_area_struct *vma,
+ struct page *page)
+{
+ struct address_space *mapping;
+ bool dirtied;
+ bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
+
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ /*
+ * Take a local copy of the address_space - page.mapping may be zeroed
+ * by truncate after unlock_page(). The address_space itself remains
+ * pinned by vma->vm_file's reference. We rely on unlock_page()'s
+ * release semantics to prevent the compiler from undoing this copying.
+ */
+ mapping = page_rmapping(page);
+ unlock_page(page);
+
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
+}
+
+/*
* Handle write page faults for pages that can be reused in the current vma
*
* This can happen either due to the mapping being with the VM_SHARED flag,
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
- struct page *page, int page_mkwrite, int dirty_shared)
- __releases(fe->ptl)
+static inline void wp_page_reuse(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
+ struct page *page = vmf->page;
pte_t entry;
/*
* Clear the pages cpupid information as the existing
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
if (page)
page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
+ entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
- update_mmu_cache(vma, fe->address, fe->pte);
- pte_unmap_unlock(fe->pte, fe->ptl);
-
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
-
- if (!page_mkwrite)
- lock_page(page);
-
- dirtied = set_page_dirty(page);
- VM_BUG_ON_PAGE(PageAnon(page), page);
- mapping = page->mapping;
- unlock_page(page);
- put_page(page);
-
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
- }
-
- return VM_FAULT_WRITE;
+ if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
}
/*
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
-static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
+static int wp_page_copy(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
+ struct page *old_page = vmf->page;
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = fe->address & PAGE_MASK;
+ const unsigned long mmun_start = vmf->address & PAGE_MASK;
const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- if (is_zero_pfn(pte_pfn(orig_pte))) {
- new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
+ new_page = alloc_zeroed_user_highpage_movable(vma,
+ vmf->address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- fe->address);
+ vmf->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, fe->address, vma);
+ cow_user_page(new_page, old_page, vmf->address, vma);
}
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
/*
* Re-check the pte - we dropped the lock
*/
- fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte))) {
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm,
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
} else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
+ flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
- ptep_clear_flush_notify(vma, fe->address, fe->pte);
- page_add_new_anon_rmap(new_page, vma, fe->address, false);
+ ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+ page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
- set_pte_at_notify(mm, fe->address, fe->pte, entry);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
/*
* Only after switching the pte to the new page may
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
if (new_page)
put_page(new_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
@@ -2263,79 +2269,91 @@ oom:
return VM_FAULT_OOM;
}
+/**
+ * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
+ * writeable once the page is prepared
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a write page fault in a
+ * shared mapping due to PTE being read-only once the mapped page is prepared.
+ * It handles locking of PTE and modifying it. The function returns 0 on
+ * success, VM_FAULT_NOPAGE when PTE got changed before we acquired PTE
+ * lock.
+ *
+ * The function expects the page to be locked or other protection against
+ * concurrent faults / writeback (such as DAX radix tree locks).
+ */
+int finish_mkwrite_fault(struct vm_fault *vmf)
+{
+ WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ /*
+ * We might have raced with another page fault while we released the
+ * pte_offset_map_lock.
+ */
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return VM_FAULT_NOPAGE;
+ }
+ wp_page_reuse(vmf);
+ return 0;
+}
+
/*
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
-static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
+static int wp_pfn_shared(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
- struct vm_fault vmf = {
- .page = NULL,
- .pgoff = linear_page_index(vma, fe->address),
- .virtual_address =
- (void __user *)(fe->address & PAGE_MASK),
- .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
- };
int ret;
- pte_unmap_unlock(fe->pte, fe->ptl);
- ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
- if (ret & VM_FAULT_ERROR)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->flags |= FAULT_FLAG_MKWRITE;
+ ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- /*
- * We might have raced with another page fault while we
- * released the pte_offset_map_lock.
- */
- if (!pte_same(*fe->pte, orig_pte)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return 0;
- }
+ return finish_mkwrite_fault(vmf);
}
- return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
-static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
- struct page *old_page)
- __releases(fe->ptl)
+static int wp_page_shared(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- int page_mkwrite = 0;
+ struct vm_area_struct *vma = vmf->vma;
- get_page(old_page);
+ get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
- pte_unmap_unlock(fe->pte, fe->ptl);
- tmp = do_page_mkwrite(vma, old_page, fe->address);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(old_page);
+ put_page(vmf->page);
return tmp;
}
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
- return 0;
+ tmp = finish_mkwrite_fault(vmf);
+ if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ return tmp;
}
- page_mkwrite = 1;
+ } else {
+ wp_page_reuse(vmf);
+ lock_page(vmf->page);
}
+ fault_dirty_shared_page(vma, vmf->page);
+ put_page(vmf->page);
- return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
+ return VM_FAULT_WRITE;
}
/*
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
- __releases(fe->ptl)
+static int do_wp_page(struct vm_fault *vmf)
+ __releases(vmf->ptl)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *old_page;
+ struct vm_area_struct *vma = vmf->vma;
- old_page = vm_normal_page(vma, fe->address, orig_pte);
- if (!old_page) {
+ vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
+ if (!vmf->page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
- return wp_pfn_shared(fe, orig_pte);
+ return wp_pfn_shared(vmf);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
int total_mapcount;
- if (!trylock_page(old_page)) {
- get_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- lock_page(old_page);
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (!pte_same(*fe->pte, orig_pte)) {
- unlock_page(old_page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- put_page(old_page);
+ if (!trylock_page(vmf->page)) {
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ lock_page(vmf->page);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ unlock_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ put_page(vmf->page);
return 0;
}
- put_page(old_page);
+ put_page(vmf->page);
}
- if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (reuse_swap_page(vmf->page, &total_mapcount)) {
if (total_mapcount == 1) {
/*
* The page is all ours. Move it to
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
* Protected against the rmap code by
* the page lock.
*/
- page_move_anon_rmap(old_page, vma);
+ page_move_anon_rmap(vmf->page, vma);
}
- unlock_page(old_page);
- return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
+ unlock_page(vmf->page);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
}
- unlock_page(old_page);
+ unlock_page(vmf->page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
- return wp_page_shared(fe, orig_pte, old_page);
+ return wp_page_shared(vmf);
}
/*
* Ok, we need to copy. Oh, well..
*/
- get_page(old_page);
+ get_page(vmf->page);
- pte_unmap_unlock(fe->pte, fe->ptl);
- return wp_page_copy(fe, orig_pte, old_page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return wp_page_copy(vmf);
}
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-int do_swap_page(struct fault_env *fe, pte_t orig_pte)
+int do_swap_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page, *swapcache;
struct mem_cgroup *memcg;
swp_entry_t entry;
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
int exclusive = 0;
int ret = 0;
- if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
+ if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
- entry = pte_to_swp_entry(orig_pte);
+ entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
- migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
+ migration_entry_wait(vma->vm_mm, vmf->pmd,
+ vmf->address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
- print_bad_pte(vma, fe->address, orig_pte, NULL);
+ print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
}
goto out;
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
- page = swapin_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vma, fe->address);
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
+ vmf->address);
if (!page) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
- fe->address, &fe->ptl);
- if (likely(pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto unlock;
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
}
swapcache = page;
- locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
+ locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
goto out_page;
- page = ksm_might_need_to_copy(page, vma, fe->address);
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
page = swapcache;
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
/*
* Back out if somebody else already faulted in this pte.
*/
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (unlikely(!pte_same(*fe->pte, orig_pte)))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- fe->flags &= ~FAULT_FLAG_WRITE;
+ vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
- if (pte_swp_soft_dirty(orig_pte))
+ if (pte_swp_soft_dirty(vmf->orig_pte))
pte = pte_mksoft_dirty(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ vmf->orig_pte = pte;
if (page == swapcache) {
- do_page_add_anon_rmap(page, vma, fe->address, exclusive);
+ do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
put_page(swapcache);
}
- if (fe->flags & FAULT_FLAG_WRITE) {
- ret |= do_wp_page(fe, pte);
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ ret |= do_wp_page(vmf);
if (ret & VM_FAULT_ERROR)
ret &= VM_FAULT_ERROR;
goto out;
}
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
return ret;
out_nomap:
mem_cgroup_cancel_charge(page, memcg, false);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
out_release:
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_anonymous_page(struct fault_env *fe)
+static int do_anonymous_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
pte_t entry;
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe)
return VM_FAULT_SIGBUS;
/* Check if we need to add a guard page to the stack */
- if (check_stack_guard_page(vma, fe->address) < 0)
+ if (check_stack_guard_page(vma, vmf->address) < 0)
return VM_FAULT_SIGSEGV;
/*
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe)
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+ if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(fe->pmd)))
+ if (unlikely(pmd_trans_unstable(vmf->pmd)))
return 0;
/* Use the zero-page for reads */
- if (!(fe->flags & FAULT_FLAG_WRITE) &&
+ if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm)) {
- entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
+ entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
vma->vm_page_prot));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto unlock;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
goto setpte;
}
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe)
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
- page = alloc_zeroed_user_highpage_movable(vma, fe->address);
+ page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (!page)
goto oom;
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe)
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
- if (!pte_none(*fe->pte))
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (!pte_none(*vmf->pte))
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return handle_userfault(fe, VM_UFFD_MISSING);
+ return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
@@ -2843,62 +2863,50 @@ oom:
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
-static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
- struct page *cow_page, struct page **page, void **entry)
+static int __do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct vm_fault vmf;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
- vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
- vmf.pgoff = pgoff;
- vmf.flags = fe->flags;
- vmf.page = NULL;
- vmf.gfp_mask = __get_fault_gfp_mask(vma);
- vmf.cow_page = cow_page;
-
- ret = vma->vm_ops->fault(vma, &vmf);
- if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- return ret;
- if (ret & VM_FAULT_DAX_LOCKED) {
- *entry = vmf.entry;
+ ret = vma->vm_ops->fault(vma, vmf);
+ if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
+ VM_FAULT_DONE_COW)))
return ret;
- }
- if (unlikely(PageHWPoison(vmf.page))) {
+ if (unlikely(PageHWPoison(vmf->page))) {
if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf.page);
- put_page(vmf.page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
+ vmf->page = NULL;
return VM_FAULT_HWPOISON;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
- lock_page(vmf.page);
+ lock_page(vmf->page);
else
- VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+ VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
- *page = vmf.page;
return ret;
}
-static int pte_alloc_one_map(struct fault_env *fe)
+static int pte_alloc_one_map(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- if (!pmd_none(*fe->pmd))
+ if (!pmd_none(*vmf->pmd))
goto map_pte;
- if (fe->prealloc_pte) {
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd))) {
- spin_unlock(fe->ptl);
+ if (vmf->prealloc_pte) {
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd))) {
+ spin_unlock(vmf->ptl);
goto map_pte;
}
atomic_long_inc(&vma->vm_mm->nr_ptes);
- pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
- spin_unlock(fe->ptl);
- fe->prealloc_pte = 0;
- } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+ pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+ spin_unlock(vmf->ptl);
+ vmf->prealloc_pte = 0;
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
return VM_FAULT_OOM;
}
map_pte:
@@ -2913,11 +2921,11 @@ map_pte:
* through an atomic read in C, which is what pmd_trans_unstable()
* provides.
*/
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return VM_FAULT_NOPAGE;
- fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
- &fe->ptl);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
return 0;
}
@@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
return true;
}
-static void deposit_prealloc_pte(struct fault_env *fe)
+static void deposit_prealloc_pte(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
- pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+ pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
/*
* We are going to consume the prealloc table,
* count that as nr_ptes.
*/
atomic_long_inc(&vma->vm_mm->nr_ptes);
- fe->prealloc_pte = 0;
+ vmf->prealloc_pte = 0;
}
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
- unsigned long haddr = fe->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i, ret;
@@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* Archs like ppc64 need additional space to store information
* related to pte entry. Use the preallocated table for that.
*/
- if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
- fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+ if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
- if (unlikely(!pmd_none(*fe->pmd)))
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* deposit and withdraw with pmd lock held
*/
if (arch_needs_pgtable_deposit())
- deposit_prealloc_pte(fe);
+ deposit_prealloc_pte(vmf);
- set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, haddr, fe->pmd);
+ update_mmu_cache_pmd(vma, haddr, vmf->pmd);
/* fault is handled */
ret = 0;
@@ -3005,13 +3013,13 @@ out:
* withdraw with pmd lock held.
*/
if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
- fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
- fe->pmd);
- spin_unlock(fe->ptl);
+ vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
+ vmf->pmd);
+ spin_unlock(vmf->ptl);
return ret;
}
#else
-static int do_set_pmd(struct fault_env *fe, struct page *page)
+static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
BUILD_BUG();
return 0;
@@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
* alloc_set_pte - setup new PTE entry for given page and add reverse page
* mapping. If needed, the function allocates page table or uses pre-allocated.
*
- * @fe: fault environment
+ * @vmf: fault environment
* @memcg: memcg to charge page (only for private mappings)
* @page: page to map
*
- * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
+ * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
+ * return.
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
-int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
{
- struct vm_area_struct *vma = fe->vma;
- bool write = fe->flags & FAULT_FLAG_WRITE;
+ struct vm_area_struct *vma = vmf->vma;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
int ret;
- if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
+ if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
- ret = do_set_pmd(fe, page);
+ ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
goto fault_handled;
}
- if (!fe->pte) {
- ret = pte_alloc_one_map(fe);
+ if (!vmf->pte) {
+ ret = pte_alloc_one_map(vmf);
if (ret)
goto fault_handled;
}
/* Re-check under ptl */
- if (unlikely(!pte_none(*fe->pte))) {
+ if (unlikely(!pte_none(*vmf->pte))) {
ret = VM_FAULT_NOPAGE;
goto fault_handled;
}
@@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, fe->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
}
- set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* no need to invalidate: a not-present page won't be cached */
- update_mmu_cache(vma, fe->address, fe->pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
ret = 0;
fault_handled:
/* preallocated pagetable is unused: free it */
- if (fe->prealloc_pte) {
- pte_free(fe->vma->vm_mm, fe->prealloc_pte);
- fe->prealloc_pte = 0;
+ if (vmf->prealloc_pte) {
+ pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
+ vmf->prealloc_pte = 0;
}
return ret;
}
+
+/**
+ * finish_fault - finish page fault once we have prepared the page to fault
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a page fault once the
+ * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
+ * given page, adds reverse page mapping, handles memcg charges and LRU
+ * addition. The function returns 0 on success, VM_FAULT_ code in case of
+ * error.
+ *
+ * The function expects the page to be locked and on success it consumes a
+ * reference of a page being mapped (for the PTE which maps it).
+ */
+int finish_fault(struct vm_fault *vmf)
+{
+ struct page *page;
+ int ret;
+
+ /* Did we COW the page? */
+ if ((vmf->flags & FAULT_FLAG_WRITE) &&
+ !(vmf->vma->vm_flags & VM_SHARED))
+ page = vmf->cow_page;
+ else
+ page = vmf->page;
+ ret = alloc_set_pte(vmf, vmf->memcg, page);
+ if (vmf->pte)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return ret;
+}
+
static unsigned long fault_around_bytes __read_mostly =
rounddown_pow_of_two(65536);
@@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs);
* fault_around_pages() value (and therefore to page order). This way it's
* easier to guarantee that we don't cross page table boundaries.
*/
-static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct vm_fault *vmf)
{
- unsigned long address = fe->address, nr_pages, mask;
+ unsigned long address = vmf->address, nr_pages, mask;
+ pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
int off, ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
- fe->address = max(address & mask, fe->vma->vm_start);
- off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ vmf->address = max(address & mask, vmf->vma->vm_start);
+ off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
start_pgoff -= off;
/*
@@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
* or fault_around_pages() from start_pgoff, depending what is nearest.
*/
end_pgoff = start_pgoff -
- ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
PTRS_PER_PTE - 1;
- end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
+ end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
start_pgoff + nr_pages - 1);
- if (pmd_none(*fe->pmd)) {
- fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
- if (!fe->prealloc_pte)
+ if (pmd_none(*vmf->pmd)) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+ vmf->address);
+ if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
}
- fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+ vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
/* Huge page is mapped? Page fault is solved */
- if (pmd_trans_huge(*fe->pmd)) {
+ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
/* ->map_pages() hasn't done anything useful. Cold page cache? */
- if (!fe->pte)
+ if (!vmf->pte)
goto out;
/* check if the page fault is solved */
- fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
- if (!pte_none(*fe->pte))
+ vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+ if (!pte_none(*vmf->pte))
ret = VM_FAULT_NOPAGE;
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
- fe->address = address;
- fe->pte = NULL;
+ vmf->address = address;
+ vmf->pte = NULL;
return ret;
}
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_read_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
+ struct vm_area_struct *vma = vmf->vma;
int ret = 0;
/*
@@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- ret = do_fault_around(fe, pgoff);
+ ret = do_fault_around(vmf);
if (ret)
return ret;
}
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- unlock_page(fault_page);
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
- put_page(fault_page);
+ put_page(vmf->page);
return ret;
}
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_cow_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page, *new_page;
- void *fault_entry;
- struct mem_cgroup *memcg;
+ struct vm_area_struct *vma = vmf->vma;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
- if (!new_page)
+ vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- put_page(new_page);
+ if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
+ &vmf->memcg, false)) {
+ put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
- ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
+ if (ret & VM_FAULT_DONE_COW)
+ return ret;
- if (!(ret & VM_FAULT_DAX_LOCKED))
- copy_user_highpage(new_page, fault_page, fe->address, vma);
- __SetPageUptodate(new_page);
+ copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+ __SetPageUptodate(vmf->cow_page);
- ret |= alloc_set_pte(fe, memcg, new_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
- if (!(ret & VM_FAULT_DAX_LOCKED)) {
- unlock_page(fault_page);
- put_page(fault_page);
- } else {
- dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
- }
+ ret |= finish_fault(vmf);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg, false);
- put_page(new_page);
+ mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
+ put_page(vmf->cow_page);
return ret;
}
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
+static int do_shared_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- struct page *fault_page;
- struct address_space *mapping;
- int dirtied = 0;
+ struct vm_area_struct *vma = vmf->vma;
int ret, tmp;
- ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
+ ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* about to become writable
*/
if (vma->vm_ops->page_mkwrite) {
- unlock_page(fault_page);
- tmp = do_page_mkwrite(vma, fault_page, fe->address);
+ unlock_page(vmf->page);
+ tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- put_page(fault_page);
+ put_page(vmf->page);
return tmp;
}
}
- ret |= alloc_set_pte(fe, NULL, fault_page);
- if (fe->pte)
- pte_unmap_unlock(fe->pte, fe->ptl);
+ ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(fault_page);
- put_page(fault_page);
+ unlock_page(vmf->page);
+ put_page(vmf->page);
return ret;
}
- if (set_page_dirty(fault_page))
- dirtied = 1;
- /*
- * Take a local copy of the address_space - page.mapping may be zeroed
- * by truncate after unlock_page(). The address_space itself remains
- * pinned by vma->vm_file's reference. We rely on unlock_page()'s
- * release semantics to prevent the compiler from undoing this copying.
- */
- mapping = page_rmapping(fault_page);
- unlock_page(fault_page);
- if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping but still
- * dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!vma->vm_ops->page_mkwrite)
- file_update_time(vma->vm_file);
-
+ fault_dirty_shared_page(vma, vmf->page);
return ret;
}
@@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_fault(struct fault_env *fe)
+static int do_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
- pgoff_t pgoff = linear_page_index(vma, fe->address);
+ struct vm_area_struct *vma = vmf->vma;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
return VM_FAULT_SIGBUS;
- if (!(fe->flags & FAULT_FLAG_WRITE))
- return do_read_fault(fe, pgoff);
+ if (!(vmf->flags & FAULT_FLAG_WRITE))
+ return do_read_fault(vmf);
if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(fe, pgoff);
- return do_shared_fault(fe, pgoff);
+ return do_cow_fault(vmf);
+ return do_shared_fault(vmf);
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct fault_env *fe, pte_t pte)
+static int do_numa_page(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = -1;
int last_cpupid;
int target_nid;
bool migrated = false;
+ pte_t pte = vmf->orig_pte;
bool was_writable = pte_write(pte);
int flags = 0;
@@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
* page table entry is not accessible, so there would be no
* concurrent hardware modifications to the PTE.
*/
- fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, pte))) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
@@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
- update_mmu_cache(vma, fe->address, fe->pte);
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ update_mmu_cache(vma, vmf->address, vmf->pte);
- page = vm_normal_page(vma, fe->address, pte);
+ page = vm_normal_page(vma, vmf->address, pte);
if (!page) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
+ target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == -1) {
put_page(page);
goto out;
@@ -3469,28 +3476,28 @@ out:
return 0;
}
-static int create_huge_pmd(struct fault_env *fe)
+static int create_huge_pmd(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = fe->vma;
+ struct vm_area_struct *vma = vmf->vma;
if (vma_is_anonymous(vma))
- return do_huge_pmd_anonymous_page(fe);
+ return do_huge_pmd_anonymous_page(vmf);
if (vma->vm_ops->pmd_fault)
- return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
- fe->flags);
+ return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
+ vmf->flags);
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
+static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
- if (vma_is_anonymous(fe->vma))
- return do_huge_pmd_wp_page(fe, orig_pmd);
- if (fe->vma->vm_ops->pmd_fault)
- return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
- fe->flags);
+ if (vma_is_anonymous(vmf->vma))
+ return do_huge_pmd_wp_page(vmf, orig_pmd);
+ if (vmf->vma->vm_ops->pmd_fault)
+ return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
+ vmf->pmd, vmf->flags);
/* COW handled on pte level: split pmd */
- VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
- __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
+ VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
+ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return VM_FAULT_FALLBACK;
}
@@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
* The mmap_sem may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
-static int handle_pte_fault(struct fault_env *fe)
+static int handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
- if (unlikely(pmd_none(*fe->pmd))) {
+ if (unlikely(pmd_none(*vmf->pmd))) {
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
- fe->pte = NULL;
+ vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
- if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+ if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
@@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe)
* mmap_sem read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
- fe->pte = pte_offset_map(fe->pmd, fe->address);
-
- entry = *fe->pte;
+ vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+ vmf->orig_pte = *vmf->pte;
/*
* some architectures can have larger ptes than wordsize,
@@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe)
* ptl lock held. So here a barrier will do.
*/
barrier();
- if (pte_none(entry)) {
- pte_unmap(fe->pte);
- fe->pte = NULL;
+ if (pte_none(vmf->orig_pte)) {
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
}
}
- if (!fe->pte) {
- if (vma_is_anonymous(fe->vma))
- return do_anonymous_page(fe);
+ if (!vmf->pte) {
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
else
- return do_fault(fe);
+ return do_fault(vmf);
}
- if (!pte_present(entry))
- return do_swap_page(fe, entry);
+ if (!pte_present(vmf->orig_pte))
+ return do_swap_page(vmf);
- if (pte_protnone(entry) && vma_is_accessible(fe->vma))
- return do_numa_page(fe, entry);
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ return do_numa_page(vmf);
- fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
- spin_lock(fe->ptl);
- if (unlikely(!pte_same(*fe->pte, entry)))
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ entry = vmf->orig_pte;
+ if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
- if (fe->flags & FAULT_FLAG_WRITE) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
- return do_wp_page(fe, entry);
+ return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
- fe->flags & FAULT_FLAG_WRITE)) {
- update_mmu_cache(fe->vma, fe->address, fe->pte);
+ if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
+ vmf->flags & FAULT_FLAG_WRITE)) {
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/*
* This is needed only for protection faults but the arch code
@@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe)
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
- if (fe->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(fe->vma, fe->address);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
unlock:
- pte_unmap_unlock(fe->pte, fe->ptl);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
@@ -3606,10 +3613,12 @@ unlock:
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
- struct fault_env fe = {
+ struct vm_fault vmf = {
.vma = vma,
- .address = address,
+ .address = address & PAGE_MASK,
.flags = flags,
+ .pgoff = linear_page_index(vma, address),
+ .gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
@@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
- fe.pmd = pmd_alloc(mm, pud, address);
- if (!fe.pmd)
+ vmf.pmd = pmd_alloc(mm, pud, address);
+ if (!vmf.pmd)
return VM_FAULT_OOM;
- if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
- int ret = create_huge_pmd(&fe);
+ if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ int ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *fe.pmd;
+ pmd_t orig_pmd = *vmf.pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&fe, orig_pmd);
+ return do_huge_pmd_numa_page(&vmf, orig_pmd);
- if ((fe.flags & FAULT_FLAG_WRITE) &&
+ if ((vmf.flags & FAULT_FLAG_WRITE) &&
!pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(&fe, orig_pmd);
+ ret = wp_huge_pmd(&vmf, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&fe, orig_pmd);
+ huge_pmd_set_accessed(&vmf, orig_pmd);
return 0;
}
}
}
- return handle_pte_fault(&fe);
+ return handle_pte_fault(&vmf);
}
/*
@@ -3808,8 +3817,8 @@ out:
return -EINVAL;
}
-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
+ spinlock_t **ptlp)
{
int res;
@@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
struct page *page = NULL;
ret = get_user_pages_remote(tsk, mm, addr, 1,
- gup_flags, &page, &vma);
+ gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
diff --git a/mm/nommu.c b/mm/nommu.c
index 27bc543128e5..210d7ec2843c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages_locked);
-long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+static long __get_user_pages_unlocked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ unsigned int gup_flags)
{
long ret;
down_read(&mm->mmap_sem);
@@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
up_read(&mm->mmap_sem);
return ret;
}
-EXPORT_SYMBOL(__get_user_pages_unlocked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags)
@@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-void filemap_map_pages(struct fault_env *fe,
+void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
BUG();
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 52e2f8e3b472..290e8b7d3181 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2106,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
#define WRITEBACK_TAG_BATCH 4096
- unsigned long tagged;
-
- do {
- spin_lock_irq(&mapping->tree_lock);
- tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
- &start, end, WRITEBACK_TAG_BATCH,
- PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
+ unsigned long tagged = 0;
+ struct radix_tree_iter iter;
+ void **slot;
+
+ spin_lock_irq(&mapping->tree_lock);
+ radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
+ PAGECACHE_TAG_DIRTY) {
+ if (iter.index > end)
+ break;
+ radix_tree_iter_tag_set(&mapping->page_tree, &iter,
+ PAGECACHE_TAG_TOWRITE);
+ tagged++;
+ if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+ continue;
+ slot = radix_tree_iter_resume(slot, &iter);
spin_unlock_irq(&mapping->tree_lock);
- WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
cond_resched();
- /* We check 'start' to handle wrapping when end == ~0UL */
- } while (tagged >= WRITEBACK_TAG_BATCH && start);
+ spin_lock_irq(&mapping->tree_lock);
+ }
+ spin_unlock_irq(&mapping->tree_lock);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f64e7bcb43b7..2c6d5f64feca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3925,6 +3925,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
return page;
}
+void __page_frag_drain(struct page *page, unsigned int order,
+ unsigned int count)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+
+ if (page_ref_sub_and_test(page, count)) {
+ if (order == 0)
+ free_hot_cold_page(page, false);
+ else
+ __free_pages_ok(page, order);
+ }
+}
+EXPORT_SYMBOL(__page_frag_drain);
+
void *__alloc_page_frag(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask)
{
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index be8dc8d1edb9..84d0c7eada2b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
ssize_t rc = 0;
unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
/ sizeof(struct pages *);
- unsigned int flags = FOLL_REMOTE;
+ unsigned int flags = 0;
/* Work out address and page range required */
if (len == 0)
@@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr,
while (!rc && nr_pages && iov_iter_count(iter)) {
int pages = min(nr_pages, max_pages_per_loop);
+ int locked = 1;
size_t bytes;
/*
* Get the pages we're interested in. We must
- * add FOLL_REMOTE because task/mm might not
+ * access remotely because task/mm might not be
* current/current->mm
*/
- pages = __get_user_pages_unlocked(task, mm, pa, pages,
- process_pages, flags);
+ down_read(&mm->mmap_sem);
+ pages = get_user_pages_remote(task, mm, pa, pages, flags,
+ process_pages, NULL, &locked);
+ if (locked)
+ up_read(&mm->mmap_sem);
if (pages <= 0)
return -EFAULT;
diff --git a/mm/shmem.c b/mm/shmem.c
index abd7403aba41..54287d443806 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
swapped++;
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
@@ -1049,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
+static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned long found = -1;
+ unsigned int checked = 0;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, root, &iter, 0) {
+ if (*slot == item) {
+ found = iter.index;
+ break;
+ }
+ checked++;
+ if ((checked % 4096) != 0)
+ continue;
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+
+ rcu_read_unlock();
+ return found;
+}
+
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
@@ -1062,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
int error = 0;
radswap = swp_to_radix_entry(swap);
- index = radix_tree_locate_item(&mapping->page_tree, radswap);
+ index = find_swap_entry(&mapping->page_tree, radswap);
if (index == -1)
return -EAGAIN; /* tell shmem_unuse we found nothing */
@@ -2447,8 +2471,8 @@ static void shmem_tag_pins(struct address_space *mapping)
}
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2517,8 +2541,8 @@ static int shmem_wait_for_pins(struct address_space *mapping)
spin_unlock_irq(&mapping->tree_lock);
continue_resched:
if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
cond_resched_rcu();
- slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();