diff options
Diffstat (limited to 'fs/dax.c')
-rw-r--r-- | fs/dax.c | 208 |
1 files changed, 148 insertions, 60 deletions
@@ -31,6 +31,7 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/mmu_notifier.h> #include <linux/iomap.h> #include "internal.h" @@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, } } +static void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + void *entry, **slot; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + static void put_locked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { @@ -433,22 +451,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); } -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - void *entry, **slot; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || - !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); - return; - } - unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, entry, false); -} - /* * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * entry to get unlocked before deleting it. @@ -500,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry, /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) { - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + if (!page) return VM_FAULT_OOM; - } vmf->page = page; return VM_FAULT_LOCKED; } @@ -615,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, return new_entry; } +static inline unsigned long +pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +{ + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + return address; +} + +/* Walk all mappings of a given index of a file and writeprotect them */ +static void dax_mapping_entry_mkclean(struct address_space *mapping, + pgoff_t index, unsigned long pfn) +{ + struct vm_area_struct *vma; + pte_t *ptep; + pte_t pte; + spinlock_t *ptl; + bool changed; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { + unsigned long address; + + cond_resched(); + + if (!(vma->vm_flags & VM_SHARED)) + continue; + + address = pgoff_address(index, vma); + changed = false; + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + continue; + if (pfn != pte_pfn(*ptep)) + goto unlock; + if (!pte_dirty(*ptep) && !pte_write(*ptep)) + goto unlock; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock: + pte_unmap_unlock(ptep, ptl); + + if (changed) + mmu_notifier_invalidate_page(vma->vm_mm, address); + } + i_mmap_unlock_read(mapping); +} + static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - struct radix_tree_node *node; struct blk_dax_ctl dax; - void **slot; + void *entry2, **slot; int ret = 0; - spin_lock_irq(&mapping->tree_lock); /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. + * A page got tagged dirty in DAX mapping? Something is seriously + * wrong. */ - if (!__radix_tree_lookup(page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - - /* another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto unlock; + if (WARN_ON(!radix_tree_exceptional_entry(entry))) + return -EIO; + spin_lock_irq(&mapping->tree_lock); + entry2 = get_unlocked_mapping_entry(mapping, index, &slot); + /* Entry got punched out / reallocated? */ + if (!entry2 || !radix_tree_exceptional_entry(entry2)) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. We have to + * compare sectors as we must not bail out due to difference in lockbit + * or entry type. + */ + if (dax_radix_sector(entry2) != dax_radix_sector(entry)) + goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { ret = -EIO; - goto unlock; + goto put_unlocked; } + /* Another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; + /* Lock the entry to serialize with page faults */ + entry = lock_slot(mapping, slot); + /* + * We can clear the tag now but we have to be careful so that concurrent + * dax_writeback_one() calls for the same index cannot finish before we + * actually flush the caches. This is achieved as the calls will look + * at the entry only under tree_lock and once they do that they will + * see the entry locked and wait for it to unlock. + */ + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + /* * Even if dax_writeback_mapping_range() was given a wbc->range_start * in the middle of a PMD, the 'index' we are given will be aligned to @@ -654,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev, */ dax.sector = dax_radix_sector(entry); dax.size = PAGE_SIZE << dax_radix_order(entry); - spin_unlock_irq(&mapping->tree_lock); /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). */ ret = dax_map_atomic(bdev, &dax); - if (ret < 0) + if (ret < 0) { + put_locked_mapping_entry(mapping, index, entry); return ret; + } if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); wb_cache_pmem(dax.addr, dax.size); - + /* + * After we have flushed the cache, we can clear the dirty tag. There + * cannot be new dirty data in the pfn after the flush has completed as + * the pfn mappings are writeprotected and fault waits for mapping + * entry lock. + */ spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); + put_locked_mapping_entry(mapping, index, entry); return ret; - unlock: + put_unlocked: + put_unlocked_mapping_entry(mapping, index, entry2); spin_unlock_irq(&mapping->tree_lock); return ret; } @@ -738,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, sector_t sector, size_t size, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct blk_dax_ctl dax = { .sector = sector, .size = size, @@ -767,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - void *entry; + void *entry, **slot; pgoff_t index = vmf->pgoff; spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) - goto out; + entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (!entry || !radix_tree_exceptional_entry(entry)) { + if (entry) + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + return VM_FAULT_NOPAGE; + } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - put_unlocked_mapping_entry(mapping, index, entry); -out: + entry = lock_slot(mapping, slot); spin_unlock_irq(&mapping->tree_lock); + /* + * If we race with somebody updating the PTE and finish_mkwrite_fault() + * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry + * the fault in either case. + */ + finish_mkwrite_fault(vmf); + put_locked_mapping_entry(mapping, index, entry); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -948,13 +1038,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; - int locked_status = 0; + int vmf_ret = 0; void *entry; /* @@ -1007,13 +1097,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto finish_iomap; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - locked_status = VM_FAULT_LOCKED; - } else { - vmf->entry = entry; - locked_status = VM_FAULT_DAX_LOCKED; - } + + __SetPageUptodate(vmf->cow_page); + vmf_ret = finish_fault(vmf); + if (!vmf_ret) + vmf_ret = VM_FAULT_DONE_COW; goto finish_iomap; } @@ -1030,7 +1118,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - locked_status = dax_load_hole(mapping, entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); break; } /*FALLTHRU*/ @@ -1042,7 +1130,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, finish_iomap: if (ops->iomap_end) { - if (error) { + if (error || (vmf_ret & VM_FAULT_ERROR)) { /* keep previous error */ ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, &iomap); @@ -1052,7 +1140,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } } unlock_entry: - if (!locked_status || error) + if (vmf_ret != VM_FAULT_LOCKED || error) put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) @@ -1060,9 +1148,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error < 0 && error != -EBUSY) return VM_FAULT_SIGBUS | major; - if (locked_status) { + if (vmf_ret) { WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ - return locked_status; + return vmf_ret; } return VM_FAULT_NOPAGE | major; } |