diff options
author | Linus Torvalds | 2023-06-28 10:28:11 -0700 |
---|---|---|
committer | Linus Torvalds | 2023-06-28 10:28:11 -0700 |
commit | 6e17c6de3ddf3073741d9c91a796ee696914d8a0 (patch) | |
tree | 2c425707f78642625dbe2c824c7fded2021e3dc7 /mm/swapfile.c | |
parent | 6aeadf7896bff4ca230702daba8788455e6b866e (diff) | |
parent | acc72d59c7509540c27c49625cb4b5a8db1f1a84 (diff) |
Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull mm updates from Andrew Morton:
- Yosry Ahmed brought back some cgroup v1 stats in OOM logs
- Yosry has also eliminated cgroup's atomic rstat flushing
- Nhat Pham adds the new cachestat() syscall. It provides userspace
with the ability to query pagecache status - a similar concept to
mincore() but more powerful and with improved usability
- Mel Gorman provides more optimizations for compaction, reducing the
prevalence of page rescanning
- Lorenzo Stoakes has done some maintanance work on the
get_user_pages() interface
- Liam Howlett continues with cleanups and maintenance work to the
maple tree code. Peng Zhang also does some work on maple tree
- Johannes Weiner has done some cleanup work on the compaction code
- David Hildenbrand has contributed additional selftests for
get_user_pages()
- Thomas Gleixner has contributed some maintenance and optimization
work for the vmalloc code
- Baolin Wang has provided some compaction cleanups,
- SeongJae Park continues maintenance work on the DAMON code
- Huang Ying has done some maintenance on the swap code's usage of
device refcounting
- Christoph Hellwig has some cleanups for the filemap/directio code
- Ryan Roberts provides two patch series which yield some
rationalization of the kernel's access to pte entries - use the
provided APIs rather than open-coding accesses
- Lorenzo Stoakes has some fixes to the interaction between pagecache
and directio access to file mappings
- John Hubbard has a series of fixes to the MM selftesting code
- ZhangPeng continues the folio conversion campaign
- Hugh Dickins has been working on the pagetable handling code, mainly
with a view to reducing the load on the mmap_lock
- Catalin Marinas has reduced the arm64 kmalloc() minimum alignment
from 128 to 8
- Domenico Cerasuolo has improved the zswap reclaim mechanism by
reorganizing the LRU management
- Matthew Wilcox provides some fixups to make gfs2 work better with the
buffer_head code
- Vishal Moola also has done some folio conversion work
- Matthew Wilcox has removed the remnants of the pagevec code - their
functionality is migrated over to struct folio_batch
* tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits)
mm/hugetlb: remove hugetlb_set_page_subpool()
mm: nommu: correct the range of mmap_sem_read_lock in task_mem()
hugetlb: revert use of page_cache_next_miss()
Revert "page cache: fix page_cache_next/prev_miss off by one"
mm/vmscan: fix root proactive reclaim unthrottling unbalanced node
mm: memcg: rename and document global_reclaim()
mm: kill [add|del]_page_to_lru_list()
mm: compaction: convert to use a folio in isolate_migratepages_block()
mm: zswap: fix double invalidate with exclusive loads
mm: remove unnecessary pagevec includes
mm: remove references to pagevec
mm: rename invalidate_mapping_pagevec to mapping_try_invalidate
mm: remove struct pagevec
net: convert sunrpc from pagevec to folio_batch
i915: convert i915_gpu_error to use a folio_batch
pagevec: rename fbatch_count()
mm: remove check_move_unevictable_pages()
drm: convert drm_gem_put_pages() to use a folio_batch
i915: convert shmem_sg_free_table() to use a folio_batch
scatterlist: add sg_set_folio()
...
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r-- | mm/swapfile.c | 109 |
1 files changed, 46 insertions, 63 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6bc83060df9a..8e6dde68b389 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -41,6 +41,7 @@ #include <linux/swap_slots.h> #include <linux/sort.h> #include <linux/completion.h> +#include <linux/suspend.h> #include <asm/tlbflush.h> #include <linux/swapops.h> @@ -1219,6 +1220,13 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, } /* + * When we get a swap entry, if there aren't some other ways to + * prevent swapoff, such as the folio in swap cache is locked, page + * table lock is held, etc., the swap entry may become invalid because + * of swapoff. Then, we need to enclose all swap related functions + * with get_swap_device() and put_swap_device(), unless the swap + * functions call get/put_swap_device() by themselves. + * * Check whether swap entry is valid in the swap device. If so, * return pointer to swap_info_struct, and keep the swap entry valid * via preventing the swap device from being swapoff, until @@ -1227,9 +1235,8 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, * Notice that swapoff or swapoff+swapon can still happen before the * percpu_ref_tryget_live() in get_swap_device() or after the * percpu_ref_put() in put_swap_device() if there isn't any other way - * to prevent swapoff, such as page lock, page table lock, etc. The - * caller must be prepared for that. For example, the following - * situation is possible. + * to prevent swapoff. The caller must be prepared for that. For + * example, the following situation is possible. * * CPU1 CPU2 * do_swap_page() @@ -1432,16 +1439,10 @@ void swapcache_free_entries(swp_entry_t *entries, int n) int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si; + struct swap_info_struct *si = swp_swap_info(entry); pgoff_t offset = swp_offset(entry); - int count = 0; - si = get_swap_device(entry); - if (si) { - count = swap_count(si->swap_map[offset]); - put_swap_device(si); - } - return count; + return swap_count(si->swap_map[offset]); } /* @@ -1449,7 +1450,7 @@ int __swap_count(swp_entry_t entry) * This does not give an exact answer when swap count is continued, * but does include the high COUNT_CONTINUED flag to allow for that. */ -static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; @@ -1463,24 +1464,6 @@ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) /* * How many references to @entry are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. - */ -int __swp_swapcount(swp_entry_t entry) -{ - int count = 0; - struct swap_info_struct *si; - - si = get_swap_device(entry); - if (si) { - count = swap_swapcount(si, entry); - put_swap_device(si); - } - return count; -} - -/* - * How many references to @entry are currently swapped out? * This considers COUNT_CONTINUED so it returns exact answer. */ int swp_swapcount(swp_entry_t entry) @@ -1762,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; - pte_t *pte, new_pte; + pte_t *pte, new_pte, old_pte; bool hwposioned = false; int ret = 1; @@ -1774,11 +1757,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, hwposioned = true; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte), + swp_entry_to_pte(entry)))) { ret = 0; goto out; } + old_pte = ptep_get(pte); + if (unlikely(hwposioned || !PageUptodate(page))) { swp_entry_t swp_entry; @@ -1810,7 +1796,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, * call and have the page locked. */ VM_BUG_ON_PAGE(PageWriteback(page), page); - if (pte_swp_exclusive(*pte)) + if (pte_swp_exclusive(old_pte)) rmap_flags |= RMAP_EXCLUSIVE; page_add_anon_rmap(page, vma, addr, rmap_flags); @@ -1819,15 +1805,16 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, lru_cache_add_inactive_or_unevictable(page, vma); } new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (pte_swp_soft_dirty(*pte)) + if (pte_swp_soft_dirty(old_pte)) new_pte = pte_mksoft_dirty(new_pte); - if (pte_swp_uffd_wp(*pte)) + if (pte_swp_uffd_wp(old_pte)) new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); swap_free(entry); out: - pte_unmap_unlock(pte, ptl); + if (pte) + pte_unmap_unlock(pte, ptl); if (page != swapcache) { unlock_page(page); put_page(page); @@ -1839,27 +1826,37 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { - swp_entry_t entry; - pte_t *pte; + pte_t *pte = NULL; struct swap_info_struct *si; - int ret = 0; si = swap_info[type]; - pte = pte_offset_map(pmd, addr); do { struct folio *folio; unsigned long offset; unsigned char swp_count; + swp_entry_t entry; + int ret; + pte_t ptent; + + if (!pte++) { + pte = pte_offset_map(pmd, addr); + if (!pte) + break; + } + + ptent = ptep_get_lockless(pte); - if (!is_swap_pte(*pte)) + if (!is_swap_pte(ptent)) continue; - entry = pte_to_swp_entry(*pte); + entry = pte_to_swp_entry(ptent); if (swp_type(entry) != type) continue; offset = swp_offset(entry); pte_unmap(pte); + pte = NULL; + folio = swap_cache_get_folio(entry, vma, addr); if (!folio) { struct page *page; @@ -1878,8 +1875,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!folio) { swp_count = READ_ONCE(si->swap_map[offset]); if (swp_count == 0 || swp_count == SWAP_MAP_BAD) - goto try_next; - + continue; return -ENOMEM; } @@ -1889,20 +1885,17 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (ret < 0) { folio_unlock(folio); folio_put(folio); - goto out; + return ret; } folio_free_swap(folio); folio_unlock(folio); folio_put(folio); -try_next: - pte = pte_offset_map(pmd, addr); - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + } while (addr += PAGE_SIZE, addr != end); - ret = 0; -out: - return ret; + if (pte) + pte_unmap(pte); + return 0; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, @@ -1917,8 +1910,6 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, do { cond_resched(); next = pmd_addr_end(addr, end); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) - continue; ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; @@ -3288,9 +3279,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unsigned char has_cache; int err; - p = get_swap_device(entry); - if (!p) - return -EINVAL; + p = swp_swap_info(entry); offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); @@ -3337,7 +3326,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); - put_swap_device(p); return err; } @@ -3468,11 +3456,6 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) goto out; } - /* - * We are fortunate that although vmalloc_to_page uses pte_offset_map, - * no architecture is using highmem pages for kernel page tables: so it - * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. - */ head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; |