From dd411433129c07a3bea43a0cc5ad9587bac1736a Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Wed, 8 Mar 2023 16:33:09 +0800
Subject: mm/damon/paddr: minor refactor of damon_pa_pageout()

Patch series "mm/damon/paddr: minor code improvement", v3.

Unify the folio_put() calls to make the code clearer, and also fix a
minor issue in damon_pa_young().

This patch (of 3):

Save three lines by unifying the folio_put() calls, and make the code
clearer.

Link: https://lkml.kernel.org/r/20230308083311.120951-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20230308083311.120951-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang
Reviewed-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 mm/damon/paddr.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index dd9c33fbe805..0db724aec5cb 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -238,21 +238,18 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
 		if (!folio)
 			continue;
 
-		if (damos_pa_filter_out(s, folio)) {
-			folio_put(folio);
-			continue;
-		}
+		if (damos_pa_filter_out(s, folio))
+			goto put_folio;
 
 		folio_clear_referenced(folio);
 		folio_test_clear_young(folio);
-		if (!folio_isolate_lru(folio)) {
-			folio_put(folio);
-			continue;
-		}
+		if (!folio_isolate_lru(folio))
+			goto put_folio;
 		if (folio_test_unevictable(folio))
 			folio_putback_lru(folio);
 		else
 			list_add(&folio->lru, &folio_list);
+put_folio:
 		folio_put(folio);
 	}
 	applied = reclaim_pages(&folio_list);
-- cgit v1.2.3

From b6993be23601c8bc992dc9743fbf78c1ff5d6b6a Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Wed, 8 Mar 2023 16:33:10 +0800
Subject: mm/damon/paddr: minor refactor of damon_pa_mark_accessed_or_deactivate()

Save one line by unifying the folio_put() calls, and make the code
clearer.

Link: https://lkml.kernel.org/r/20230308083311.120951-3-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang
Reviewed-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 mm/damon/paddr.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 0db724aec5cb..b22f6fbb5816 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -268,16 +268,15 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
 		if (!folio)
 			continue;
 
-		if (damos_pa_filter_out(s, folio)) {
-			folio_put(folio);
-			continue;
-		}
+		if (damos_pa_filter_out(s, folio))
+			goto put_folio;
 
 		if (mark_accessed)
 			folio_mark_accessed(folio);
 		else
 			folio_deactivate(folio);
 		applied += folio_nr_pages(folio);
+put_folio:
 		folio_put(folio);
 	}
 	return applied * PAGE_SIZE;
-- cgit v1.2.3

From 70307b0e297a6634be855c49525ccbea50922a90 Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Wed, 8 Mar 2023 16:33:11 +0800
Subject: mm/damon/paddr: fix missing folio_sz update in damon_pa_young()

The *folio_sz set by damon_pa_young() is used (as last_folio_sz) by
__damon_pa_check_access(), so it needs to be updated on every exit path;
fix the branch that was missing the update.
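For illustration, here is a minimal sketch of the shape damon_pa_young()
takes once this fix is applied (simplified, not the verbatim kernel
source: the rmap_walk() control structure 'rwc' and the
folio_mapped()/idle handling are elided). The point is that the trylock
early-exit now funnels through the same out: label that records the folio
size, so the caller's last_folio_sz can no longer go stale:

/* Sketch only -- simplified from mm/damon/paddr.c after this series. */
static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
{
	struct folio *folio = damon_get_folio(PHYS_PFN(paddr));
	bool need_lock, accessed = false;

	if (!folio)
		return false;

	/* ... folio_mapped()/idle handling and 'rwc' setup elided ... */

	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
	if (need_lock && !folio_trylock(folio))
		goto out;	/* used to return early, skipping *folio_sz */

	rmap_walk(folio, &rwc);	/* sets 'accessed' via the rmap callbacks */

	if (need_lock)
		folio_unlock(folio);

out:
	/* Reached on every path now, so last_folio_sz stays accurate. */
	*folio_sz = folio_size(folio);
	folio_put(folio);
	return accessed;
}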
Link: https://lkml.kernel.org/r/20230308083311.120951-4-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang
Reviewed-by: SeongJae Park
Signed-off-by: Andrew Morton
---
 mm/damon/paddr.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index b22f6fbb5816..467b99166b43 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -134,10 +134,8 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
 	}
 
 	need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
-	if (need_lock && !folio_trylock(folio)) {
-		folio_put(folio);
-		return false;
-	}
+	if (need_lock && !folio_trylock(folio))
+		goto out;
 
 	rmap_walk(folio, &rwc);
 
-- cgit v1.2.3

From 24139c07f413ef4b555482c758343d71392a19bc Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Sat, 22 Apr 2023 22:54:18 +0200
Subject: mm/ksm: unmerge and clear VM_MERGEABLE when setting PR_SET_MEMORY_MERGE=0

Patch series "mm/ksm: improve PR_SET_MEMORY_MERGE=0 handling and cleanup
disabling KSM", v2.

(1) Make PR_SET_MEMORY_MERGE=0 unmerge pages like setting
MADV_UNMERGEABLE does, (2) add a selftest for it, and (3) factor out
disabling of KSM from s390/gmap code.

This patch (of 3):

Let's unmerge any KSM pages when setting PR_SET_MEMORY_MERGE=0, and clear
the VM_MERGEABLE flag from all VMAs -- just like KSM would.  Of course,
only do that if we previously set PR_SET_MEMORY_MERGE=1.

Link: https://lkml.kernel.org/r/20230422205420.30372-1-david@redhat.com
Link: https://lkml.kernel.org/r/20230422205420.30372-2-david@redhat.com
Signed-off-by: David Hildenbrand
Acked-by: Stefan Roesch
Cc: Christian Borntraeger
Cc: Claudio Imbrenda
Cc: Heiko Carstens
Cc: Janosch Frank
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Rik van Riel
Cc: Shuah Khan
Cc: Sven Schnelle
Cc: Vasily Gorbik
Signed-off-by: Andrew Morton
---
 include/linux/ksm.h |  1 +
 kernel/sys.c        | 12 +++---------
 mm/ksm.c            | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7a9b76fb6c3f..429efa6ff4ae 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -21,6 +21,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 
 void ksm_add_vma(struct vm_area_struct *vma);
 int ksm_enable_merge_any(struct mm_struct *mm);
+int ksm_disable_merge_any(struct mm_struct *mm);
 
 int __ksm_enter(struct mm_struct *mm);
 void __ksm_exit(struct mm_struct *mm);
diff --git a/kernel/sys.c b/kernel/sys.c
index 72cdb16e2636..339fee3eff6a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2695,16 +2695,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		if (mmap_write_lock_killable(me->mm))
 			return -EINTR;
 
-		if (arg2) {
+		if (arg2)
 			error = ksm_enable_merge_any(me->mm);
-		} else {
-			/*
-			 * TODO: we might want disable KSM on all VMAs and
-			 * trigger unsharing to completely disable KSM.
-			 */
-			clear_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
-			error = 0;
-		}
+		else
+			error = ksm_disable_merge_any(me->mm);
 		mmap_write_unlock(me->mm);
 		break;
 	case PR_GET_MEMORY_MERGE:
diff --git a/mm/ksm.c b/mm/ksm.c
index 9e48258985d2..823bb3475a68 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2520,6 +2520,22 @@ static void __ksm_add_vma(struct vm_area_struct *vma)
 		vm_flags_set(vma, VM_MERGEABLE);
 }
 
+static int __ksm_del_vma(struct vm_area_struct *vma)
+{
+	int err;
+
+	if (!(vma->vm_flags & VM_MERGEABLE))
+		return 0;
+
+	if (vma->anon_vma) {
+		err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
+		if (err)
+			return err;
+	}
+
+	vm_flags_clear(vma, VM_MERGEABLE);
+	return 0;
+}
 /**
  * ksm_add_vma - Mark vma as mergeable if compatible
  *
@@ -2542,6 +2558,20 @@ static void ksm_add_vmas(struct mm_struct *mm)
 		__ksm_add_vma(vma);
 }
 
+static int ksm_del_vmas(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	int err;
+
+	VMA_ITERATOR(vmi, mm, 0);
+	for_each_vma(vmi, vma) {
+		err = __ksm_del_vma(vma);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 /**
  * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
  *			  compatible VMA's
@@ -2569,6 +2599,35 @@ int ksm_enable_merge_any(struct mm_struct *mm)
 	return 0;
 }
 
+/**
+ * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
+ *			   previously enabled via ksm_enable_merge_any().
+ *
+ * Disabling merging implies unmerging any merged pages, like setting
+ * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
+ * merging on all compatible VMA's remains enabled.
+ *
+ * @mm: Pointer to mm
+ *
+ * Returns 0 on success, otherwise error code
+ */
+int ksm_disable_merge_any(struct mm_struct *mm)
+{
+	int err;
+
+	if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+		return 0;
+
+	err = ksm_del_vmas(mm);
+	if (err) {
+		ksm_add_vmas(mm);
+		return err;
+	}
+
+	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+	return 0;
+}
+
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags)
 {
-- cgit v1.2.3

From 2c281f54f556e1f3266c8cb104adf9eea7a7b742 Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Sat, 22 Apr 2023 23:01:56 +0200
Subject: mm/ksm: move disabling KSM from s390/gmap code to KSM code

Let's factor out the actual disabling of KSM.  The existing
"mm->def_flags &= ~VM_MERGEABLE;" was essentially a NOP and can be
dropped, because def_flags should never include VM_MERGEABLE.  Note that
we don't currently prevent re-enabling KSM.

This should now be faster in case KSM was never enabled, because we only
conditionally iterate all VMAs.  Further, it certainly looks cleaner.
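For illustration, here is a minimal userspace sketch of the semantics the
two KSM patches above implement (illustrative only, not part of the
series; the fallback #defines are assumptions for pre-6.4 userspace
headers and mirror the values in include/uapi/linux/prctl.h):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE	67	/* assumption: 6.4 uapi values */
#define PR_GET_MEMORY_MERGE	68
#endif

int main(void)
{
	/* Opt the whole process into KSM. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		perror("enable");	/* e.g. kernel without CONFIG_KSM */

	printf("merge enabled: %d\n",
	       (int)prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));

	/*
	 * With these patches, disabling actively unmerges KSM pages and
	 * clears VM_MERGEABLE, so it can fail and report an error rather
	 * than silently leaving pages merged.
	 */
	if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0))
		perror("disable");
	return 0;
}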
Link: https://lkml.kernel.org/r/20230422210156.33630-1-david@redhat.com
Signed-off-by: David Hildenbrand
Acked-by: Janosch Frank
Acked-by: Stefan Roesch
Cc: Christian Borntraeger
Cc: Claudio Imbrenda
Cc: Heiko Carstens
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Rik van Riel
Cc: Shuah Khan
Cc: Sven Schnelle
Cc: Vasily Gorbik
Signed-off-by: Andrew Morton
---
 arch/s390/mm/gmap.c | 20 +-------------------
 include/linux/ksm.h |  6 ++++++
 mm/ksm.c            | 11 +++++++++++
 3 files changed, 18 insertions(+), 19 deletions(-)

(limited to 'mm')

diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 0949811761e6..dfe905c7bd8e 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2585,30 +2585,12 @@ EXPORT_SYMBOL_GPL(s390_enable_sie);
 
 int gmap_mark_unmergeable(void)
 {
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	unsigned long vm_flags;
-	int ret;
-	VMA_ITERATOR(vmi, mm, 0);
-
 	/*
 	 * Make sure to disable KSM (if enabled for the whole process or
 	 * individual VMAs). Note that nothing currently hinders user space
 	 * from re-enabling it.
 	 */
-	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
-
-	for_each_vma(vmi, vma) {
-		/* Copy vm_flags to avoid partial modifications in ksm_madvise */
-		vm_flags = vma->vm_flags;
-		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
-				  MADV_UNMERGEABLE, &vm_flags);
-		if (ret)
-			return ret;
-		vm_flags_reset(vma, vm_flags);
-	}
-	mm->def_flags &= ~VM_MERGEABLE;
-	return 0;
+	return ksm_disable(current->mm);
 }
 EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 429efa6ff4ae..899a314bc487 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -22,6 +22,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 void ksm_add_vma(struct vm_area_struct *vma);
 int ksm_enable_merge_any(struct mm_struct *mm);
 int ksm_disable_merge_any(struct mm_struct *mm);
+int ksm_disable(struct mm_struct *mm);
 
 int __ksm_enter(struct mm_struct *mm);
 void __ksm_exit(struct mm_struct *mm);
@@ -80,6 +81,11 @@ static inline void ksm_add_vma(struct vm_area_struct *vma)
 {
 }
 
+static inline int ksm_disable(struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
 	return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
index 823bb3475a68..0156bded3a66 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2628,6 +2628,17 @@ int ksm_disable_merge_any(struct mm_struct *mm)
 	return 0;
 }
 
+int ksm_disable(struct mm_struct *mm)
+{
+	mmap_assert_write_locked(mm);
+
+	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
+		return 0;
+	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+		return ksm_disable_merge_any(mm);
+	return ksm_del_vmas(mm);
+}
+
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags)
 {
-- cgit v1.2.3

From 65f67a3e002f0e5ae84a65be915248d57d480060 Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Tue, 25 Apr 2023 20:44:53 +0800
Subject: mm/page_alloc: add some comments to explain the possible hole in __pageblock_pfn_to_page()

Currently, __pageblock_pfn_to_page() is used by set_zone_contiguous(),
which checks whether the given zone contains holes; it uses
pfn_to_online_page() to validate that the start pfn is online and valid,
and pfn_valid() to validate the end pfn.  However,
__pageblock_pfn_to_page() may return non-NULL even if the end pfn of a
pageblock falls in a memory hole in some situations.
For example, if the pageblock order is MAX_ORDER, the pageblock will span
2 sub-sections, and the end pfn of the pageblock may be in a hole even
though the start pfn is online and valid.  See the memory layout below as
an example, and suppose the pageblock order is MAX_ORDER.

[ 0.000000] Zone ranges:
[ 0.000000]   DMA     [mem 0x0000000040000000-0x00000000ffffffff]
[ 0.000000]   DMA32   empty
[ 0.000000]   Normal  [mem 0x0000000100000000-0x0000001fa7ffffff]
[ 0.000000] Movable zone start for each node
[ 0.000000] Early memory node ranges
[ 0.000000]   node 0: [mem 0x0000000040000000-0x0000001fa3c7ffff]
[ 0.000000]   node 0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff]
[ 0.000000]   node 0: [mem 0x0000001fa4000000-0x0000001fa402ffff]
[ 0.000000]   node 0: [mem 0x0000001fa4030000-0x0000001fa40effff]
[ 0.000000]   node 0: [mem 0x0000001fa40f0000-0x0000001fa73cffff]
[ 0.000000]   node 0: [mem 0x0000001fa73d0000-0x0000001fa745ffff]
[ 0.000000]   node 0: [mem 0x0000001fa7460000-0x0000001fa746ffff]
[ 0.000000]   node 0: [mem 0x0000001fa7470000-0x0000001fa758ffff]
[ 0.000000]   node 0: [mem 0x0000001fa7590000-0x0000001fa7dfffff]

Focus on the last memory range, [mem 0x0000001fa7590000-0x0000001fa7dfffff]:
the zone extends to 0x0000001fa7ffffff, so there is a hole after this
range.  That means the last pageblock will contain the range from
0x1fa7c00000 to 0x1fa7ffffff, since the pageblock must be 4M aligned.  And
in this pageblock, these pfns will fall into 2 sub-sections (the
sub-section size is 2M).  So the 1st sub-section (covering the physical
range 0x1fa7c00000 - 0x1fa7dfffff) of this pageblock is marked valid by
subsection_map_init() in free_area_init(), but the 2nd sub-section
(covering the physical range 0x1fa7e00000 - 0x1fa7ffffff) is not.

This has not broken anything so far, but zone contiguity is fragile in
this scenario.  So, as per the previous discussion [1], it is better to
add some comments to explain this possible issue, in case future pfn
walkers come to rely on it.

[1] https://lore.kernel.org/all/87r0sdsmr6.fsf@yhuang6-desk2.ccr.corp.intel.com/

Link: https://lkml.kernel.org/r/5c26368865e79c743a453dea48d30670b19d2e4f.1682425534.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang
Acked-by: Michal Hocko
Reviewed-by: "Huang, Ying"
Cc: Baolin Wang
Cc: David Hildenbrand
Cc: Mel Gorman
Cc: Mike Rapoport (IBM)
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/page_alloc.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6da423ec356f..d71c80f29946 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1502,6 +1502,15 @@ void __free_pages_core(struct page *page, unsigned int order)
 * interleaving within a single pageblock. It is therefore sufficient to check
 * the first and last page of a pageblock and avoid checking each individual
 * page in a pageblock.
+ *
+ * Note: the function may return non-NULL struct page even for a pageblock
+ * that contains a memory hole (i.e. there is no physical memory for a subset
+ * of the pfn range). For example, if the pageblock order is MAX_ORDER, the
+ * pageblock will span 2 sub-sections, and the end pfn of the pageblock may
+ * be in a hole even though the start pfn is online and valid. This should be
+ * safe most of the time because struct pages are still initialized via
+ * init_unavailable_range() and pfn walkers shouldn't touch any physical memory
+ * range for which they do not recognize any specific metadata in struct pages.
 */
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
		unsigned long end_pfn, struct zone *zone)
-- cgit v1.2.3
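To make the hole scenario from the last patch concrete, here is a small
standalone sketch (illustrative only; the 4M pageblock and 2M sub-section
sizes mirror the commit message's example rather than kernel headers) that
computes where the last pageblock and its sub-sections fall:

#include <stdio.h>
#include <stdint.h>

#define PAGEBLOCK_BYTES		(4ULL << 20)	/* MAX_ORDER pageblock: 4M */
#define SUBSECTION_BYTES	(2ULL << 20)	/* sub-section: 2M */

int main(void)
{
	uint64_t last_present_end = 0x0000001fa7dfffffULL; /* last node range end */
	uint64_t zone_end = 0x0000001fa7ffffffULL;	   /* zone Normal end */
	uint64_t pb_start = last_present_end & ~(PAGEBLOCK_BYTES - 1);
	uint64_t pb_end = pb_start + PAGEBLOCK_BYTES - 1;

	printf("pageblock: 0x%llx - 0x%llx\n",
	       (unsigned long long)pb_start, (unsigned long long)pb_end);

	/* Walk the 2M sub-sections inside the 4M pageblock. */
	for (uint64_t s = pb_start; s < pb_end; s += SUBSECTION_BYTES)
		printf("  sub-section 0x%llx - 0x%llx: %s\n",
		       (unsigned long long)s,
		       (unsigned long long)(s + SUBSECTION_BYTES - 1),
		       s <= last_present_end ? "present" :
		       "hole (never marked present by subsection_map_init())");

	/* The pageblock's end is inside the zone but backed by no memory. */
	printf("end pfn in hole inside zone: %d\n",
	       pb_end > last_present_end && pb_end <= zone_end);
	return 0;
}

This prints the 0x1fa7c00000 - 0x1fa7ffffff pageblock with its first 2M
sub-section present and its second one in the hole, matching the layout
the new comment warns about.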