aboutsummaryrefslogtreecommitdiff
path: root/include/linux
diff options
context:
space:
mode:
authorOscar Salvador2021-05-04 18:39:42 -0700
committerLinus Torvalds2021-05-05 11:27:26 -0700
commita08a2ae3461383c2d50d0997dcc6cd1dd1fefb08 (patch)
tree1378519ee9afca95a301f8e4afca6dbd6f9981a4 /include/linux
parentf9901144e48f6a7ba186249add705d10e74738ec (diff)
mm,memory_hotplug: allocate memmap from the added memory range
Physical memory hotadd has to allocate a memmap (struct page array) for the newly added memory section. Currently, alloc_pages_node() is used for those allocations. This has some disadvantages: a) existing memory is consumed for that purpose (e.g. ~2MB per 128MB memory section on x86_64). This can even lead to extreme cases where the system goes OOM because the physically hotplugged memory depletes the available memory before it is onlined. b) if the whole node is movable then we have off-node struct pages, which have performance drawbacks. c) It might be that there are no PMD_ALIGNED chunks, so the memmap array gets populated with base pages. This can be improved when CONFIG_SPARSEMEM_VMEMMAP is enabled. Vmemmap page tables can map arbitrary memory. That means that we can reserve a part of the physically hotadded memory to back vmemmap page tables. This implementation uses the beginning of the hotplugged memory for that purpose. There are some non-obvious things to consider though. Vmemmap pages are allocated/freed during the memory hotplug events (add_memory_resource(), try_remove_memory()) when the memory is added/removed. This means that the reserved physical range is not online although it is used. The most obvious side effect is that pfn_to_online_page() returns NULL for those pfns. The current design expects that this should be OK as the hotplugged memory is considered garbage until it is onlined. For example, hibernation wouldn't save the content of those vmemmaps into the image so it wouldn't be restored on resume, but this should be OK as there is no real content to recover anyway while metadata is reachable from other data structures (e.g. vmemmap page tables). The reserved space is therefore (de)initialized during the {on,off}line events (mhp_{de}init_memmap_on_memory). That is done by extracting page allocator independent initialization from the regular onlining path. 
The primary reason to handle the reserved space outside of {on,off}line_pages is to make each initialization specific to the purpose rather than special case them in a single function. As per above, the functions that are introduced are: - mhp_init_memmap_on_memory: Initializes vmemmap pages by calling move_pfn_range_to_zone(), calls kasan_add_zero_shadow(), and onlines as many sections as vmemmap pages fully span. - mhp_deinit_memmap_on_memory: Offlines as many sections as vmemmap pages fully span, removes the range from the zone by remove_pfn_range_from_zone(), and calls kasan_remove_zero_shadow() for the range. The new function memory_block_online() calls mhp_init_memmap_on_memory() before doing the actual online_pages(). Should online_pages() fail, we clean up by calling mhp_deinit_memmap_on_memory(). Adjusting of present_pages is done at the end once we know that online_pages() succeeded. On offline, memory_block_offline() needs to unaccount vmemmap pages from present_pages before calling offline_pages(). This is necessary because offline_pages() tears down some structures based on the fact whether the node or the zone becomes empty. If offline_pages() fails, we account back vmemmap pages. If it succeeds, we call mhp_deinit_memmap_on_memory(). Hot-remove: We need to be careful when removing memory, as adding and removing memory needs to be done with the same granularity. To check that this assumption is not violated, we check the memory range we want to remove and if a) any memory block has vmemmap pages and b) the range spans more than a single memory block, we scream out loud and refuse to proceed. If all is good and the range was using memmap on memory (aka vmemmap pages), we construct an altmap structure so free_hugepage_table does the right thing and calls vmem_altmap_free instead of free_pagetable. 
Link: https://lkml.kernel.org/r/20210421102701.25051-5-osalvador@suse.de Signed-off-by: Oscar Salvador <osalvador@suse.de> Reviewed-by: David Hildenbrand <david@redhat.com> Acked-by: Michal Hocko <mhocko@suse.com> Cc: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/memory.h8
-rw-r--r--include/linux/memory_hotplug.h15
-rw-r--r--include/linux/memremap.h2
-rw-r--r--include/linux/mmzone.h7
4 files changed, 27 insertions, 5 deletions
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 4da95e684e20..97e92e8b556a 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -29,6 +29,11 @@ struct memory_block {
int online_type; /* for passing data to online routine */
int nid; /* NID for this memory block */
struct device dev;
+ /*
+ * Number of vmemmap pages. These pages
+ * lay at the beginning of the memory block.
+ */
+ unsigned long nr_vmemmap_pages;
};
int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v)
#else
extern int register_memory_notifier(struct notifier_block *nb);
extern void unregister_memory_notifier(struct notifier_block *nb);
-int create_memory_block_devices(unsigned long start, unsigned long size);
+int create_memory_block_devices(unsigned long start, unsigned long size,
+ unsigned long vmemmap_pages);
void remove_memory_block_devices(unsigned long start, unsigned long size);
extern void memory_dev_init(void);
extern int memory_notify(unsigned long val, void *v);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7288aa5ef73b..28f32fd00fe9 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -56,6 +56,14 @@ typedef int __bitwise mhp_t;
#define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0))
/*
+ * We want memmap (struct page array) to be self contained.
+ * To do so, we will use the beginning of the hot-added range to build
+ * the page tables for the memmap array that describes the entire range.
+ * Only selected architectures support it with SPARSE_VMEMMAP.
+ */
+#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1))
+
+/*
* Extended parameters for memory hotplug:
* altmap: alternative allocator for memmap array (optional)
* pgprot: page protection flags to apply to newly created page tables
@@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone)
extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
+extern void adjust_present_page_count(struct zone *zone, long nr_pages);
/* VM interface that may be used by firmware interface */
+extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
+ struct zone *zone);
+extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages);
extern int online_pages(unsigned long pfn, unsigned long nr_pages,
- int online_type, int nid);
+ struct zone *zone);
extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
unsigned long end_pfn);
extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_
extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
struct mhp_params *params);
void arch_remove_linear_mapping(u64 start, u64 size);
+extern bool mhp_supports_memmap_on_memory(unsigned long size);
#endif /* CONFIG_MEMORY_HOTPLUG */
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index f5b464daeeca..45a79da89c5f 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -17,7 +17,7 @@ struct device;
* @alloc: track pages consumed, private to vmemmap_populate()
*/
struct vmem_altmap {
- const unsigned long base_pfn;
+ unsigned long base_pfn;
const unsigned long end_pfn;
const unsigned long reserve;
unsigned long free;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e8922a67d1a4..917bd6c604d5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -436,6 +436,11 @@ enum zone_type {
* situations where ZERO_PAGE(0) which is allocated differently
* on different platforms may end up in a movable zone. ZERO_PAGE(0)
* cannot be migrated.
+ * 7. Memory-hotplug: when using memmap_on_memory and onlining the
+ * memory to the MOVABLE zone, the vmemmap pages are also placed in
+ * such zone. Such pages cannot be really moved around as they are
+ * self-stored in the range, but they are treated as movable when
+ * the range they describe is about to be offlined.
*
* In general, no unmovable allocations that degrade memory offlining
* should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
@@ -1392,10 +1397,8 @@ static inline int online_section_nr(unsigned long nr)
#ifdef CONFIG_MEMORY_HOTPLUG
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
-#ifdef CONFIG_MEMORY_HOTREMOVE
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#endif
-#endif
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{