aboutsummaryrefslogtreecommitdiff
path: root/mm/zsmalloc.c
diff options
context:
space:
mode:
authorLinus Torvalds2023-04-27 19:42:02 -0700
committerLinus Torvalds2023-04-27 19:42:02 -0700
commit7fa8a8ee9400fe8ec188426e40e481717bc5e924 (patch)
treecc8fd6b4f936ec01e73238643757451e20478c07 /mm/zsmalloc.c
parent91ec4b0d11fe115581ce2835300558802ce55e6c (diff)
parent4d4b6d66db63ceed399f1fb1a4b24081d2590eb1 (diff)
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - Nick Piggin's "shoot lazy tlbs" series, to improve the peformance of switching from a user process to a kernel thread. - More folio conversions from Kefeng Wang, Zhang Peng and Pankaj Raghav. - zsmalloc performance improvements from Sergey Senozhatsky. - Yue Zhao has found and fixed some data race issues around the alteration of memcg userspace tunables. - VFS rationalizations from Christoph Hellwig: - removal of most of the callers of write_one_page() - make __filemap_get_folio()'s return value more useful - Luis Chamberlain has changed tmpfs so it no longer requires swap backing. Use `mount -o noswap'. - Qi Zheng has made the slab shrinkers operate locklessly, providing some scalability benefits. - Keith Busch has improved dmapool's performance, making part of its operations O(1) rather than O(n). - Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultd, permitting userspace to wr-protect anon memory unpopulated ptes. - Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive rather than exclusive, and has fixed a bunch of errors which were caused by its unintuitive meaning. - Axel Rasmussen give userfaultfd the UFFDIO_CONTINUE_MODE_WP feature, which causes minor faults to install a write-protected pte. - Vlastimil Babka has done some maintenance work on vma_merge(): cleanups to the kernel code and improvements to our userspace test harness. - Cleanups to do_fault_around() by Lorenzo Stoakes. - Mike Rapoport has moved a lot of initialization code out of various mm/ files and into mm/mm_init.c. - Lorenzo Stoakes removd vmf_insert_mixed_prot(), which was added for DRM, but DRM doesn't use it any more. - Lorenzo has also coverted read_kcore() and vread() to use iterators and has thereby removed the use of bounce buffers in some cases. - Lorenzo has also contributed further cleanups of vma_merge(). - Chaitanya Prakash provides some fixes to the mmap selftesting code. - Matthew Wilcox changes xfs and afs so they no longer take sleeping locks in ->map_page(), a step towards RCUification of pagefaults. - Suren Baghdasaryan has improved mmap_lock scalability by switching to per-VMA locking. - Frederic Weisbecker has reworked the percpu cache draining so that it no longer causes latency glitches on cpu isolated workloads. - Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig logic. - Liu Shixin has changed zswap's initialization so we no longer waste a chunk of memory if zswap is not being used. - Yosry Ahmed has improved the performance of memcg statistics flushing. - David Stevens has fixed several issues involving khugepaged, userfaultfd and shmem. - Christoph Hellwig has provided some cleanup work to zram's IO-related code paths. - David Hildenbrand has fixed up some issues in the selftest code's testing of our pte state changing. - Pankaj Raghav has made page_endio() unneeded and has removed it. - Peter Xu contributed some rationalizations of the userfaultfd selftests. - Yosry Ahmed has fixed an issue around memcg's page recalim accounting. - Chaitanya Prakash has fixed some arm-related issues in the selftests/mm code. - Longlong Xia has improved the way in which KSM handles hwpoisoned pages. - Peter Xu fixes a few issues with uffd-wp at fork() time. - Stefan Roesch has changed KSM so that it may now be used on a per-process and per-cgroup basis. * tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits) mm,unmap: avoid flushing TLB in batch if PTE is inaccessible shmem: restrict noswap option to initial user namespace mm/khugepaged: fix conflicting mods to collapse_file() sparse: remove unnecessary 0 values from rc mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area() hugetlb: pte_alloc_huge() to replace huge pte_alloc_map() maple_tree: fix allocation in mas_sparse_area() mm: do not increment pgfault stats when page fault handler retries zsmalloc: allow only one active pool compaction context selftests/mm: add new selftests for KSM mm: add new KSM process and sysfs knobs mm: add new api to enable ksm per process mm: shrinkers: fix debugfs file permissions mm: don't check VMA write permissions if the PTE/PMD indicates write permissions migrate_pages_batch: fix statistics for longterm pin retry userfaultfd: use helper function range_in_vma() lib/show_mem.c: use for_each_populated_zone() simplify code mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list() fs/buffer: convert create_page_buffers to folio_create_buffers fs/buffer: add folio_create_empty_buffers helper ...
Diffstat (limited to 'mm/zsmalloc.c')
-rw-r--r--mm/zsmalloc.c370
1 files changed, 185 insertions, 185 deletions
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 3aed46ab7e6c..44ddaf5d601e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -127,7 +127,7 @@
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
#define HUGE_BITS 1
-#define FULLNESS_BITS 2
+#define FULLNESS_BITS 4
#define CLASS_BITS 8
#define ISOLATED_BITS 5
#define MAGIC_VAL_BITS 8
@@ -159,51 +159,44 @@
#define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \
ZS_SIZE_CLASS_DELTA) + 1)
+/*
+ * Pages are distinguished by the ratio of used memory (that is the ratio
+ * of ->inuse objects to all objects that page can store). For example,
+ * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%.
+ *
+ * The number of fullness groups is not random. It allows us to keep
+ * difference between the least busy page in the group (minimum permitted
+ * number of ->inuse objects) and the most busy page (maximum permitted
+ * number of ->inuse objects) at a reasonable value.
+ */
enum fullness_group {
- ZS_EMPTY,
- ZS_ALMOST_EMPTY,
- ZS_ALMOST_FULL,
- ZS_FULL,
- NR_ZS_FULLNESS,
+ ZS_INUSE_RATIO_0,
+ ZS_INUSE_RATIO_10,
+ /* NOTE: 8 more fullness groups here */
+ ZS_INUSE_RATIO_99 = 10,
+ ZS_INUSE_RATIO_100,
+ NR_FULLNESS_GROUPS,
};
enum class_stat_type {
- CLASS_EMPTY,
- CLASS_ALMOST_EMPTY,
- CLASS_ALMOST_FULL,
- CLASS_FULL,
- OBJ_ALLOCATED,
- OBJ_USED,
- NR_ZS_STAT_TYPE,
+ /* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */
+ ZS_OBJS_ALLOCATED = NR_FULLNESS_GROUPS,
+ ZS_OBJS_INUSE,
+ NR_CLASS_STAT_TYPES,
};
struct zs_size_stat {
- unsigned long objs[NR_ZS_STAT_TYPE];
+ unsigned long objs[NR_CLASS_STAT_TYPES];
};
#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif
-/*
- * We assign a page to ZS_ALMOST_EMPTY fullness group when:
- * n <= N / f, where
- * n = number of allocated objects
- * N = total number of objects zspage can store
- * f = fullness_threshold_frac
- *
- * Similarly, we assign zspage to:
- * ZS_ALMOST_FULL when n > N / f
- * ZS_EMPTY when n == 0
- * ZS_FULL when n == N
- *
- * (see: fix_fullness_group())
- */
-static const int fullness_threshold_frac = 4;
static size_t huge_class_size;
struct size_class {
- struct list_head fullness_list[NR_ZS_FULLNESS];
+ struct list_head fullness_list[NR_FULLNESS_GROUPS];
/*
* Size of objects stored in this class. Must be multiple
* of ZS_ALIGN.
@@ -271,6 +264,7 @@ struct zs_pool {
struct work_struct free_work;
#endif
spinlock_t lock;
+ atomic_t compaction_in_progress;
};
struct zspage {
@@ -547,8 +541,8 @@ static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
}
static void get_zspage_mapping(struct zspage *zspage,
- unsigned int *class_idx,
- enum fullness_group *fullness)
+ unsigned int *class_idx,
+ int *fullness)
{
BUG_ON(zspage->magic != ZSPAGE_MAGIC);
@@ -557,14 +551,14 @@ static void get_zspage_mapping(struct zspage *zspage,
}
static struct size_class *zspage_class(struct zs_pool *pool,
- struct zspage *zspage)
+ struct zspage *zspage)
{
return pool->size_class[zspage->class];
}
static void set_zspage_mapping(struct zspage *zspage,
- unsigned int class_idx,
- enum fullness_group fullness)
+ unsigned int class_idx,
+ int fullness)
{
zspage->class = class_idx;
zspage->fullness = fullness;
@@ -588,23 +582,19 @@ static int get_size_class_index(int size)
return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
-/* type can be of enum type class_stat_type or fullness_group */
static inline void class_stat_inc(struct size_class *class,
int type, unsigned long cnt)
{
class->stats.objs[type] += cnt;
}
-/* type can be of enum type class_stat_type or fullness_group */
static inline void class_stat_dec(struct size_class *class,
int type, unsigned long cnt)
{
class->stats.objs[type] -= cnt;
}
-/* type can be of enum type class_stat_type or fullness_group */
-static inline unsigned long zs_stat_get(struct size_class *class,
- int type)
+static inline unsigned long zs_stat_get(struct size_class *class, int type)
{
return class->stats.objs[type];
}
@@ -630,32 +620,38 @@ static unsigned long zs_can_compact(struct size_class *class);
static int zs_stats_size_show(struct seq_file *s, void *v)
{
- int i;
+ int i, fg;
struct zs_pool *pool = s->private;
struct size_class *class;
int objs_per_zspage;
- unsigned long class_almost_full, class_almost_empty;
unsigned long obj_allocated, obj_used, pages_used, freeable;
- unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
unsigned long total_freeable = 0;
+ unsigned long inuse_totals[NR_FULLNESS_GROUPS] = {0, };
- seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
- "class", "size", "almost_full", "almost_empty",
+ seq_printf(s, " %5s %5s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %13s %10s %10s %16s %8s\n",
+ "class", "size", "10%", "20%", "30%", "40%",
+ "50%", "60%", "70%", "80%", "90%", "99%", "100%",
"obj_allocated", "obj_used", "pages_used",
"pages_per_zspage", "freeable");
for (i = 0; i < ZS_SIZE_CLASSES; i++) {
+
class = pool->size_class[i];
if (class->index != i)
continue;
spin_lock(&pool->lock);
- class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
- class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
- obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
- obj_used = zs_stat_get(class, OBJ_USED);
+
+ seq_printf(s, " %5u %5u ", i, class->size);
+ for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) {
+ inuse_totals[fg] += zs_stat_get(class, fg);
+ seq_printf(s, "%9lu ", zs_stat_get(class, fg));
+ }
+
+ obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
+ obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
freeable = zs_can_compact(class);
spin_unlock(&pool->lock);
@@ -663,14 +659,10 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage;
- seq_printf(s, " %5u %5u %11lu %12lu %13lu"
- " %10lu %10lu %16d %8lu\n",
- i, class->size, class_almost_full, class_almost_empty,
- obj_allocated, obj_used, pages_used,
- class->pages_per_zspage, freeable);
+ seq_printf(s, "%13lu %10lu %10lu %16d %8lu\n",
+ obj_allocated, obj_used, pages_used,
+ class->pages_per_zspage, freeable);
- total_class_almost_full += class_almost_full;
- total_class_almost_empty += class_almost_empty;
total_objs += obj_allocated;
total_used_objs += obj_used;
total_pages += pages_used;
@@ -678,10 +670,14 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
}
seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
- "Total", "", total_class_almost_full,
- total_class_almost_empty, total_objs,
- total_used_objs, total_pages, "", total_freeable);
+ seq_printf(s, " %5s %5s ", "Total", "");
+
+ for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++)
+ seq_printf(s, "%9lu ", inuse_totals[fg]);
+
+ seq_printf(s, "%13lu %10lu %10lu %16s %8lu\n",
+ total_objs, total_used_objs, total_pages, "",
+ total_freeable);
return 0;
}
@@ -726,30 +722,28 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
/*
* For each size class, zspages are divided into different groups
- * depending on how "full" they are. This was done so that we could
- * easily find empty or nearly empty zspages when we try to shrink
- * the pool (not yet implemented). This function returns fullness
+ * depending on their usage ratio. This function returns fullness
* status of the given page.
*/
-static enum fullness_group get_fullness_group(struct size_class *class,
- struct zspage *zspage)
+static int get_fullness_group(struct size_class *class, struct zspage *zspage)
{
- int inuse, objs_per_zspage;
- enum fullness_group fg;
+ int inuse, objs_per_zspage, ratio;
inuse = get_zspage_inuse(zspage);
objs_per_zspage = class->objs_per_zspage;
if (inuse == 0)
- fg = ZS_EMPTY;
- else if (inuse == objs_per_zspage)
- fg = ZS_FULL;
- else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
- fg = ZS_ALMOST_EMPTY;
- else
- fg = ZS_ALMOST_FULL;
+ return ZS_INUSE_RATIO_0;
+ if (inuse == objs_per_zspage)
+ return ZS_INUSE_RATIO_100;
- return fg;
+ ratio = 100 * inuse / objs_per_zspage;
+ /*
+ * Take integer division into consideration: a page with one inuse
+ * object out of 127 possible, will end up having 0 usage ratio,
+ * which is wrong as it belongs in ZS_INUSE_RATIO_10 fullness group.
+ */
+ return ratio / 10 + 1;
}
/*
@@ -760,21 +754,10 @@ static enum fullness_group get_fullness_group(struct size_class *class,
*/
static void insert_zspage(struct size_class *class,
struct zspage *zspage,
- enum fullness_group fullness)
+ int fullness)
{
- struct zspage *head;
-
class_stat_inc(class, fullness, 1);
- head = list_first_entry_or_null(&class->fullness_list[fullness],
- struct zspage, list);
- /*
- * We want to see more ZS_FULL pages and less almost empty/full.
- * Put pages with higher ->inuse first.
- */
- if (head && get_zspage_inuse(zspage) < get_zspage_inuse(head))
- list_add(&zspage->list, &head->list);
- else
- list_add(&zspage->list, &class->fullness_list[fullness]);
+ list_add(&zspage->list, &class->fullness_list[fullness]);
}
/*
@@ -783,7 +766,7 @@ static void insert_zspage(struct size_class *class,
*/
static void remove_zspage(struct size_class *class,
struct zspage *zspage,
- enum fullness_group fullness)
+ int fullness)
{
VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
@@ -794,17 +777,16 @@ static void remove_zspage(struct size_class *class,
/*
* Each size class maintains zspages in different fullness groups depending
* on the number of live objects they contain. When allocating or freeing
- * objects, the fullness status of the page can change, say, from ALMOST_FULL
- * to ALMOST_EMPTY when freeing an object. This function checks if such
- * a status change has occurred for the given page and accordingly moves the
- * page from the freelist of the old fullness group to that of the new
- * fullness group.
+ * objects, the fullness status of the page can change, for instance, from
+ * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function
+ * checks if such a status change has occurred for the given page and
+ * accordingly moves the page from the list of the old fullness group to that
+ * of the new fullness group.
*/
-static enum fullness_group fix_fullness_group(struct size_class *class,
- struct zspage *zspage)
+static int fix_fullness_group(struct size_class *class, struct zspage *zspage)
{
int class_idx;
- enum fullness_group currfg, newfg;
+ int currfg, newfg;
get_zspage_mapping(zspage, &class_idx, &currfg);
newfg = get_fullness_group(class, zspage);
@@ -977,7 +959,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
struct page *page, *next;
- enum fullness_group fg;
+ int fg;
unsigned int class_idx;
get_zspage_mapping(zspage, &class_idx, &fg);
@@ -985,7 +967,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
assert_spin_locked(&pool->lock);
VM_BUG_ON(get_zspage_inuse(zspage));
- VM_BUG_ON(fg != ZS_EMPTY);
+ VM_BUG_ON(fg != ZS_INUSE_RATIO_0);
/* Free all deferred handles from zs_free */
free_handles(pool, class, zspage);
@@ -1003,9 +985,8 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
cache_free_zspage(pool, zspage);
- class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
- atomic_long_sub(class->pages_per_zspage,
- &pool->pages_allocated);
+ class_stat_dec(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+ atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
}
static void free_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1024,7 +1005,7 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
return;
}
- remove_zspage(class, zspage, ZS_EMPTY);
+ remove_zspage(class, zspage, ZS_INUSE_RATIO_0);
#ifdef CONFIG_ZPOOL
list_del(&zspage->lru);
#endif
@@ -1160,9 +1141,9 @@ static struct zspage *find_get_zspage(struct size_class *class)
int i;
struct zspage *zspage;
- for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
+ for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) {
zspage = list_first_entry_or_null(&class->fullness_list[i],
- struct zspage, list);
+ struct zspage, list);
if (zspage)
break;
}
@@ -1521,7 +1502,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
{
unsigned long handle, obj;
struct size_class *class;
- enum fullness_group newfg;
+ int newfg;
struct zspage *zspage;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
@@ -1543,7 +1524,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(class, zspage);
record_obj(handle, obj);
- class_stat_inc(class, OBJ_USED, 1);
+ class_stat_inc(class, ZS_OBJS_INUSE, 1);
spin_unlock(&pool->lock);
return handle;
@@ -1563,10 +1544,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
insert_zspage(class, zspage, newfg);
set_zspage_mapping(zspage, class->index, newfg);
record_obj(handle, obj);
- atomic_long_add(class->pages_per_zspage,
- &pool->pages_allocated);
- class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
- class_stat_inc(class, OBJ_USED, 1);
+ atomic_long_add(class->pages_per_zspage, &pool->pages_allocated);
+ class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
+ class_stat_inc(class, ZS_OBJS_INUSE, 1);
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
@@ -1622,7 +1602,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
struct page *f_page;
unsigned long obj;
struct size_class *class;
- enum fullness_group fullness;
+ int fullness;
if (IS_ERR_OR_NULL((void *)handle))
return;
@@ -1637,7 +1617,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
zspage = get_zspage(f_page);
class = zspage_class(pool, zspage);
- class_stat_dec(class, OBJ_USED, 1);
+ class_stat_dec(class, ZS_OBJS_INUSE, 1);
#ifdef CONFIG_ZPOOL
if (zspage->under_reclaim) {
@@ -1655,7 +1635,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
obj_free(class->size, obj, NULL);
fullness = fix_fullness_group(class, zspage);
- if (fullness == ZS_EMPTY)
+ if (fullness == ZS_INUSE_RATIO_0)
free_zspage(pool, class, zspage);
spin_unlock(&pool->lock);
@@ -1796,15 +1776,14 @@ struct zs_compact_control {
int obj_idx;
};
-static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
- struct zs_compact_control *cc)
+static void migrate_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zs_compact_control *cc)
{
unsigned long used_obj, free_obj;
unsigned long handle;
struct page *s_page = cc->s_page;
struct page *d_page = cc->d_page;
int obj_idx = cc->obj_idx;
- int ret = 0;
while (1) {
handle = find_alloced_obj(class, s_page, &obj_idx);
@@ -1817,10 +1796,8 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
}
/* Stop if there is no more space */
- if (zspage_full(class, get_zspage(d_page))) {
- ret = -ENOMEM;
+ if (zspage_full(class, get_zspage(d_page)))
break;
- }
used_obj = handle_to_obj(handle);
free_obj = obj_malloc(pool, get_zspage(d_page), handle);
@@ -1833,26 +1810,35 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
/* Remember last position in this iteration */
cc->s_page = s_page;
cc->obj_idx = obj_idx;
-
- return ret;
}
-static struct zspage *isolate_zspage(struct size_class *class, bool source)
+static struct zspage *isolate_src_zspage(struct size_class *class)
{
- int i;
struct zspage *zspage;
- enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL};
+ int fg;
- if (!source) {
- fg[0] = ZS_ALMOST_FULL;
- fg[1] = ZS_ALMOST_EMPTY;
+ for (fg = ZS_INUSE_RATIO_10; fg <= ZS_INUSE_RATIO_99; fg++) {
+ zspage = list_first_entry_or_null(&class->fullness_list[fg],
+ struct zspage, list);
+ if (zspage) {
+ remove_zspage(class, zspage, fg);
+ return zspage;
+ }
}
- for (i = 0; i < 2; i++) {
- zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
- struct zspage, list);
+ return zspage;
+}
+
+static struct zspage *isolate_dst_zspage(struct size_class *class)
+{
+ struct zspage *zspage;
+ int fg;
+
+ for (fg = ZS_INUSE_RATIO_99; fg >= ZS_INUSE_RATIO_10; fg--) {
+ zspage = list_first_entry_or_null(&class->fullness_list[fg],
+ struct zspage, list);
if (zspage) {
- remove_zspage(class, zspage, fg[i]);
+ remove_zspage(class, zspage, fg);
return zspage;
}
}
@@ -1865,12 +1851,11 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source)
* @class: destination class
* @zspage: target page
*
- * Return @zspage's fullness_group
+ * Return @zspage's fullness status
*/
-static enum fullness_group putback_zspage(struct size_class *class,
- struct zspage *zspage)
+static int putback_zspage(struct size_class *class, struct zspage *zspage)
{
- enum fullness_group fullness;
+ int fullness;
fullness = get_fullness_group(class, zspage);
insert_zspage(class, zspage, fullness);
@@ -2134,7 +2119,7 @@ static void async_free_zspage(struct work_struct *work)
int i;
struct size_class *class;
unsigned int class_idx;
- enum fullness_group fullness;
+ int fullness;
struct zspage *zspage, *tmp;
LIST_HEAD(free_pages);
struct zs_pool *pool = container_of(work, struct zs_pool,
@@ -2146,7 +2131,8 @@ static void async_free_zspage(struct work_struct *work)
continue;
spin_lock(&pool->lock);
- list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
+ list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0],
+ &free_pages);
spin_unlock(&pool->lock);
}
@@ -2155,7 +2141,7 @@ static void async_free_zspage(struct work_struct *work)
lock_zspage(zspage);
get_zspage_mapping(zspage, &class_idx, &fullness);
- VM_BUG_ON(fullness != ZS_EMPTY);
+ VM_BUG_ON(fullness != ZS_INUSE_RATIO_0);
class = pool->size_class[class_idx];
spin_lock(&pool->lock);
#ifdef CONFIG_ZPOOL
@@ -2203,8 +2189,8 @@ static inline void zs_flush_migration(struct zs_pool *pool) { }
static unsigned long zs_can_compact(struct size_class *class)
{
unsigned long obj_wasted;
- unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
- unsigned long obj_used = zs_stat_get(class, OBJ_USED);
+ unsigned long obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED);
+ unsigned long obj_used = zs_stat_get(class, ZS_OBJS_INUSE);
if (obj_allocated <= obj_used)
return 0;
@@ -2219,7 +2205,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
struct size_class *class)
{
struct zs_compact_control cc;
- struct zspage *src_zspage;
+ struct zspage *src_zspage = NULL;
struct zspage *dst_zspage = NULL;
unsigned long pages_freed = 0;
@@ -2228,50 +2214,45 @@ static unsigned long __zs_compact(struct zs_pool *pool,
* as well as zpage allocation/free
*/
spin_lock(&pool->lock);
- while ((src_zspage = isolate_zspage(class, true))) {
- /* protect someone accessing the zspage(i.e., zs_map_object) */
- migrate_write_lock(src_zspage);
+ while (zs_can_compact(class)) {
+ int fg;
- if (!zs_can_compact(class))
+ if (!dst_zspage) {
+ dst_zspage = isolate_dst_zspage(class);
+ if (!dst_zspage)
+ break;
+ migrate_write_lock(dst_zspage);
+ cc.d_page = get_first_page(dst_zspage);
+ }
+
+ src_zspage = isolate_src_zspage(class);
+ if (!src_zspage)
break;
+ migrate_write_lock_nested(src_zspage);
+
cc.obj_idx = 0;
cc.s_page = get_first_page(src_zspage);
+ migrate_zspage(pool, class, &cc);
+ fg = putback_zspage(class, src_zspage);
+ migrate_write_unlock(src_zspage);
- while ((dst_zspage = isolate_zspage(class, false))) {
- migrate_write_lock_nested(dst_zspage);
-
- cc.d_page = get_first_page(dst_zspage);
- /*
- * If there is no more space in dst_page, resched
- * and see if anyone had allocated another zspage.
- */
- if (!migrate_zspage(pool, class, &cc))
- break;
+ if (fg == ZS_INUSE_RATIO_0) {
+ free_zspage(pool, class, src_zspage);
+ pages_freed += class->pages_per_zspage;
+ }
+ src_zspage = NULL;
+ if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
+ || spin_is_contended(&pool->lock)) {
putback_zspage(class, dst_zspage);
migrate_write_unlock(dst_zspage);
dst_zspage = NULL;
- if (spin_is_contended(&pool->lock))
- break;
- }
-
- /* Stop if we couldn't find slot */
- if (dst_zspage == NULL)
- break;
- putback_zspage(class, dst_zspage);
- migrate_write_unlock(dst_zspage);
-
- if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
- migrate_write_unlock(src_zspage);
- free_zspage(pool, class, src_zspage);
- pages_freed += class->pages_per_zspage;
- } else
- migrate_write_unlock(src_zspage);
- spin_unlock(&pool->lock);
- cond_resched();
- spin_lock(&pool->lock);
+ spin_unlock(&pool->lock);
+ cond_resched();
+ spin_lock(&pool->lock);
+ }
}
if (src_zspage) {
@@ -2279,6 +2260,10 @@ static unsigned long __zs_compact(struct zs_pool *pool,
migrate_write_unlock(src_zspage);
}
+ if (dst_zspage) {
+ putback_zspage(class, dst_zspage);
+ migrate_write_unlock(dst_zspage);
+ }
spin_unlock(&pool->lock);
return pages_freed;
@@ -2290,6 +2275,15 @@ unsigned long zs_compact(struct zs_pool *pool)
struct size_class *class;
unsigned long pages_freed = 0;
+ /*
+ * Pool compaction is performed under pool->lock so it is basically
+ * single-threaded. Having more than one thread in __zs_compact()
+ * will increase pool->lock contention, which will impact other
+ * zsmalloc operations that need pool->lock.
+ */
+ if (atomic_xchg(&pool->compaction_in_progress, 1))
+ return 0;
+
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
if (class->index != i)
@@ -2297,6 +2291,7 @@ unsigned long zs_compact(struct zs_pool *pool)
pages_freed += __zs_compact(pool, class);
}
atomic_long_add(pages_freed, &pool->stats.pages_compacted);
+ atomic_set(&pool->compaction_in_progress, 0);
return pages_freed;
}
@@ -2404,6 +2399,7 @@ struct zs_pool *zs_create_pool(const char *name)
init_deferred_free(pool);
spin_lock_init(&pool->lock);
+ atomic_set(&pool->compaction_in_progress, 0);
pool->name = kstrdup(name, GFP_KERNEL);
if (!pool->name)
@@ -2421,7 +2417,7 @@ struct zs_pool *zs_create_pool(const char *name)
int pages_per_zspage;
int objs_per_zspage;
struct size_class *class;
- int fullness = 0;
+ int fullness;
size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
if (size > ZS_MAX_ALLOC_SIZE)
@@ -2475,9 +2471,12 @@ struct zs_pool *zs_create_pool(const char *name)
class->pages_per_zspage = pages_per_zspage;
class->objs_per_zspage = objs_per_zspage;
pool->size_class[i] = class;
- for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
- fullness++)
+
+ fullness = ZS_INUSE_RATIO_0;
+ while (fullness < NR_FULLNESS_GROUPS) {
INIT_LIST_HEAD(&class->fullness_list[fullness]);
+ fullness++;
+ }
prev_class = class;
}
@@ -2523,11 +2522,12 @@ void zs_destroy_pool(struct zs_pool *pool)
if (class->index != i)
continue;
- for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
- if (!list_empty(&class->fullness_list[fg])) {
- pr_info("Freeing non-empty class with size %db, fullness group %d\n",
- class->size, fg);
- }
+ for (fg = ZS_INUSE_RATIO_0; fg < NR_FULLNESS_GROUPS; fg++) {
+ if (list_empty(&class->fullness_list[fg]))
+ continue;
+
+ pr_err("Class-%d fullness group %d is not empty\n",
+ class->size, fg);
}
kfree(class);
}
@@ -2629,7 +2629,7 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
unsigned long handle;
struct zspage *zspage;
struct page *page;
- enum fullness_group fullness;
+ int fullness;
/* Lock LRU and fullness list */
spin_lock(&pool->lock);
@@ -2699,7 +2699,7 @@ next:
* while the page is removed from the pool. Fix it
* up for the check in __free_zspage().
*/
- zspage->fullness = ZS_EMPTY;
+ zspage->fullness = ZS_INUSE_RATIO_0;
__free_zspage(pool, class, zspage);
spin_unlock(&pool->lock);