aboutsummaryrefslogtreecommitdiff
path: root/mm/zswap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/zswap.c')
-rw-r--r--mm/zswap.c389
1 files changed, 149 insertions, 240 deletions
diff --git a/mm/zswap.c b/mm/zswap.c
index 6f8850c44b61..a50e2986cd2f 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -20,7 +20,6 @@
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
-#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
@@ -43,8 +42,6 @@
/*********************************
* statistics
**********************************/
-/* Total bytes used by the compressed storage */
-u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
@@ -126,19 +123,6 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
uint, 0644);
-/*
- * Enable/disable handling same-value filled pages (enabled by default).
- * If disabled every page is considered non-same-value filled.
- */
-static bool zswap_same_filled_pages_enabled = true;
-module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
- bool, 0644);
-
-/* Enable/disable handling non-same-value filled pages (enabled by default) */
-static bool zswap_non_same_filled_pages_enabled = true;
-module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
- bool, 0644);
-
/* Number of zpools in zswap_pool (empirically determined for scalability) */
#define ZSWAP_NR_ZPOOLS 32
@@ -183,8 +167,6 @@ struct zswap_pool {
/* Global LRU lists shared by all zswap pools. */
static struct list_lru zswap_list_lru;
-/* counter of pages stored in all zswap pools. */
-static atomic_t zswap_nr_stored = ATOMIC_INIT(0);
/* The lock protects zswap_next_shrink updates. */
static DEFINE_SPINLOCK(zswap_shrink_lock);
@@ -198,7 +180,6 @@ static struct shrinker *zswap_shrinker;
* This structure contains the metadata for tracking a single compressed
* page within zswap.
*
- * rbnode - links the entry into red-black tree for the appropriate swap type
* swpentry - associated swap entry, the offset indexes into the red-black tree
* length - the length in bytes of the compressed page data. Needed during
* decompression. For a same value filled page length is 0, and both
@@ -210,7 +191,6 @@ static struct shrinker *zswap_shrinker;
* lru - handle to the pool's lru used to evict pages.
*/
struct zswap_entry {
- struct rb_node rbnode;
swp_entry_t swpentry;
unsigned int length;
struct zswap_pool *pool;
@@ -222,12 +202,7 @@ struct zswap_entry {
struct list_head lru;
};
-struct zswap_tree {
- struct rb_root rbroot;
- spinlock_t lock;
-};
-
-static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
+static struct xarray *zswap_trees[MAX_SWAPFILES];
static unsigned int nr_zswap_trees[MAX_SWAPFILES];
/* RCU-protected iteration */
@@ -255,7 +230,7 @@ static bool zswap_has_pool;
* helpers and fwd declarations
**********************************/
-static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
+static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
{
return &zswap_trees[swp_type(swp)][swp_offset(swp)
>> SWAP_ADDRESS_SPACE_SHIFT];
@@ -265,45 +240,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
zpool_get_type((p)->zpools[0]))
-static bool zswap_is_full(void)
-{
- return totalram_pages() * zswap_max_pool_percent / 100 <
- DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
-}
-
-static bool zswap_can_accept(void)
-{
- return totalram_pages() * zswap_accept_thr_percent / 100 *
- zswap_max_pool_percent / 100 >
- DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
-}
-
-static u64 get_zswap_pool_size(struct zswap_pool *pool)
-{
- u64 pool_size = 0;
- int i;
-
- for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
- pool_size += zpool_get_total_size(pool->zpools[i]);
-
- return pool_size;
-}
-
-static void zswap_update_total_size(void)
-{
- struct zswap_pool *pool;
- u64 total = 0;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(pool, &zswap_pools, list)
- total += get_zswap_pool_size(pool);
-
- rcu_read_unlock();
-
- zswap_pool_total_size = total;
-}
-
/*********************************
* pool functions
**********************************/
@@ -541,6 +477,48 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}
+/*
+ * Upper bound on the zswap pool size, in pages: zswap_max_pool_percent
+ * percent of totalram_pages().
+ */
+static unsigned long zswap_max_pages(void)
+{
+	unsigned long ram_pages = totalram_pages();
+
+	return ram_pages * zswap_max_pool_percent / 100;
+}
+
+/*
+ * Acceptance threshold, in pages: zswap_accept_thr_percent percent of
+ * zswap_max_pages().
+ */
+static unsigned long zswap_accept_thr_pages(void)
+{
+	unsigned long limit = zswap_max_pages();
+
+	return limit * zswap_accept_thr_percent / 100;
+}
+
+/*
+ * Sum zpool_get_total_pages() over every zpool of every pool on the
+ * zswap_pools list. The list is walked under rcu_read_lock().
+ *
+ * Returns the accumulated page count.
+ */
+unsigned long zswap_total_pages(void)
+{
+	unsigned long nr_pages = 0;
+	struct zswap_pool *p;
+	int z;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(p, &zswap_pools, list) {
+		for (z = 0; z < ZSWAP_NR_ZPOOLS; z++)
+			nr_pages += zpool_get_total_pages(p->zpools[z]);
+	}
+	rcu_read_unlock();
+
+	return nr_pages;
+}
+
+/*
+ * Update the pool-full state from the current pool size and report it.
+ *
+ * At or above zswap_max_pages() the limit-hit counter is bumped and the
+ * full flag is set. Once set, the flag is only cleared when the pool size
+ * is at or below zswap_accept_thr_pages(), so sizes between the two bounds
+ * leave it unchanged.
+ *
+ * Returns the resulting value of zswap_pool_reached_full.
+ */
+static bool zswap_check_limits(void)
+{
+	unsigned long nr_pages = zswap_total_pages();
+	unsigned long limit = zswap_max_pages();
+	bool over_limit = nr_pages >= limit;
+
+	if (over_limit) {
+		zswap_pool_limit_hit++;
+		zswap_pool_reached_full = true;
+	}
+
+	/* The threshold is only computed when the flag could be cleared. */
+	if (!over_limit && zswap_pool_reached_full &&
+	    nr_pages <= zswap_accept_thr_pages())
+		zswap_pool_reached_full = false;
+
+	return zswap_pool_reached_full;
+}
+
/*********************************
* param callbacks
**********************************/
@@ -807,63 +785,6 @@ void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
}
/*********************************
-* rbtree functions
-**********************************/
-static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
-{
- struct rb_node *node = root->rb_node;
- struct zswap_entry *entry;
- pgoff_t entry_offset;
-
- while (node) {
- entry = rb_entry(node, struct zswap_entry, rbnode);
- entry_offset = swp_offset(entry->swpentry);
- if (entry_offset > offset)
- node = node->rb_left;
- else if (entry_offset < offset)
- node = node->rb_right;
- else
- return entry;
- }
- return NULL;
-}
-
-/*
- * In the case that a entry with the same offset is found, a pointer to
- * the existing entry is stored in dupentry and the function returns -EEXIST
- */
-static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
- struct zswap_entry **dupentry)
-{
- struct rb_node **link = &root->rb_node, *parent = NULL;
- struct zswap_entry *myentry;
- pgoff_t myentry_offset, entry_offset = swp_offset(entry->swpentry);
-
- while (*link) {
- parent = *link;
- myentry = rb_entry(parent, struct zswap_entry, rbnode);
- myentry_offset = swp_offset(myentry->swpentry);
- if (myentry_offset > entry_offset)
- link = &(*link)->rb_left;
- else if (myentry_offset < entry_offset)
- link = &(*link)->rb_right;
- else {
- *dupentry = myentry;
- return -EEXIST;
- }
- }
- rb_link_node(&entry->rbnode, parent, link);
- rb_insert_color(&entry->rbnode, root);
- return 0;
-}
-
-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
-{
- rb_erase(&entry->rbnode, root);
- RB_CLEAR_NODE(&entry->rbnode);
-}
-
-/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;
@@ -874,7 +795,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
if (!entry)
return NULL;
- RB_CLEAR_NODE(&entry->rbnode);
return entry;
}
@@ -885,12 +805,7 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
{
- int i = 0;
-
- if (ZSWAP_NR_ZPOOLS > 1)
- i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS));
-
- return entry->pool->zpools[i];
+ return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))];
}
/*
@@ -904,7 +819,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
else {
zswap_lru_del(&zswap_list_lru, entry);
zpool_free(zswap_find_zpool(entry), entry->handle);
- atomic_dec(&zswap_nr_stored);
zswap_pool_put(entry->pool);
}
if (entry->objcg) {
@@ -913,18 +827,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
}
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
- zswap_update_total_size();
-}
-
-/*
- * The caller hold the tree lock and search the entry from the tree,
- * so it must be on the tree, remove it from the tree and free it.
- */
-static void zswap_invalidate_entry(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- zswap_rb_erase(&tree->rbroot, entry);
- zswap_entry_free(entry);
}
/*********************************
@@ -1126,7 +1028,8 @@ static void zswap_decompress(struct zswap_entry *entry, struct page *page)
static int zswap_writeback_entry(struct zswap_entry *entry,
swp_entry_t swpentry)
{
- struct zswap_tree *tree;
+ struct xarray *tree;
+ pgoff_t offset = swp_offset(swpentry);
struct folio *folio;
struct mempolicy *mpol;
bool folio_was_allocated;
@@ -1163,19 +1066,13 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
* be dereferenced.
*/
tree = swap_zswap_tree(swpentry);
- spin_lock(&tree->lock);
- if (zswap_rb_search(&tree->rbroot, swp_offset(swpentry)) != entry) {
- spin_unlock(&tree->lock);
+ if (entry != xa_cmpxchg(tree, offset, entry, NULL, GFP_KERNEL)) {
delete_from_swap_cache(folio);
folio_unlock(folio);
folio_put(folio);
return -ENOMEM;
}
- /* Safe to deref entry after the entry is verified above. */
- zswap_rb_erase(&tree->rbroot, entry);
- spin_unlock(&tree->lock);
-
zswap_decompress(entry, &folio->page);
count_vm_event(ZSWPWB);
@@ -1344,8 +1241,8 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
} else {
- nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
- nr_stored = atomic_read(&zswap_nr_stored);
+ nr_backing = zswap_total_pages();
+ nr_stored = atomic_read(&zswap_stored_pages);
}
if (!nr_stored)
@@ -1365,6 +1262,11 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
* This ensures that the better zswap compresses memory, the fewer
* pages we will evict to swap (as it will otherwise incur IO for
* relatively small memory saving).
+ *
+ * The memory saving factor calculated here takes same-filled pages into
+ * account, but those are not freeable since they occupy almost no
+ * space. Hence, we may scale nr_freeable down a little bit more than we
+ * should if we have a lot of same-filled pages.
*/
return mult_frac(nr_freeable, nr_backing, nr_stored);
}
@@ -1412,6 +1314,10 @@ static void shrink_worker(struct work_struct *w)
{
struct mem_cgroup *memcg;
int ret, failures = 0;
+ unsigned long thr;
+
+ /* Reclaim down to the accept threshold */
+ thr = zswap_accept_thr_pages();
/* global reclaim will select cgroup in a round-robin fashion. */
do {
@@ -1459,32 +1365,37 @@ static void shrink_worker(struct work_struct *w)
break;
if (ret && ++failures == MAX_RECLAIM_RETRIES)
break;
-
resched:
cond_resched();
- } while (!zswap_can_accept());
+ } while (zswap_total_pages() > thr);
}
-static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
+/*********************************
+* same-filled functions
+**********************************/
+static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value)
{
unsigned long *page;
unsigned long val;
unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
+ bool ret = false;
- page = (unsigned long *)ptr;
+ page = kmap_local_folio(folio, 0);
val = page[0];
if (val != page[last_pos])
- return 0;
+ goto out;
for (pos = 1; pos < last_pos; pos++) {
if (val != page[pos])
- return 0;
+ goto out;
}
*value = val;
-
- return 1;
+ ret = true;
+out:
+ kunmap_local(page);
+ return ret;
}
static void zswap_fill_page(void *ptr, unsigned long value)
@@ -1495,14 +1406,18 @@ static void zswap_fill_page(void *ptr, unsigned long value)
memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
}
+/*********************************
+* main API
+**********************************/
bool zswap_store(struct folio *folio)
{
swp_entry_t swp = folio->swap;
pgoff_t offset = swp_offset(swp);
- struct zswap_tree *tree = swap_zswap_tree(swp);
- struct zswap_entry *entry, *dupentry;
+ struct xarray *tree = swap_zswap_tree(swp);
+ struct zswap_entry *entry, *old;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
+ unsigned long value;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1514,6 +1429,7 @@ bool zswap_store(struct folio *folio)
if (!zswap_enabled)
goto check_old;
+ /* Check cgroup limits */
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg);
@@ -1524,19 +1440,8 @@ bool zswap_store(struct folio *folio)
mem_cgroup_put(memcg);
}
- /* reclaim space if needed */
- if (zswap_is_full()) {
- zswap_pool_limit_hit++;
- zswap_pool_reached_full = true;
- goto shrink;
- }
-
- if (zswap_pool_reached_full) {
- if (!zswap_can_accept())
- goto shrink;
- else
- zswap_pool_reached_full = false;
- }
+ if (zswap_check_limits())
+ goto reject;
/* allocate entry */
entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
@@ -1545,24 +1450,13 @@ bool zswap_store(struct folio *folio)
goto reject;
}
- if (zswap_same_filled_pages_enabled) {
- unsigned long value;
- u8 *src;
-
- src = kmap_local_folio(folio, 0);
- if (zswap_is_page_same_filled(src, &value)) {
- kunmap_local(src);
- entry->length = 0;
- entry->value = value;
- atomic_inc(&zswap_same_filled_pages);
- goto insert_entry;
- }
- kunmap_local(src);
+ if (zswap_is_folio_same_filled(folio, &value)) {
+ entry->length = 0;
+ entry->value = value;
+ atomic_inc(&zswap_same_filled_pages);
+ goto store_entry;
}
- if (!zswap_non_same_filled_pages_enabled)
- goto freepage;
-
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool)
@@ -1580,62 +1474,77 @@ bool zswap_store(struct folio *folio)
if (!zswap_compress(folio, entry))
goto put_pool;
-insert_entry:
+store_entry:
entry->swpentry = swp;
entry->objcg = objcg;
+
+ old = xa_store(tree, offset, entry, GFP_KERNEL);
+ if (xa_is_err(old)) {
+ int err = xa_err(old);
+
+ WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
+ zswap_reject_alloc_fail++;
+ goto store_failed;
+ }
+
+ /*
+ * We may have had an existing entry that became stale when
+ * the folio was redirtied and now the new version is being
+ * swapped out. Get rid of the old.
+ */
+ if (old)
+ zswap_entry_free(old);
+
if (objcg) {
obj_cgroup_charge_zswap(objcg, entry->length);
- /* Account before objcg ref is moved to tree */
count_objcg_event(objcg, ZSWPOUT);
}
- /* map */
- spin_lock(&tree->lock);
/*
- * The folio may have been dirtied again, invalidate the
- * possibly stale entry before inserting the new entry.
+ * We finish initializing the entry while it's already in xarray.
+ * This is safe because:
+ *
+ * 1. Concurrent stores and invalidations are excluded by folio lock.
+ *
+ * 2. Writeback is excluded by the entry not being on the LRU yet.
+ * The publishing order matters to prevent writeback from seeing
+ * an incoherent entry.
*/
- if (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) {
- zswap_invalidate_entry(tree, dupentry);
- WARN_ON(zswap_rb_insert(&tree->rbroot, entry, &dupentry));
- }
if (entry->length) {
INIT_LIST_HEAD(&entry->lru);
zswap_lru_add(&zswap_list_lru, entry);
- atomic_inc(&zswap_nr_stored);
}
- spin_unlock(&tree->lock);
/* update stats */
atomic_inc(&zswap_stored_pages);
- zswap_update_total_size();
count_vm_event(ZSWPOUT);
return true;
+store_failed:
+ if (!entry->length)
+ atomic_dec(&zswap_same_filled_pages);
+ else {
+ zpool_free(zswap_find_zpool(entry), entry->handle);
put_pool:
- zswap_pool_put(entry->pool);
+ zswap_pool_put(entry->pool);
+ }
freepage:
zswap_entry_cache_free(entry);
reject:
- if (objcg)
- obj_cgroup_put(objcg);
+ obj_cgroup_put(objcg);
+ if (zswap_pool_reached_full)
+ queue_work(shrink_wq, &zswap_shrink_work);
check_old:
/*
* If the zswap store fails or zswap is disabled, we must invalidate the
* possibly stale entry which was previously stored at this offset.
* Otherwise, writeback could overwrite the new data in the swapfile.
*/
- spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
+ entry = xa_erase(tree, offset);
if (entry)
- zswap_invalidate_entry(tree, entry);
- spin_unlock(&tree->lock);
+ zswap_entry_free(entry);
return false;
-
-shrink:
- queue_work(shrink_wq, &zswap_shrink_work);
- goto reject;
}
bool zswap_load(struct folio *folio)
@@ -1644,18 +1553,12 @@ bool zswap_load(struct folio *folio)
pgoff_t offset = swp_offset(swp);
struct page *page = &folio->page;
bool swapcache = folio_test_swapcache(folio);
- struct zswap_tree *tree = swap_zswap_tree(swp);
+ struct xarray *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
u8 *dst;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
- spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
- if (!entry) {
- spin_unlock(&tree->lock);
- return false;
- }
/*
* When reading into the swapcache, invalidate our entry. The
* swapcache can be the authoritative owner of the page and
@@ -1669,8 +1572,12 @@ bool zswap_load(struct folio *folio)
* the fault fails. We remain the primary owner of the entry.)
*/
if (swapcache)
- zswap_rb_erase(&tree->rbroot, entry);
- spin_unlock(&tree->lock);
+ entry = xa_erase(tree, offset);
+ else
+ entry = xa_load(tree, offset);
+
+ if (!entry)
+ return false;
if (entry->length)
zswap_decompress(entry, page);
@@ -1695,19 +1602,17 @@ bool zswap_load(struct folio *folio)
void zswap_invalidate(swp_entry_t swp)
{
pgoff_t offset = swp_offset(swp);
- struct zswap_tree *tree = swap_zswap_tree(swp);
+ struct xarray *tree = swap_zswap_tree(swp);
struct zswap_entry *entry;
- spin_lock(&tree->lock);
- entry = zswap_rb_search(&tree->rbroot, offset);
+ entry = xa_erase(tree, offset);
if (entry)
- zswap_invalidate_entry(tree, entry);
- spin_unlock(&tree->lock);
+ zswap_entry_free(entry);
}
int zswap_swapon(int type, unsigned long nr_pages)
{
- struct zswap_tree *trees, *tree;
+ struct xarray *trees, *tree;
unsigned int nr, i;
nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
@@ -1717,11 +1622,8 @@ int zswap_swapon(int type, unsigned long nr_pages)
return -ENOMEM;
}
- for (i = 0; i < nr; i++) {
- tree = trees + i;
- tree->rbroot = RB_ROOT;
- spin_lock_init(&tree->lock);
- }
+ for (i = 0; i < nr; i++)
+ xa_init(trees + i);
nr_zswap_trees[type] = nr;
zswap_trees[type] = trees;
@@ -1730,7 +1632,7 @@ int zswap_swapon(int type, unsigned long nr_pages)
void zswap_swapoff(int type)
{
- struct zswap_tree *trees = zswap_trees[type];
+ struct xarray *trees = zswap_trees[type];
unsigned int i;
if (!trees)
@@ -1738,7 +1640,7 @@ void zswap_swapoff(int type)
/* try_to_unuse() invalidated all the entries already */
for (i = 0; i < nr_zswap_trees[type]; i++)
- WARN_ON_ONCE(!RB_EMPTY_ROOT(&trees[i].rbroot));
+ WARN_ON_ONCE(!xa_empty(trees + i));
kvfree(trees);
nr_zswap_trees[type] = 0;
@@ -1753,6 +1655,13 @@ void zswap_swapoff(int type)
static struct dentry *zswap_debugfs_root;
+/*
+ * debugfs read callback reporting the zswap pool size in bytes.
+ *
+ * @data: unused
+ * @val:  receives zswap_total_pages() scaled by PAGE_SIZE
+ *
+ * The page count is widened to u64 before scaling so the byte total is not
+ * truncated where unsigned long is narrower than 64 bits.
+ *
+ * Always returns 0.
+ */
+static int debugfs_get_total_size(void *data, u64 *val)
+{
+	*val = (u64)zswap_total_pages() * PAGE_SIZE;
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n");
+
static int zswap_debugfs_init(void)
{
if (!debugfs_initialized())
@@ -1774,8 +1683,8 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, &zswap_reject_compress_poor);
debugfs_create_u64("written_back_pages", 0444,
zswap_debugfs_root, &zswap_written_back_pages);
- debugfs_create_u64("pool_total_size", 0444,
- zswap_debugfs_root, &zswap_pool_total_size);
+ debugfs_create_file("pool_total_size", 0444,
+ zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_atomic_t("stored_pages", 0444,
zswap_debugfs_root, &zswap_stored_pages);
debugfs_create_atomic_t("same_filled_pages", 0444,