aboutsummaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c157
-rw-r--r--mm/cma.c4
-rw-r--r--mm/filemap.c237
-rw-r--r--mm/hugetlb.c17
-rw-r--r--mm/khugepaged.c3
-rw-r--r--mm/memblock.c57
-rw-r--r--mm/memcontrol.c13
-rw-r--r--mm/memory.c2
-rw-r--r--mm/migrate.c13
-rw-r--r--mm/mmap.c16
-rw-r--r--mm/mremap.c23
-rw-r--r--mm/page_alloc.c2
-rw-r--r--mm/page_io.c17
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slab_common.c35
-rw-r--r--mm/swapfile.c2
16 files changed, 316 insertions, 284 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d382272bcc31..8e8b00627bb2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -281,7 +281,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
#define INIT_BW (100 << (20 - PAGE_SHIFT))
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
- int blkcg_id, gfp_t gfp)
+ gfp_t gfp)
{
int i, err;
@@ -308,15 +308,9 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
wb->dirty_sleep = jiffies;
- wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested) {
- err = -ENOMEM;
- goto out_put_bdi;
- }
-
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
- goto out_put_cong;
+ goto out_put_bdi;
for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
err = percpu_counter_init(&wb->stat[i], 0, gfp);
@@ -330,8 +324,6 @@ out_destroy_stat:
while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
-out_put_cong:
- wb_congested_put(wb->congested);
out_put_bdi:
if (wb != &bdi->wb)
bdi_put(bdi);
@@ -374,7 +366,6 @@ static void wb_exit(struct bdi_writeback *wb)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
- wb_congested_put(wb->congested);
if (wb != &wb->bdi->wb)
bdi_put(wb->bdi);
}
@@ -384,99 +375,12 @@ static void wb_exit(struct bdi_writeback *wb)
#include <linux/memcontrol.h>
/*
- * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
- * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
- * protected.
+ * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
+ * bdi->cgwb_tree is also RCU protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;
-/**
- * wb_congested_get_create - get or create a wb_congested
- * @bdi: associated bdi
- * @blkcg_id: ID of the associated blkcg
- * @gfp: allocation mask
- *
- * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one.
- * The returned wb_congested has its reference count incremented. Returns
- * NULL on failure.
- */
-struct bdi_writeback_congested *
-wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
-{
- struct bdi_writeback_congested *new_congested = NULL, *congested;
- struct rb_node **node, *parent;
- unsigned long flags;
-retry:
- spin_lock_irqsave(&cgwb_lock, flags);
-
- node = &bdi->cgwb_congested_tree.rb_node;
- parent = NULL;
-
- while (*node != NULL) {
- parent = *node;
- congested = rb_entry(parent, struct bdi_writeback_congested,
- rb_node);
- if (congested->blkcg_id < blkcg_id)
- node = &parent->rb_left;
- else if (congested->blkcg_id > blkcg_id)
- node = &parent->rb_right;
- else
- goto found;
- }
-
- if (new_congested) {
- /* !found and storage for new one already allocated, insert */
- congested = new_congested;
- rb_link_node(&congested->rb_node, parent, node);
- rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
- spin_unlock_irqrestore(&cgwb_lock, flags);
- return congested;
- }
-
- spin_unlock_irqrestore(&cgwb_lock, flags);
-
- /* allocate storage for new one and retry */
- new_congested = kzalloc(sizeof(*new_congested), gfp);
- if (!new_congested)
- return NULL;
-
- refcount_set(&new_congested->refcnt, 1);
- new_congested->__bdi = bdi;
- new_congested->blkcg_id = blkcg_id;
- goto retry;
-
-found:
- refcount_inc(&congested->refcnt);
- spin_unlock_irqrestore(&cgwb_lock, flags);
- kfree(new_congested);
- return congested;
-}
-
-/**
- * wb_congested_put - put a wb_congested
- * @congested: wb_congested to put
- *
- * Put @congested and destroy it if the refcnt reaches zero.
- */
-void wb_congested_put(struct bdi_writeback_congested *congested)
-{
- unsigned long flags;
-
- if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags))
- return;
-
- /* bdi might already have been destroyed leaving @congested unlinked */
- if (congested->__bdi) {
- rb_erase(&congested->rb_node,
- &congested->__bdi->cgwb_congested_tree);
- congested->__bdi = NULL;
- }
-
- spin_unlock_irqrestore(&cgwb_lock, flags);
- kfree(congested);
-}
-
static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
@@ -558,7 +462,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
goto out_put;
}
- ret = wb_init(wb, bdi, blkcg_css->id, gfp);
+ ret = wb_init(wb, bdi, gfp);
if (ret)
goto err_free;
@@ -696,11 +600,10 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
int ret;
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
- bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
init_rwsem(&bdi->wb_switch_rwsem);
- ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+ ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
if (!ret) {
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
@@ -769,21 +672,6 @@ void wb_blkcg_offline(struct blkcg *blkcg)
spin_unlock_irq(&cgwb_lock);
}
-static void cgwb_bdi_exit(struct backing_dev_info *bdi)
-{
- struct rb_node *rbn;
-
- spin_lock_irq(&cgwb_lock);
- while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
- struct bdi_writeback_congested *congested =
- rb_entry(rbn, struct bdi_writeback_congested, rb_node);
-
- rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->__bdi = NULL; /* mark @congested unlinked */
- }
- spin_unlock_irq(&cgwb_lock);
-}
-
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
spin_lock_irq(&cgwb_lock);
@@ -810,29 +698,11 @@ subsys_initcall(cgwb_init);
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
- int err;
-
- bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL);
- if (!bdi->wb_congested)
- return -ENOMEM;
-
- refcount_set(&bdi->wb_congested->refcnt, 1);
-
- err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
- if (err) {
- wb_congested_put(bdi->wb_congested);
- return err;
- }
- return 0;
+ return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}
static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
-static void cgwb_bdi_exit(struct backing_dev_info *bdi)
-{
- wb_congested_put(bdi->wb_congested);
-}
-
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
@@ -1023,7 +893,6 @@ static void release_bdi(struct kref *ref)
bdi_unregister(bdi);
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
- cgwb_bdi_exit(bdi);
kfree(bdi);
}
@@ -1047,29 +916,29 @@ static wait_queue_head_t congestion_wqh[2] = {
};
static atomic_t nr_wb_congested[2];
-void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
wait_queue_head_t *wqh = &congestion_wqh[sync];
enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
- if (test_and_clear_bit(bit, &congested->state))
+ if (test_and_clear_bit(bit, &bdi->wb.congested))
atomic_dec(&nr_wb_congested[sync]);
smp_mb__after_atomic();
if (waitqueue_active(wqh))
wake_up(wqh);
}
-EXPORT_SYMBOL(clear_wb_congested);
+EXPORT_SYMBOL(clear_bdi_congested);
-void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
+void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
- if (!test_and_set_bit(bit, &congested->state))
+ if (!test_and_set_bit(bit, &bdi->wb.congested))
atomic_inc(&nr_wb_congested[sync]);
}
-EXPORT_SYMBOL(set_wb_congested);
+EXPORT_SYMBOL(set_bdi_congested);
/**
* congestion_wait - wait for a backing_dev to become uncongested
diff --git a/mm/cma.c b/mm/cma.c
index 0463ad2ce06b..26ecff818881 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -339,13 +339,13 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
*/
if (base < highmem_start && limit > highmem_start) {
addr = memblock_alloc_range_nid(size, alignment,
- highmem_start, limit, nid, false);
+ highmem_start, limit, nid, true);
limit = highmem_start;
}
if (!addr) {
addr = memblock_alloc_range_nid(size, alignment, base,
- limit, nid, false);
+ limit, nid, true);
if (!addr) {
ret = -ENOMEM;
goto err;
diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..9f131f1cfde3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -987,44 +987,46 @@ void __init pagecache_init(void)
page_writeback_init();
}
-/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
-struct wait_page_key {
- struct page *page;
- int bit_nr;
- int page_match;
-};
-
-struct wait_page_queue {
- struct page *page;
- int bit_nr;
- wait_queue_entry_t wait;
-};
-
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
+ int ret;
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
= container_of(wait, struct wait_page_queue, wait);
- if (wait_page->page != key->page)
- return 0;
- key->page_match = 1;
-
- if (wait_page->bit_nr != key->bit_nr)
+ if (!wake_page_match(wait_page, key))
return 0;
/*
- * Stop walking if it's locked.
- * Is this safe if put_and_wait_on_page_locked() is in use?
- * Yes: the waker must hold a reference to this page, and if PG_locked
- * has now already been set by another task, that task must also hold
- * a reference to the *same usage* of this page; so there is no need
- * to walk on to wake even the put_and_wait_on_page_locked() callers.
+ * If it's an exclusive wait, we get the bit for it, and
+ * stop walking if we can't.
+ *
+ * If it's a non-exclusive wait, then the fact that this
+ * wake function was called means that the bit already
+ * was cleared, and we don't care if somebody then
+ * re-took it.
*/
- if (test_bit(key->bit_nr, &key->page->flags))
- return -1;
+ ret = 0;
+ if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ ret = 1;
+ }
+ wait->flags |= WQ_FLAG_WOKEN;
+
+ wake_up_state(wait->private, mode);
- return autoremove_wake_function(wait, mode, sync, key);
+ /*
+ * Ok, we have successfully done what we're waiting for,
+ * and we can unconditionally remove the wait entry.
+ *
+ * Note that this has to be the absolute last thing we do,
+ * since after list_del_init(&wait->entry) the wait entry
+ * might be de-allocated and the process might even have
+ * exited.
+ */
+ list_del_init_careful(&wait->entry);
+ return ret;
}
static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1103,16 +1105,31 @@ enum behavior {
*/
};
+/*
+ * Attempt to check (or get) the page bit, and mark the
+ * waiter woken if successful.
+ */
+static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+ struct wait_queue_entry *wait)
+{
+ if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_and_set_bit(bit_nr, &page->flags))
+ return false;
+ } else if (test_bit(bit_nr, &page->flags))
+ return false;
+
+ wait->flags |= WQ_FLAG_WOKEN;
+ return true;
+}
+
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, enum behavior behavior)
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
- bool bit_is_set;
bool thrashing = false;
bool delayacct = false;
unsigned long pflags;
- int ret = 0;
if (bit_nr == PG_locked &&
!PageUptodate(page) && PageWorkingset(page)) {
@@ -1130,48 +1147,47 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
wait_page.page = page;
wait_page.bit_nr = bit_nr;
- for (;;) {
- spin_lock_irq(&q->lock);
+ /*
+ * Do one last check whether we can get the
+ * page bit synchronously.
+ *
+ * Do the SetPageWaiters() marking before that
+ * to let any waker we _just_ missed know they
+ * need to wake us up (otherwise they'll never
+ * even go to the slow case that looks at the
+ * page queue), and add ourselves to the wait
+ * queue if we need to sleep.
+ *
+ * This part needs to be done under the queue
+ * lock to avoid races.
+ */
+ spin_lock_irq(&q->lock);
+ SetPageWaiters(page);
+ if (!trylock_page_bit_common(page, bit_nr, wait))
+ __add_wait_queue_entry_tail(q, wait);
+ spin_unlock_irq(&q->lock);
- if (likely(list_empty(&wait->entry))) {
- __add_wait_queue_entry_tail(q, wait);
- SetPageWaiters(page);
- }
+ /*
+ * From now on, all the logic will be based on
+ * the WQ_FLAG_WOKEN flag, and the and the page
+ * bit testing (and setting) will be - or has
+ * already been - done by the wake function.
+ *
+ * We can drop our reference to the page.
+ */
+ if (behavior == DROP)
+ put_page(page);
+ for (;;) {
set_current_state(state);
- spin_unlock_irq(&q->lock);
-
- bit_is_set = test_bit(bit_nr, &page->flags);
- if (behavior == DROP)
- put_page(page);
-
- if (likely(bit_is_set))
- io_schedule();
-
- if (behavior == EXCLUSIVE) {
- if (!test_and_set_bit_lock(bit_nr, &page->flags))
- break;
- } else if (behavior == SHARED) {
- if (!test_bit(bit_nr, &page->flags))
- break;
- }
-
- if (signal_pending_state(state, current)) {
- ret = -EINTR;
+ if (signal_pending_state(state, current))
break;
- }
- if (behavior == DROP) {
- /*
- * We can no longer safely access page->flags:
- * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
- * there is a risk of waiting forever on a page reused
- * for something that keeps it locked indefinitely.
- * But best check for -EINTR above before breaking.
- */
+ if (wait->flags & WQ_FLAG_WOKEN)
break;
- }
+
+ io_schedule();
}
finish_wait(q, wait);
@@ -1190,7 +1206,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
* bother with signals either.
*/
- return ret;
+ return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
void wait_on_page_bit(struct page *page, int bit_nr)
@@ -1207,6 +1223,44 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
+static int __wait_on_page_locked_async(struct page *page,
+ struct wait_page_queue *wait, bool set)
+{
+ struct wait_queue_head *q = page_waitqueue(page);
+ int ret = 0;
+
+ wait->page = page;
+ wait->bit_nr = PG_locked;
+
+ spin_lock_irq(&q->lock);
+ __add_wait_queue_entry_tail(q, &wait->wait);
+ SetPageWaiters(page);
+ if (set)
+ ret = !trylock_page(page);
+ else
+ ret = PageLocked(page);
+ /*
+ * If we were succesful now, we know we're still on the
+ * waitqueue as we're still under the lock. This means it's
+ * safe to remove and return success, we know the callback
+ * isn't going to trigger.
+ */
+ if (!ret)
+ __remove_wait_queue(q, &wait->wait);
+ else
+ ret = -EIOCBQUEUED;
+ spin_unlock_irq(&q->lock);
+ return ret;
+}
+
+static int wait_on_page_locked_async(struct page *page,
+ struct wait_page_queue *wait)
+{
+ if (!PageLocked(page))
+ return 0;
+ return __wait_on_page_locked_async(compound_head(page), wait, false);
+}
+
/**
* put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
* @page: The page to wait for.
@@ -1369,6 +1423,11 @@ int __lock_page_killable(struct page *__page)
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
+int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+{
+ return __wait_on_page_locked_async(page, wait, true);
+}
+
/*
* Return values:
* 1 - page is locked; mmap_lock is still held.
@@ -2028,7 +2087,7 @@ find_page:
page = find_get_page(mapping, index);
if (!page) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOIO)
goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
@@ -2038,22 +2097,34 @@ find_page:
goto no_cached_page;
}
if (PageReadahead(page)) {
+ if (iocb->ki_flags & IOCB_NOIO) {
+ put_page(page);
+ goto out;
+ }
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
- if (iocb->ki_flags & IOCB_NOWAIT) {
- put_page(page);
- goto would_block;
- }
-
/*
* See comment in do_read_cache_page on why
* wait_on_page_locked is used to avoid unnecessarily
* serialisations and why it's safe.
*/
- error = wait_on_page_locked_killable(page);
+ if (iocb->ki_flags & IOCB_WAITQ) {
+ if (written) {
+ put_page(page);
+ goto out;
+ }
+ error = wait_on_page_locked_async(page,
+ iocb->ki_waitq);
+ } else {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ put_page(page);
+ goto would_block;
+ }
+ error = wait_on_page_locked_killable(page);
+ }
if (unlikely(error))
goto readpage_error;
if (PageUptodate(page))
@@ -2141,7 +2212,10 @@ page_ok:
page_not_up_to_date:
/* Get exclusive access to the page ... */
- error = lock_page_killable(page);
+ if (iocb->ki_flags & IOCB_WAITQ)
+ error = lock_page_async(page, iocb->ki_waitq);
+ else
+ error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
@@ -2160,6 +2234,11 @@ page_not_up_to_date_locked:
}
readpage:
+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+ unlock_page(page);
+ put_page(page);
+ goto would_block;
+ }
/*
* A previous I/O error may have been due to temporary
* failures, eg. multipath errors.
@@ -2249,9 +2328,19 @@ EXPORT_SYMBOL_GPL(generic_file_buffered_read);
*
* This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
+ * be returned when no data can be read without waiting for I/O requests
+ * to complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
+ * requests shall be made for the read or for readahead. When no data
+ * can be read, -EAGAIN shall be returned. When readahead would be
+ * triggered, a partial, possibly empty read shall be returned.
+ *
* Return:
* * number of bytes copied, even for partial reads
- * * negative error code if nothing was read
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
*/
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 57ece74e3aae..590111ea6975 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -45,7 +45,10 @@ int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
+#endif
+static unsigned long hugetlb_cma_size __initdata;
/*
* Minimum page order among possible hugepage sizes, set to a proper value
@@ -1235,9 +1238,10 @@ static void free_gigantic_page(struct page *page, unsigned int order)
* If the page isn't allocated using the cma allocator,
* cma_release() returns false.
*/
- if (IS_ENABLED(CONFIG_CMA) &&
- cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+#ifdef CONFIG_CMA
+ if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
return;
+#endif
free_contig_range(page_to_pfn(page), 1 << order);
}
@@ -1248,7 +1252,8 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
{
unsigned long nr_pages = 1UL << huge_page_order(h);
- if (IS_ENABLED(CONFIG_CMA)) {
+#ifdef CONFIG_CMA
+ {
struct page *page;
int node;
@@ -1262,6 +1267,7 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
return page;
}
}
+#endif
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
@@ -1593,7 +1599,7 @@ static struct address_space *_get_hugetlb_page_mapping(struct page *hpage)
/* Use first found vma */
pgoff_start = page_to_pgoff(hpage);
- pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1;
+ pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
@@ -2571,7 +2577,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
- if (IS_ENABLED(CONFIG_CMA) && hugetlb_cma[0]) {
+ if (hugetlb_cma_size) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
break;
}
@@ -5654,7 +5660,6 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
}
#ifdef CONFIG_CMA
-static unsigned long hugetlb_cma_size __initdata;
static bool cma_reserve_called __initdata;
static int __init cmdline_parse_hugetlb_cma(char *p)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b043c40a21d4..700f5160f3e4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -958,6 +958,9 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_ADDRESS_RANGE;
if (!hugepage_vma_check(vma, vma->vm_flags))
return SCAN_VMA_CHECK;
+ /* Anon VMA expected */
+ if (!vma->anon_vma || vma->vm_ops)
+ return SCAN_VMA_CHECK;
return 0;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 39aceafc57f6..45f198750be9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -44,19 +44,20 @@
* in the system, for instance when the memory is restricted with
* ``mem=`` command line parameter
* * ``reserved`` - describes the regions that were allocated
- * * ``physmap`` - describes the actual physical memory regardless of
- * the possible restrictions; the ``physmap`` type is only available
- * on some architectures.
+ * * ``physmem`` - describes the actual physical memory available during
+ * boot regardless of the possible restrictions and memory hot(un)plug;
+ * the ``physmem`` type is only available on some architectures.
*
* Each region is represented by :c:type:`struct memblock_region` that
* defines the region extents, its attributes and NUMA node id on NUMA
* systems. Every memory type is described by the :c:type:`struct
* memblock_type` which contains an array of memory regions along with
- * the allocator metadata. The memory types are nicely wrapped with
- * :c:type:`struct memblock`. This structure is statically initialzed
- * at build time. The region arrays for the "memory" and "reserved"
- * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the
- * "physmap" type to %INIT_PHYSMEM_REGIONS.
+ * the allocator metadata. The "memory" and "reserved" types are nicely
+ * wrapped with :c:type:`struct memblock`. This structure is statically
+ * initialized at build time. The region arrays are initially sized to
+ * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS
+ * for "reserved". The region array for "physmem" is initially sized to
+ * %INIT_PHYSMEM_REGIONS.
* The memblock_allow_resize() enables automatic resizing of the region
* arrays during addition of new regions. This feature should be used
* with care so that memory allocated for the region array will not
@@ -87,8 +88,8 @@
* function frees all the memory to the buddy page allocator.
*
* Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
- * memblock data structures will be discarded after the system
- * initialization completes.
+ * memblock data structures (except "physmem") will be discarded after the
+ * system initialization completes.
*/
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -104,7 +105,7 @@ unsigned long long max_possible_pfn;
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
#endif
struct memblock memblock __initdata_memblock = {
@@ -118,17 +119,19 @@ struct memblock memblock __initdata_memblock = {
.reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS,
.reserved.name = "reserved",
-#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- .physmem.regions = memblock_physmem_init_regions,
- .physmem.cnt = 1, /* empty dummy entry */
- .physmem.max = INIT_PHYSMEM_REGIONS,
- .physmem.name = "physmem",
-#endif
-
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+struct memblock_type physmem = {
+ .regions = memblock_physmem_init_regions,
+ .cnt = 1, /* empty dummy entry */
+ .max = INIT_PHYSMEM_REGIONS,
+ .name = "physmem",
+};
+#endif
+
int memblock_debug __initdata_memblock;
static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
@@ -838,7 +841,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
- return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0);
}
#endif
@@ -1019,12 +1022,10 @@ static bool should_skip_region(struct memblock_region *m, int nid, int flags)
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid,
- enum memblock_flags flags,
- struct memblock_type *type_a,
- struct memblock_type *type_b,
- phys_addr_t *out_start,
- phys_addr_t *out_end, int *out_nid)
+void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
+ struct memblock_type *type_a,
+ struct memblock_type *type_b, phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
{
int idx_a = *idx & 0xffffffff;
int idx_b = *idx >> 32;
@@ -1924,7 +1925,7 @@ void __init_memblock __memblock_dump_all(void)
memblock_dump(&memblock.memory);
memblock_dump(&memblock.reserved);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- memblock_dump(&memblock.physmem);
+ memblock_dump(&physmem);
#endif
}
@@ -2064,8 +2065,8 @@ static int __init memblock_init_debugfs(void)
debugfs_create_file("reserved", 0444, root,
&memblock.reserved, &memblock_debug_fops);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- debugfs_create_file("physmem", 0444, root,
- &memblock.physmem, &memblock_debug_fops);
+ debugfs_create_file("physmem", 0444, root, &physmem,
+ &memblock_debug_fops);
#endif
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 19622328e4b5..13f559af1ab6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5669,7 +5669,6 @@ static void __mem_cgroup_clear_mc(void)
if (!mem_cgroup_is_root(mc.to))
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
- mem_cgroup_id_get_many(mc.to, mc.moved_swap);
css_put_many(&mc.to->css, mc.moved_swap);
mc.moved_swap = 0;
@@ -5860,7 +5859,8 @@ put: /* get_mctgt_type() gets the page */
ent = target.ent;
if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
mc.precharge--;
- /* we fixup refcnts and charges later. */
+ mem_cgroup_id_get_many(mc.to, 1);
+ /* we fixup other refcnts and charges later. */
mc.moved_swap++;
}
break;
@@ -7186,6 +7186,13 @@ static struct cftype memsw_files[] = {
{ }, /* terminate */
};
+/*
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
+ * instead of a core_initcall(), this could mean cgroup_memory_noswap still
+ * remains set to false even when memcg is disabled via "cgroup_disable=memory"
+ * boot parameter. This may result in premature OOPS inside
+ * mem_cgroup_get_nr_swap_pages() function in corner cases.
+ */
static int __init mem_cgroup_swap_init(void)
{
/* No memory control -> no swap control */
@@ -7200,6 +7207,6 @@ static int __init mem_cgroup_swap_init(void)
return 0;
}
-subsys_initcall(mem_cgroup_swap_init);
+core_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_MEMCG_SWAP */
diff --git a/mm/memory.c b/mm/memory.c
index 87ec87cdc1ff..3ecad55103ad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1601,7 +1601,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
unsigned long idx = 0, pgcount = *num;
- int err;
+ int err = -EINVAL;
for (; idx < pgcount; ++idx) {
err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
diff --git a/mm/migrate.c b/mm/migrate.c
index f37729673558..40cd7016ae6f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1161,21 +1161,10 @@ out:
}
/*
- * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
- * around it.
- */
-#if defined(CONFIG_ARM) && \
- defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
-#define ICE_noinline noinline
-#else
-#define ICE_noinline
-#endif
-
-/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static ICE_noinline int unmap_and_move(new_page_t get_new_page,
+static int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
unsigned long private, struct page *page,
int force, enum migrate_mode mode,
diff --git a/mm/mmap.c b/mm/mmap.c
index 59a4682ebf3f..8c7ca737a19b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2620,7 +2620,7 @@ static void unmap_region(struct mm_struct *mm,
* Create a list of vma's touched by the unmap, removing them from the mm's
* vma list as we go..
*/
-static void
+static bool
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
@@ -2645,6 +2645,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
/* Kill the cache */
vmacache_invalidate(mm);
+
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (vma && (vma->vm_flags & VM_GROWSDOWN))
+ return false;
+ if (prev && (prev->vm_flags & VM_GROWSUP))
+ return false;
+ return true;
}
/*
@@ -2825,7 +2836,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
}
/* Detach vmas from rbtree */
- detach_vmas_to_be_unmapped(mm, vma, prev, end);
+ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
+ downgrade = false;
if (downgrade)
mmap_write_downgrade(mm);
diff --git a/mm/mremap.c b/mm/mremap.c
index 5dd572d57ca9..6b153dc05fe4 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -206,9 +206,28 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* The destination pmd shouldn't be established, free_pgtables()
- * should have release it.
+ * should have released it.
+ *
+ * However, there's a case during execve() where we use mremap
+ * to move the initial stack, and in that case the target area
+ * may overlap the source area (always moving down).
+ *
+ * If everything is PMD-aligned, that works fine, as moving
+ * each pmd down will clear the source pmd. But if we first
+ * have a few 4kB-only pages that get moved down, and then
+ * hit the "now the rest is PMD-aligned, let's do everything
+ * one pmd at a time", we will still have the old (now empty
+ * of any 4kB pages, but still there) PMD in the page table
+ * tree.
+ *
+ * Warn on it once - because we really should try to figure
+ * out how to do this better - but then say "I won't move
+ * this pmd".
+ *
+ * One alternative might be to just unmap the target pmd at
+ * this point, and verify that it really is empty. We'll see.
*/
- if (WARN_ON(!pmd_none(*new_pmd)))
+ if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48eb0f1410d4..e028b87ce294 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7832,7 +7832,7 @@ void setup_per_zone_wmarks(void)
* Initialise min_free_kbytes.
*
* For small machines we want it small (128k min). For large machines
- * we want it large (64MB max). But it is not linear, because network
+ * we want it large (256MB max). But it is not linear, because network
* bandwidth does not increase linearly with machine size. We use
*
* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
diff --git a/mm/page_io.c b/mm/page_io.c
index e8726f3e3820..ccda76790088 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -277,6 +277,23 @@ static inline void count_swpout_vm_event(struct page *page)
count_vm_events(PSWPOUT, hpage_nr_pages(page));
}
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
+{
+ struct cgroup_subsys_state *css;
+
+ if (!page->mem_cgroup)
+ return;
+
+ rcu_read_lock();
+ css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+ bio_associate_blkg_from_css(bio, css);
+ rcu_read_unlock();
+}
+#else
+#define bio_associate_blkg_from_page(bio, page) do { } while (0)
+#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
diff --git a/mm/shmem.c b/mm/shmem.c
index a0dbe62f8042..b2abca3f7f33 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3178,7 +3178,7 @@ static int shmem_initxattrs(struct inode *inode,
new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
GFP_KERNEL);
if (!new_xattr->name) {
- kfree(new_xattr);
+ kvfree(new_xattr);
return -ENOMEM;
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 37d48a56431d..fe8b68482670 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -326,6 +326,14 @@ int slab_unmergeable(struct kmem_cache *s)
if (s->refcount < 0)
return 1;
+#ifdef CONFIG_MEMCG_KMEM
+ /*
+ * Skip the dying kmem_cache.
+ */
+ if (s->memcg_params.dying)
+ return 1;
+#endif
+
return 0;
}
@@ -886,12 +894,15 @@ static int shutdown_memcg_caches(struct kmem_cache *s)
return 0;
}
-static void flush_memcg_workqueue(struct kmem_cache *s)
+static void memcg_set_kmem_cache_dying(struct kmem_cache *s)
{
spin_lock_irq(&memcg_kmem_wq_lock);
s->memcg_params.dying = true;
spin_unlock_irq(&memcg_kmem_wq_lock);
+}
+static void flush_memcg_workqueue(struct kmem_cache *s)
+{
/*
* SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
* sure all registered rcu callbacks have been invoked.
@@ -923,10 +934,6 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s)
{
return 0;
}
-
-static inline void flush_memcg_workqueue(struct kmem_cache *s)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
void slab_kmem_cache_release(struct kmem_cache *s)
@@ -944,8 +951,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (unlikely(!s))
return;
- flush_memcg_workqueue(s);
-
get_online_cpus();
get_online_mems();
@@ -955,6 +960,22 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
+#ifdef CONFIG_MEMCG_KMEM
+ memcg_set_kmem_cache_dying(s);
+
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+
+ flush_memcg_workqueue(s);
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+#endif
+
err = shutdown_memcg_caches(s);
if (!err)
err = shutdown_cache(s);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 987276c557d1..6c26916e95fd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2929,7 +2929,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
* write only restriction. Hence zoned block devices are not
* suitable for swapping. Disallow them here.
*/
- if (blk_queue_is_zoned(p->bdev->bd_queue))
+ if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {