diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/backing-dev.c | 388 | ||||
-rw-r--r-- | mm/bootmem.c | 10 | ||||
-rw-r--r-- | mm/hugetlb.c | 2 | ||||
-rw-r--r-- | mm/kmemleak.c | 546 | ||||
-rw-r--r-- | mm/memcontrol.c | 25 | ||||
-rw-r--r-- | mm/memory.c | 11 | ||||
-rw-r--r-- | mm/mempolicy.c | 84 | ||||
-rw-r--r-- | mm/mempool.c | 4 | ||||
-rw-r--r-- | mm/mmap.c | 3 | ||||
-rw-r--r-- | mm/nommu.c | 10 | ||||
-rw-r--r-- | mm/oom_kill.c | 64 | ||||
-rw-r--r-- | mm/page-writeback.c | 184 | ||||
-rw-r--r-- | mm/page_alloc.c | 49 | ||||
-rw-r--r-- | mm/pdflush.c | 269 | ||||
-rw-r--r-- | mm/percpu.c | 50 | ||||
-rw-r--r-- | mm/rmap.c | 1 | ||||
-rw-r--r-- | mm/shmem.c | 6 | ||||
-rw-r--r-- | mm/shmem_acl.c | 11 | ||||
-rw-r--r-- | mm/slob.c | 5 | ||||
-rw-r--r-- | mm/slub.c | 19 | ||||
-rw-r--r-- | mm/swap_state.c | 1 | ||||
-rw-r--r-- | mm/swapfile.c | 4 | ||||
-rw-r--r-- | mm/vmscan.c | 19 |
25 files changed, 1043 insertions, 730 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c948d4ca8bde..fe5f674d7a7d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR For most ia64, ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. - Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. diff --git a/mm/Makefile b/mm/Makefile index 5e0bd6426693..147a7a7873c4 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -8,7 +8,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ vmalloc.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o pdflush.o \ + maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o $(mmu-y) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 493b468a5035..d3ca0dac1111 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,8 +1,11 @@ #include <linux/wait.h> #include <linux/backing-dev.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> @@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { + .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, @@ -22,6 +26,18 @@ struct backing_dev_info default_backing_dev_info = { EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; +DEFINE_SPINLOCK(bdi_lock); +LIST_HEAD(bdi_list); +LIST_HEAD(bdi_pending_list); + +static struct task_struct *sync_supers_tsk; +static struct timer_list sync_supers_timer; + +static int bdi_sync_supers(void *); +static void sync_supers_timer_fn(unsigned long); +static void arm_supers_timer(void); + +static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> @@ -37,9 +53,29 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; + struct bdi_writeback *wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; + struct inode *inode; + + /* + * inode lock is enough here, the bdi->wb_list is protected by + * RCU on the reader side + */ + nr_wb = nr_dirty = nr_io = nr_more_io = 0; + spin_lock(&inode_lock); + list_for_each_entry(wb, &bdi->wb_list, list) { + nr_wb++; + list_for_each_entry(inode, &wb->b_dirty, i_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_list) + nr_more_io++; + } + spin_unlock(&inode_lock); get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -49,12 +85,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiReclaimable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n", + "BackgroundThresh: %8lu kB\n" + "WriteBack threads:%8lu\n" + "b_dirty: %8lu\n" + "b_io: %8lu\n" + "b_more_io: %8lu\n" + "bdi_list: %8u\n" + "state: %8lx\n" + "wb_mask: %8lx\n" + "wb_list: %8u\n" + "wb_cnt: %8u\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), - K(dirty_thresh), - K(background_thresh)); + K(bdi_thresh), K(dirty_thresh), + K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, + !list_empty(&bdi->wb_list), bdi->wb_cnt); #undef K return 0; @@ -185,6 +231,13 @@ static int __init default_bdi_init(void) { int err; + sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); + BUG_ON(IS_ERR(sync_supers_tsk)); + + init_timer(&sync_supers_timer); + setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); + arm_supers_timer(); + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -193,6 +246,248 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); +} + +static void bdi_task_init(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct task_struct *tsk = current; + + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&wb->list, &bdi->wb_list); + spin_unlock(&bdi->wb_lock); + + tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(tsk, 0); +} + +static int bdi_start_fn(void *ptr) +{ + struct bdi_writeback *wb = ptr; + struct backing_dev_info *bdi = wb->bdi; + int ret; + + /* + * Add us to the active bdi_list + */ + spin_lock(&bdi_lock); + list_add(&bdi->bdi_list, &bdi_list); + spin_unlock(&bdi_lock); + + bdi_task_init(bdi, wb); + + /* + * Clear pending bit and wakeup anybody waiting to tear us down + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); + + ret = bdi_writeback_task(wb); + + /* + * Remove us from the list + */ + spin_lock(&bdi->wb_lock); + list_del_rcu(&wb->list); + spin_unlock(&bdi->wb_lock); + + /* + * Flush any work that raced with us exiting. No new work + * will be added, since this bdi isn't discoverable anymore. + */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); + + wb->task = NULL; + return ret; +} + +int bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + return wb_has_dirty_io(&bdi->wb); +} + +static void bdi_flush_io(struct backing_dev_info *bdi) +{ + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .range_cyclic = 1, + .nr_to_write = 1024, + }; + + writeback_inodes_wbc(&wbc); +} + +/* + * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * or we risk deadlocking on ->s_umount. The longer term solution would be + * to implement sync_supers_bdi() or similar and simply do it from the + * bdi writeback tasks individually. + */ +static int bdi_sync_supers(void *unused) +{ + set_user_nice(current, 0); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* + * Do this periodically, like kupdated() did before. + */ + sync_supers(); + } + + return 0; +} + +static void arm_supers_timer(void) +{ + unsigned long next; + + next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; + mod_timer(&sync_supers_timer, round_jiffies_up(next)); +} + +static void sync_supers_timer_fn(unsigned long unused) +{ + wake_up_process(sync_supers_tsk); + arm_supers_timer(); +} + +static int bdi_forker_task(void *ptr) +{ + struct bdi_writeback *me = ptr; + + bdi_task_init(me->bdi, me); + + for (;;) { + struct backing_dev_info *bdi, *tmp; + struct bdi_writeback *wb; + + /* + * Temporary measure, we want to make sure we don't see + * dirty data on the default backing_dev_info + */ + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) + wb_do_writeback(me, 0); + + spin_lock(&bdi_lock); + + /* + * Check if any existing bdi's have dirty data without + * a thread registered. If so, set that up. + */ + list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { + if (bdi->wb.task) + continue; + if (list_empty(&bdi->work_list) && + !bdi_has_dirty_io(bdi)) + continue; + + bdi_add_default_flusher_task(bdi); + } + + set_current_state(TASK_INTERRUPTIBLE); + + if (list_empty(&bdi_pending_list)) { + unsigned long wait; + + spin_unlock(&bdi_lock); + wait = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout(wait); + try_to_freeze(); + continue; + } + + __set_current_state(TASK_RUNNING); + + /* + * This is our real job - check for pending entries in + * bdi_pending_list, and create the tasks that got added + */ + bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, + bdi_list); + list_del_init(&bdi->bdi_list); + spin_unlock(&bdi_lock); + + wb = &bdi->wb; + wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", + dev_name(bdi->dev)); + /* + * If task creation fails, then readd the bdi to + * the pending list and force writeout of the bdi + * from this forker thread. That will free some memory + * and we can try again. + */ + if (IS_ERR(wb->task)) { + wb->task = NULL; + + /* + * Add this 'bdi' to the back, so we get + * a chance to flush other bdi's to free + * memory. + */ + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_pending_list); + spin_unlock(&bdi_lock); + + bdi_flush_io(bdi); + } + } + + return 0; +} + +/* + * Add the default flusher task that gets created for any bdi + * that has dirty data pending writeout + */ +void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) +{ + if (!bdi_cap_writeback_dirty(bdi)) + return; + + if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { + printk(KERN_ERR "bdi %p/%s is not registered!\n", + bdi, bdi->name); + return; + } + + /* + * Check with the helper whether to proceed adding a task. Will only + * abort if we two or more simultanous calls to + * bdi_add_default_flusher_task() occured, further additions will block + * waiting for previous additions to finish. + */ + if (!test_and_set_bit(BDI_pending, &bdi->state)) { + list_move_tail(&bdi->bdi_list, &bdi_pending_list); + + /* + * We are now on the pending list, wake up bdi_forker_task() + * to finish the job and add us back to the active bdi_list + */ + wake_up_process(default_backing_dev_info.wb.task); + } +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { @@ -211,9 +506,35 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, goto exit; } + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_list); + spin_unlock(&bdi_lock); + bdi->dev = dev; - bdi_debug_register(bdi, dev_name(dev)); + /* + * Just start the forker thread for our default backing_dev_info, + * and add other bdi's to the list. They will get a thread created + * on-demand when they need it. + */ + if (bdi_cap_flush_forker(bdi)) { + struct bdi_writeback *wb = &bdi->wb; + + wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + dev_name(dev)); + if (IS_ERR(wb->task)) { + wb->task = NULL; + ret = -ENOMEM; + + spin_lock(&bdi_lock); + list_del(&bdi->bdi_list); + spin_unlock(&bdi_lock); + goto exit; + } + } + + bdi_debug_register(bdi, dev_name(dev)); + set_bit(BDI_registered, &bdi->state); exit: return ret; } @@ -225,9 +546,42 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +/* + * Remove bdi from the global list and shutdown any threads we have running + */ +static void bdi_wb_shutdown(struct backing_dev_info *bdi) +{ + struct bdi_writeback *wb; + + if (!bdi_cap_writeback_dirty(bdi)) + return; + + /* + * If setup is pending, wait for that to complete first + */ + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); + + /* + * Make sure nobody finds us on the bdi_list anymore + */ + spin_lock(&bdi_lock); + list_del(&bdi->bdi_list); + spin_unlock(&bdi_lock); + + /* + * Finally, kill the kernel threads. We don't need to be RCU + * safe anymore, since the bdi is gone from visibility. + */ + list_for_each_entry(wb, &bdi->wb_list, list) + kthread_stop(wb->task); +} + void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + if (!bdi_cap_flush_forker(bdi)) + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; @@ -237,14 +591,25 @@ EXPORT_SYMBOL(bdi_unregister); int bdi_init(struct backing_dev_info *bdi) { - int i; - int err; + int i, err; bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; + spin_lock_init(&bdi->wb_lock); + INIT_LIST_HEAD(&bdi->bdi_list); + INIT_LIST_HEAD(&bdi->wb_list); + INIT_LIST_HEAD(&bdi->work_list); + + bdi_wb_init(&bdi->wb, bdi); + + /* + * Just one thread support for now, hard code mask and count + */ + bdi->wb_mask = 1; + bdi->wb_cnt = 1; for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); @@ -269,6 +634,8 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; + WARN_ON(bdi_has_dirty_io(bdi)); + bdi_unregister(bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) @@ -283,7 +650,6 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; - void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; @@ -308,18 +674,18 @@ EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested - * @rw: READ or WRITE + * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit * write congestion. If no backing_devs are congested then just wait for the * next write to be completed. */ -long congestion_wait(int rw, long timeout) +long congestion_wait(int sync, long timeout) { long ret; DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); diff --git a/mm/bootmem.c b/mm/bootmem.c index d2a9ce952768..555d5d2731c6 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -12,6 +12,7 @@ #include <linux/pfn.h> #include <linux/bootmem.h> #include <linux/module.h> +#include <linux/kmemleak.h> #include <asm/bug.h> #include <asm/io.h> @@ -335,6 +336,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, { unsigned long start, end; + kmemleak_free_part(__va(physaddr), size); + start = PFN_UP(physaddr); end = PFN_DOWN(physaddr + size); @@ -354,6 +357,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size) { unsigned long start, end; + kmemleak_free_part(__va(addr), size); + start = PFN_UP(addr); end = PFN_DOWN(addr + size); @@ -516,6 +521,11 @@ find_block: region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + start_off); memset(region, 0, size); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(region, size, 0, 0); return region; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d0351e31f474..cafdcee154e8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2370,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= blocks_per_huge_page(h); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e766e1da09d2..4ea4510e2996 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -92,21 +92,24 @@ #include <linux/string.h> #include <linux/nodemask.h> #include <linux/mm.h> +#include <linux/workqueue.h> #include <asm/sections.h> #include <asm/processor.h> #include <asm/atomic.h> +#include <linux/kmemcheck.h> #include <linux/kmemleak.h> /* * Kmemleak configuration and common defines. */ #define MAX_TRACE 16 /* stack trace length */ -#define REPORTS_NR 50 /* maximum number of reported leaks */ #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ #define SECS_FIRST_SCAN 60 /* delay before the first scan */ #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ +#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ +#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ #define BYTES_PER_POINTER sizeof(void *) @@ -120,6 +123,9 @@ struct kmemleak_scan_area { size_t length; }; +#define KMEMLEAK_GREY 0 +#define KMEMLEAK_BLACK -1 + /* * Structure holding the metadata for each allocated memory block. * Modifications to such objects should be made while holding the @@ -158,6 +164,17 @@ struct kmemleak_object { #define OBJECT_REPORTED (1 << 1) /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +/* flag set on newly allocated objects */ +#define OBJECT_NEW (1 << 3) + +/* number of bytes to print per line; must be 16 or 32 */ +#define HEX_ROW_SIZE 16 +/* number of bytes to print at a time (1, 2, 4, 8) */ +#define HEX_GROUP_SIZE 1 +/* include ASCII after the hex output */ +#define HEX_ASCII 1 +/* max number of lines to be printed */ +#define HEX_MAX_LINES 2 /* the list of all allocated objects */ static LIST_HEAD(object_list); @@ -196,9 +213,6 @@ static int kmemleak_stack_scan = 1; /* protects the memory scanning, parameters and debug/kmemleak file access */ static DEFINE_MUTEX(scan_mutex); -/* number of leaks reported (for limitation purposes) */ -static int reported_leaks; - /* * Early object allocation/freeing logging. Kmemleak is initialized after the * kernel allocator. However, both the kernel allocator and kmemleak may @@ -211,6 +225,7 @@ static int reported_leaks; enum { KMEMLEAK_ALLOC, KMEMLEAK_FREE, + KMEMLEAK_FREE_PART, KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, @@ -228,11 +243,14 @@ struct early_log { int min_count; /* minimum reference count */ unsigned long offset; /* scan area offset */ size_t length; /* scan area length */ + unsigned long trace[MAX_TRACE]; /* stack trace */ + unsigned int trace_len; /* stack trace length */ }; /* early logging buffer and current position */ -static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; -static int crt_early_log; +static struct early_log + early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; +static int crt_early_log __initdata; static void kmemleak_disable(void); @@ -255,6 +273,35 @@ static void kmemleak_disable(void); } while (0) /* + * Printing of the objects hex dump to the seq file. The number of lines to be + * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The + * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called + * with the object->lock held. + */ +static void hex_dump_object(struct seq_file *seq, + struct kmemleak_object *object) +{ + const u8 *ptr = (const u8 *)object->pointer; + int i, len, remaining; + unsigned char linebuf[HEX_ROW_SIZE * 5]; + + /* limit the number of lines to HEX_MAX_LINES */ + remaining = len = + min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); + + seq_printf(seq, " hex dump (first %d bytes):\n", len); + for (i = 0; i < len; i += HEX_ROW_SIZE) { + int linelen = min(remaining, HEX_ROW_SIZE); + + remaining -= HEX_ROW_SIZE; + hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE, + HEX_GROUP_SIZE, linebuf, sizeof(linebuf), + HEX_ASCII); + seq_printf(seq, " %s\n", linebuf); + } +} + +/* * Object colors, encoded with count and min_count: * - white - orphan object, not enough references to it (count < min_count) * - gray - not orphan, not marked as false positive (min_count == 0) or @@ -264,14 +311,21 @@ static void kmemleak_disable(void); * Newly created objects don't have any color assigned (object->count == -1) * before the next memory scan when they become white. */ -static int color_white(const struct kmemleak_object *object) +static bool color_white(const struct kmemleak_object *object) +{ + return object->count != KMEMLEAK_BLACK && + object->count < object->min_count; +} + +static bool color_gray(const struct kmemleak_object *object) { - return object->count != -1 && object->count < object->min_count; + return object->min_count != KMEMLEAK_BLACK && + object->count >= object->min_count; } -static int color_gray(const struct kmemleak_object *object) +static bool color_black(const struct kmemleak_object *object) { - return object->min_count != -1 && object->count >= object->min_count; + return object->min_count == KMEMLEAK_BLACK; } /* @@ -279,7 +333,7 @@ static int color_gray(const struct kmemleak_object *object) * not be deleted and have a minimum age to avoid false positives caused by * pointers temporarily stored in CPU registers. */ -static int unreferenced_object(struct kmemleak_object *object) +static bool unreferenced_object(struct kmemleak_object *object) { return (object->flags & OBJECT_ALLOCATED) && color_white(object) && time_before_eq(object->jiffies + jiffies_min_age, @@ -299,6 +353,7 @@ static void print_unreferenced(struct seq_file *seq, object->pointer, object->size); seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); + hex_dump_object(seq, object); seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { @@ -325,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); + pr_notice(" flags = 0x%lx\n", object->flags); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } @@ -429,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) } /* + * Save stack trace to the given array of MAX_TRACE size. + */ +static int __save_stack_trace(unsigned long *trace) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + + return stack_trace.nr_entries; +} + +/* * Create the metadata (struct kmemleak_object) corresponding to an allocated * memory block and add it to the object_list and object_tree_root. */ -static void create_object(unsigned long ptr, size_t size, int min_count, - gfp_t gfp) +static struct kmemleak_object *create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; struct prio_tree_node *node; - struct stack_trace trace; object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); if (!object) { kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); - return; + return NULL; } INIT_LIST_HEAD(&object->object_list); @@ -451,7 +522,7 @@ static void create_object(unsigned long ptr, size_t size, int min_count, INIT_HLIST_HEAD(&object->area_list); spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); - object->flags = OBJECT_ALLOCATED; + object->flags = OBJECT_ALLOCATED | OBJECT_NEW; object->pointer = ptr; object->size = size; object->min_count = min_count; @@ -477,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count, } /* kernel backtrace */ - trace.max_entries = MAX_TRACE; - trace.nr_entries = 0; - trace.entries = object->trace; - trace.skip = 1; - save_stack_trace(&trace); - object->trace_len = trace.nr_entries; + object->trace_len = __save_stack_trace(object->trace); INIT_PRIO_TREE_NODE(&object->tree_node); object->tree_node.start = ptr; object->tree_node.last = ptr + size - 1; write_lock_irqsave(&kmemleak_lock, flags); + min_addr = min(min_addr, ptr); max_addr = max(max_addr, ptr + size); node = prio_tree_insert(&object_tree_root, &object->tree_node); @@ -499,47 +566,36 @@ static void create_object(unsigned long ptr, size_t size, int min_count, * random memory blocks. */ if (node != &object->tree_node) { - unsigned long flags; - kmemleak_stop("Cannot insert 0x%lx into the object search tree " "(already existing)\n", ptr); object = lookup_object(ptr, 1); - spin_lock_irqsave(&object->lock, flags); + spin_lock(&object->lock); dump_object_info(object); - spin_unlock_irqrestore(&object->lock, flags); + spin_unlock(&object->lock); goto out; } list_add_tail_rcu(&object->object_list, &object_list); out: write_unlock_irqrestore(&kmemleak_lock, flags); + return object; } /* * Remove the metadata (struct kmemleak_object) for a memory block from the * object_list and object_tree_root and decrement its use_count. */ -static void delete_object(unsigned long ptr) +static void __delete_object(struct kmemleak_object *object) { unsigned long flags; - struct kmemleak_object *object; write_lock_irqsave(&kmemleak_lock, flags); - object = lookup_object(ptr, 0); - if (!object) { -#ifdef DEBUG - kmemleak_warn("Freeing unknown object at 0x%08lx\n", - ptr); -#endif - write_unlock_irqrestore(&kmemleak_lock, flags); - return; - } prio_tree_remove(&object_tree_root, &object->tree_node); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); WARN_ON(!(object->flags & OBJECT_ALLOCATED)); - WARN_ON(atomic_read(&object->use_count) < 1); + WARN_ON(atomic_read(&object->use_count) < 2); /* * Locking here also ensures that the corresponding memory block @@ -552,48 +608,115 @@ static void delete_object(unsigned long ptr) } /* - * Make a object permanently as gray-colored so that it can no longer be - * reported as a leak. This is used in general to mark a false positive. + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. */ -static void make_gray_object(unsigned long ptr) +static void delete_object_full(unsigned long ptr) { - unsigned long flags; struct kmemleak_object *object; object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); +#ifdef DEBUG + kmemleak_warn("Freeing unknown object at 0x%08lx\n", + ptr); +#endif return; } - - spin_lock_irqsave(&object->lock, flags); - object->min_count = 0; - spin_unlock_irqrestore(&object->lock, flags); + __delete_object(object); put_object(object); } /* - * Mark the object as black-colored so that it is ignored from scans and - * reporting. + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. If the memory block is partially freed, the function may create + * additional metadata for the remaining parts of the block. */ -static void make_black_object(unsigned long ptr) +static void delete_object_part(unsigned long ptr, size_t size) { - unsigned long flags; struct kmemleak_object *object; + unsigned long start, end; - object = find_and_get_object(ptr, 0); + object = find_and_get_object(ptr, 1); if (!object) { - kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr); +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx " + "(size %zu)\n", ptr, size); +#endif return; } + __delete_object(object); + + /* + * Create one or two objects that may result from the memory block + * split. Note that partial freeing is only done by free_bootmem() and + * this happens before kmemleak_init() is called. The path below is + * only executed during early log recording in kmemleak_init(), so + * GFP_KERNEL is enough. + */ + start = object->pointer; + end = object->pointer + object->size; + if (ptr > start) + create_object(start, ptr - start, object->min_count, + GFP_KERNEL); + if (ptr + size < end) + create_object(ptr + size, end - ptr - size, object->min_count, + GFP_KERNEL); + + put_object(object); +} + +static void __paint_it(struct kmemleak_object *object, int color) +{ + object->min_count = color; + if (color == KMEMLEAK_BLACK) + object->flags |= OBJECT_NO_SCAN; +} + +static void paint_it(struct kmemleak_object *object, int color) +{ + unsigned long flags; spin_lock_irqsave(&object->lock, flags); - object->min_count = -1; + __paint_it(object, color); spin_unlock_irqrestore(&object->lock, flags); +} + +static void paint_ptr(unsigned long ptr, int color) +{ + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Trying to color unknown object " + "at 0x%08lx as %s\n", ptr, + (color == KMEMLEAK_GREY) ? "Grey" : + (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); + return; + } + paint_it(object, color); put_object(object); } /* + * Make a object permanently as gray-colored so that it can no longer be + * reported as a leak. This is used in general to mark a false positive. + */ +static void make_gray_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_GREY); +} + +/* + * Mark the object as black-colored so that it is ignored from scans and + * reporting. + */ +static void make_black_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_BLACK); +} + +/* * Add a scanning area to the object. If at least one such area is added, * kmemleak will only scan these ranges rather than the whole memory block. */ @@ -662,14 +785,15 @@ static void object_no_scan(unsigned long ptr) * Log an early kmemleak_* call to the early_log buffer. These calls will be * processed later once kmemleak is fully initialized. */ -static void log_early(int op_type, const void *ptr, size_t size, - int min_count, unsigned long offset, size_t length) +static void __init log_early(int op_type, const void *ptr, size_t size, + int min_count, unsigned long offset, size_t length) { unsigned long flags; struct early_log *log; if (crt_early_log >= ARRAY_SIZE(early_log)) { - pr_warning("Early log buffer exceeded\n"); + pr_warning("Early log buffer exceeded, " + "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n"); kmemleak_disable(); return; } @@ -686,16 +810,45 @@ static void log_early(int op_type, const void *ptr, size_t size, log->min_count = min_count; log->offset = offset; log->length = length; + if (op_type == KMEMLEAK_ALLOC) + log->trace_len = __save_stack_trace(log->trace); crt_early_log++; local_irq_restore(flags); } /* + * Log an early allocated block and populate the stack trace. + */ +static void early_alloc(struct early_log *log) +{ + struct kmemleak_object *object; + unsigned long flags; + int i; + + if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + return; + + /* + * RCU locking needed to ensure object is not freed via put_object(). + */ + rcu_read_lock(); + object = create_object((unsigned long)log->ptr, log->size, + log->min_count, GFP_KERNEL); + spin_lock_irqsave(&object->lock, flags); + for (i = 0; i < log->trace_len; i++) + object->trace[i] = log->trace[i]; + object->trace_len = log->trace_len; + spin_unlock_irqrestore(&object->lock, flags); + rcu_read_unlock(); +} + +/* * Memory allocation function callback. This function is called from the * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, * vmalloc etc.). */ -void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) +void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) { pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); @@ -710,22 +863,37 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc); * Memory freeing function callback. This function is called from the kernel * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). */ -void kmemleak_free(const void *ptr) +void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) - delete_object((unsigned long)ptr); + delete_object_full((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); /* + * Partial memory freeing function callback. This function is usually called + * from bootmem allocator when (part of) a memory block is freed. + */ +void __ref kmemleak_free_part(const void *ptr, size_t size) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + delete_object_part((unsigned long)ptr, size); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free_part); + +/* * Mark an already allocated memory block as a false positive. This will cause * the block to no longer be reported as leak and always be scanned. */ -void kmemleak_not_leak(const void *ptr) +void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -741,7 +909,7 @@ EXPORT_SYMBOL(kmemleak_not_leak); * corresponding block is not a leak and does not contain any references to * other allocated memory blocks. */ -void kmemleak_ignore(const void *ptr) +void __ref kmemleak_ignore(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -755,8 +923,8 @@ EXPORT_SYMBOL(kmemleak_ignore); /* * Limit the range to be scanned in an allocated memory block. */ -void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, - gfp_t gfp) +void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, + size_t length, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -770,7 +938,7 @@ EXPORT_SYMBOL(kmemleak_scan_area); /* * Inform kmemleak not to scan the given memory block. */ -void kmemleak_no_scan(const void *ptr) +void __ref kmemleak_no_scan(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -807,20 +975,29 @@ static int scan_should_stop(void) * found to the gray list. */ static void scan_block(void *_start, void *_end, - struct kmemleak_object *scanned) + struct kmemleak_object *scanned, int allow_resched) { unsigned long *ptr; unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); for (ptr = start; ptr < end; ptr++) { - unsigned long flags; - unsigned long pointer = *ptr; struct kmemleak_object *object; + unsigned long flags; + unsigned long pointer; + if (allow_resched) + cond_resched(); if (scan_should_stop()) break; + /* don't scan uninitialized memory */ + if (!kmemcheck_is_obj_initialized((unsigned long)ptr, + BYTES_PER_POINTER)) + continue; + + pointer = *ptr; + object = find_and_get_object(pointer, 1); if (!object) continue; @@ -879,14 +1056,25 @@ static void scan_object(struct kmemleak_object *object) if (!(object->flags & OBJECT_ALLOCATED)) /* already freed object */ goto out; - if (hlist_empty(&object->area_list)) - scan_block((void *)object->pointer, - (void *)(object->pointer + object->size), object); - else + if (hlist_empty(&object->area_list)) { + void *start = (void *)object->pointer; + void *end = (void *)(object->pointer + object->size); + + while (start < end && (object->flags & OBJECT_ALLOCATED) && + !(object->flags & OBJECT_NO_SCAN)) { + scan_block(start, min(start + MAX_SCAN_SIZE, end), + object, 0); + start += MAX_SCAN_SIZE; + + spin_unlock_irqrestore(&object->lock, flags); + cond_resched(); + spin_lock_irqsave(&object->lock, flags); + } + } else hlist_for_each_entry(area, elem, &object->area_list, node) scan_block((void *)(object->pointer + area->offset), (void *)(object->pointer + area->offset - + area->length), object); + + area->length), object, 0); out: spin_unlock_irqrestore(&object->lock, flags); } @@ -900,9 +1088,9 @@ static void kmemleak_scan(void) { unsigned long flags; struct kmemleak_object *object, *tmp; - struct task_struct *task; int i; int new_leaks = 0; + int gray_list_pass = 0; jiffies_last_scan = jiffies; @@ -923,6 +1111,7 @@ static void kmemleak_scan(void) #endif /* reset the reference count (whiten the object) */ object->count = 0; + object->flags &= ~OBJECT_NEW; if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); @@ -931,14 +1120,14 @@ static void kmemleak_scan(void) rcu_read_unlock(); /* data/bss scanning */ - scan_block(_sdata, _edata, NULL); - scan_block(__bss_start, __bss_stop, NULL); + scan_block(_sdata, _edata, NULL, 1); + scan_block(__bss_start, __bss_stop, NULL, 1); #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) scan_block(__per_cpu_start + per_cpu_offset(i), - __per_cpu_end + per_cpu_offset(i), NULL); + __per_cpu_end + per_cpu_offset(i), NULL, 1); #endif /* @@ -960,19 +1149,21 @@ static void kmemleak_scan(void) /* only scan if page is in use */ if (page_count(page) == 0) continue; - scan_block(page, page + 1, NULL); + scan_block(page, page + 1, NULL, 1); } } /* - * Scanning the task stacks may introduce false negatives and it is - * not enabled by default. + * Scanning the task stacks (may introduce false negatives). */ if (kmemleak_stack_scan) { + struct task_struct *p, *g; + read_lock(&tasklist_lock); - for_each_process(task) - scan_block(task_stack_page(task), - task_stack_page(task) + THREAD_SIZE, NULL); + do_each_thread(g, p) { + scan_block(task_stack_page(p), task_stack_page(p) + + THREAD_SIZE, NULL, 0); + } while_each_thread(g, p); read_unlock(&tasklist_lock); } @@ -984,6 +1175,7 @@ static void kmemleak_scan(void) * kmemleak objects cannot be freed from outside the loop because their * use_count was increased. */ +repeat: object = list_entry(gray_list.next, typeof(*object), gray_list); while (&object->gray_list != &gray_list) { cond_resched(); @@ -1001,12 +1193,38 @@ static void kmemleak_scan(void) object = tmp; } + + if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) + goto scan_end; + + /* + * Check for new objects allocated during this scanning and add them + * to the gray list. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_NEW) && !color_black(object) && + get_object(object)) { + object->flags &= ~OBJECT_NEW; + list_add_tail(&object->gray_list, &gray_list); + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + if (!list_empty(&gray_list)) + goto repeat; + +scan_end: WARN_ON(!list_empty(&gray_list)); /* - * If scanning was stopped do not report any new unreferenced objects. + * If scanning was stopped or new objects were being allocated at a + * higher rate than gray list scanning, do not report any new + * unreferenced objects. */ - if (scan_should_stop()) + if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) return; /* @@ -1039,6 +1257,7 @@ static int kmemleak_scan_thread(void *arg) static int first_run = 1; pr_info("Automatic memory scanning thread started\n"); + set_user_nice(current, 10); /* * Wait before the first scan to allow the system to fully initialize. @@ -1069,7 +1288,7 @@ static int kmemleak_scan_thread(void *arg) * Start the automatic memory scanning thread. This function must be called * with the scan_mutex held. */ -void start_scan_thread(void) +static void start_scan_thread(void) { if (scan_thread) return; @@ -1084,7 +1303,7 @@ void start_scan_thread(void) * Stop the automatic memory scanning thread. This function must be called * with the scan_mutex held. */ -void stop_scan_thread(void) +static void stop_scan_thread(void) { if (scan_thread) { kthread_stop(scan_thread); @@ -1101,11 +1320,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) { struct kmemleak_object *object; loff_t n = *pos; + int err; - if (!n) - reported_leaks = 0; - if (reported_leaks >= REPORTS_NR) - return NULL; + err = mutex_lock_interruptible(&scan_mutex); + if (err < 0) + return ERR_PTR(err); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { @@ -1116,7 +1335,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) } object = NULL; out: - rcu_read_unlock(); return object; } @@ -1131,17 +1349,13 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct list_head *n = &prev_obj->object_list; ++(*pos); - if (reported_leaks >= REPORTS_NR) - goto out; - rcu_read_lock(); list_for_each_continue_rcu(n, &object_list) { next_obj = list_entry(n, struct kmemleak_object, object_list); if (get_object(next_obj)) break; } - rcu_read_unlock(); -out: + put_object(prev_obj); return next_obj; } @@ -1151,8 +1365,16 @@ out: */ static void kmemleak_seq_stop(struct seq_file *seq, void *v) { - if (v) - put_object(v); + if (!IS_ERR(v)) { + /* + * kmemleak_seq_start may return ERR_PTR if the scan_mutex + * waiting was interrupted, so only release it if !IS_ERR. + */ + rcu_read_unlock(); + mutex_unlock(&scan_mutex); + if (v) + put_object(v); + } } /* @@ -1164,10 +1386,8 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v) unsigned long flags; spin_lock_irqsave(&object->lock, flags); - if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) { + if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) print_unreferenced(seq, object); - reported_leaks++; - } spin_unlock_irqrestore(&object->lock, flags); return 0; } @@ -1181,36 +1401,58 @@ static const struct seq_operations kmemleak_seq_ops = { static int kmemleak_open(struct inode *inode, struct file *file) { - int ret = 0; - if (!atomic_read(&kmemleak_enabled)) return -EBUSY; - ret = mutex_lock_interruptible(&scan_mutex); - if (ret < 0) - goto out; - if (file->f_mode & FMODE_READ) { - ret = seq_open(file, &kmemleak_seq_ops); - if (ret < 0) - goto scan_unlock; - } - return ret; - -scan_unlock: - mutex_unlock(&scan_mutex); -out: - return ret; + return seq_open(file, &kmemleak_seq_ops); } static int kmemleak_release(struct inode *inode, struct file *file) { - int ret = 0; + return seq_release(inode, file); +} - if (file->f_mode & FMODE_READ) - seq_release(inode, file); - mutex_unlock(&scan_mutex); +static int dump_str_object_info(const char *str) +{ + unsigned long flags; + struct kmemleak_object *object; + unsigned long addr; - return ret; + addr= simple_strtoul(str, NULL, 0); + object = find_and_get_object(addr, 0); + if (!object) { + pr_info("Unknown object at 0x%08lx\n", addr); + return -EINVAL; + } + + spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + return 0; +} + +/* + * We use grey instead of black to ensure we can do future scans on the same + * objects. If we did not do future scans these black objects could + * potentially contain references to newly allocated objects in the future and + * we'd end up with false positives. + */ +static void kmemleak_clear(void) +{ + struct kmemleak_object *object; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && + unreferenced_object(object)) + __paint_it(object, KMEMLEAK_GREY); + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); } /* @@ -1224,21 +1466,26 @@ static int kmemleak_release(struct inode *inode, struct file *file) * scan=... - set the automatic memory scanning period in seconds (0 to * disable it) * scan - trigger a memory scan + * clear - mark all current reported unreferenced kmemleak objects as + * grey to ignore printing them + * dump=... - dump information about the object found at the given address */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, size_t size, loff_t *ppos) { char buf[64]; int buf_size; - - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; + int ret; buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; buf[buf_size] = 0; + ret = mutex_lock_interruptible(&scan_mutex); + if (ret < 0) + return ret; + if (strncmp(buf, "off", 3) == 0) kmemleak_disable(); else if (strncmp(buf, "stack=on", 8) == 0) @@ -1251,11 +1498,10 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, stop_scan_thread(); else if (strncmp(buf, "scan=", 5) == 0) { unsigned long secs; - int err; - err = strict_strtoul(buf + 5, 0, &secs); - if (err < 0) - return err; + ret = strict_strtoul(buf + 5, 0, &secs); + if (ret < 0) + goto out; stop_scan_thread(); if (secs) { jiffies_scan_wait = msecs_to_jiffies(secs * 1000); @@ -1263,8 +1509,17 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, } } else if (strncmp(buf, "scan", 4) == 0) kmemleak_scan(); + else if (strncmp(buf, "clear", 5) == 0) + kmemleak_clear(); + else if (strncmp(buf, "dump=", 5) == 0) + ret = dump_str_object_info(buf + 5); else - return -EINVAL; + ret = -EINVAL; + +out: + mutex_unlock(&scan_mutex); + if (ret < 0) + return ret; /* ignore the rest of the buffer, only one command at a time */ *ppos += size; @@ -1284,7 +1539,7 @@ static const struct file_operations kmemleak_fops = { * Perform the freeing of the kmemleak internal objects after waiting for any * current memory scan to complete. */ -static int kmemleak_cleanup_thread(void *arg) +static void kmemleak_do_cleanup(struct work_struct *work) { struct kmemleak_object *object; @@ -1293,25 +1548,12 @@ static int kmemleak_cleanup_thread(void *arg) rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) - delete_object(object->pointer); + delete_object_full(object->pointer); rcu_read_unlock(); mutex_unlock(&scan_mutex); - - return 0; } -/* - * Start the clean-up thread. - */ -static void kmemleak_cleanup(void) -{ - struct task_struct *cleanup_thread; - - cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, - "kmemleak-clean"); - if (IS_ERR(cleanup_thread)) - pr_warning("Failed to create the clean-up thread\n"); -} +static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); /* * Disable kmemleak. No memory allocation/freeing will be traced once this @@ -1329,7 +1571,7 @@ static void kmemleak_disable(void) /* check whether it is too early for a kernel thread */ if (atomic_read(&kmemleak_initialized)) - kmemleak_cleanup(); + schedule_work(&cleanup_work); pr_info("Kernel memory leak detector disabled\n"); } @@ -1382,12 +1624,14 @@ void __init kmemleak_init(void) switch (log->op_type) { case KMEMLEAK_ALLOC: - kmemleak_alloc(log->ptr, log->size, log->min_count, - GFP_KERNEL); + early_alloc(log); break; case KMEMLEAK_FREE: kmemleak_free(log->ptr); break; + case KMEMLEAK_FREE_PART: + kmemleak_free_part(log->ptr, log->size); + break; case KMEMLEAK_NOT_LEAK: kmemleak_not_leak(log->ptr); break; @@ -1423,7 +1667,7 @@ static int __init kmemleak_late_init(void) * after setting kmemleak_initialized and we may end up with * two clean-up threads but serialized by scan_mutex. */ - kmemleak_cleanup(); + schedule_work(&cleanup_work); return -ENOMEM; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2fa20dadf40..fd4529d86de5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ret = 0; out: unlock_page_cgroup(pc); + /* + * We charges against "to" which may not have any tasks. Then, "to" + * can be under rmdir(). But in current implementation, caller of + * this function is just force_empty() and it's garanteed that + * "to" is never removed. So, we don't check rmdir status here. + */ return ret; } @@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, return; if (!ptr) return; + cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); __mem_cgroup_commit_charge(ptr, pc, ctype); @@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, } rcu_read_unlock(); } - /* add this page(page_cgroup) to the LRU we want. */ - + /* + * At swapin, we may charge account against cgroup which has no tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&ptr->css); } void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) @@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, if (!mem) return; - + cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { target = oldpage; @@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, */ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) mem_cgroup_uncharge_page(target); + /* + * At migration, we may charge account against cgroup which has no tasks + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&mem->css); } /* @@ -1973,7 +1990,7 @@ try_to_free: if (!progress) { nr_retries--; /* maybe some writeback is necessary */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } } diff --git a/mm/memory.c b/mm/memory.c index 65216194eb8d..aede2ce3aba4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd) * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); - pte_free_tlb(tlb, token); + pte_free_tlb(tlb, token, addr); tlb->mm->nr_ptes--; } @@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - free_pte_range(tlb, pmd); + free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08e2c4da63a..7dd9d9f80694 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_semaphore for write. */ -static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) { - nodemask_t cpuset_context_nmask; int ret; /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; + /* Check N_HIGH_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); + mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); + nodes_and(nsc->mask2, *nodes, nsc->mask1); + if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else @@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) cpuset_current_mems_allowed; } - ret = mpol_ops[pol->mode].create(pol, - nodes ? &cpuset_context_nmask : NULL); + if (nodes) + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + else + ret = mpol_ops[pol->mode].create(pol, NULL); return ret; } @@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, { struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + NODEMASK_SCRATCH(scratch); int ret; - new = mpol_new(mode, flags, nodes); - if (IS_ERR(new)) - return PTR_ERR(new); + if (!scratch) + return -ENOMEM; + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } /* * prevent changing our mempolicy while show_numa_maps() * is using it. @@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, if (mm) down_write(&mm->mmap_sem); task_lock(current); - ret = mpol_set_nodemask(new, nodes); + ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); if (mm) up_write(&mm->mmap_sem); mpol_put(new); - return ret; + goto out; } old = current->mempolicy; current->mempolicy = new; @@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, up_write(&mm->mmap_sem); mpol_put(old); - return 0; + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; } /* @@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) return err; } - down_write(&mm->mmap_sem); - task_lock(current); - err = mpol_set_nodemask(new, nmask); - task_unlock(current); + { + NODEMASK_SCRATCH(scratch); + if (scratch) { + down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask, scratch); + task_unlock(current); + if (err) + up_write(&mm->mmap_sem); + } else + err = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + } if (err) { - up_write(&mm->mmap_sem); mpol_put(new); return err; } @@ -1891,6 +1911,7 @@ restart: * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { @@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + if (!scratch) + return; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) { mpol_put(mpol); /* drop our ref on sb mpol */ + NODEMASK_SCRATCH_FREE(scratch); return; /* no valid nodemask intersection */ } task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); mpol_put(mpol); /* drop our ref on sb mpol */ if (ret) { + NODEMASK_SCRATCH_FREE(scratch); mpol_put(new); return; } @@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ mpol_put(new); /* drop initial ref */ + NODEMASK_SCRATCH_FREE(scratch); } } @@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) err = 1; else { int ret; - - task_lock(current); - ret = mpol_set_nodemask(new, &nodes); - task_unlock(current); - if (ret) + NODEMASK_SCRATCH(scratch); + if (scratch) { + task_lock(current); + ret = mpol_set_nodemask(new, &nodes, scratch); + task_unlock(current); + } else + ret = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + if (ret) { err = 1; - else if (no_context) { + mpol_put(new); + } else if (no_context) { /* save for contextualization */ new->w.user_nodemask = nodes; } diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b4bb66..32e75d400503 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab); */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t)(long)pool_data; + size_t size = (size_t)pool_data; return kmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t) pool_data; + size_t size = (size_t)pool_data; return kzalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kzalloc); diff --git a/mm/mmap.c b/mm/mmap.c index 34579b23ebd5..8101de490c73 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/nommu.c b/mm/nommu.c index 53cab10fece4..66e81e7e9fe9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); @@ -922,6 +919,10 @@ static int validate_mmap_request(struct file *file, if (!file->f_op->read) capabilities &= ~BDI_CAP_MAP_COPY; + /* The file shall have been opened with read permission. */ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (flags & MAP_SHARED) { /* do checks for writing, appending and locking */ if ((prot & PROT_WRITE) && @@ -1351,6 +1352,7 @@ unsigned long do_mmap_pgoff(struct file *file, } vma->vm_region = region; + add_nommu_region(region); /* set up the mapping */ if (file && vma->vm_flags & VM_SHARED) @@ -1360,8 +1362,6 @@ unsigned long do_mmap_pgoff(struct file *file, if (ret < 0) goto error_put_region; - add_nommu_region(region); - /* okay... we have a mapping; now we have to register it */ result = vma->vm_start; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a99..a7b2460e922b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; - int oom_adj; task_lock(p); mm = p->mm; @@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } - oom_adj = mm->oom_adj; - if (oom_adj == OOM_DISABLE) { - task_unlock(p); - return 0; - } /* * The memory size of the process is the basis for the badness. @@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oom_adj. + * Adjust the score by oomkilladj. */ - if (oom_adj) { - if (oom_adj > 0) { + if (p->oomkilladj) { + if (p->oomkilladj > 0) { if (!points) points = 1; - points <<= oom_adj; + points <<= p->oomkilladj; } else - points >>= -(oom_adj); + points >>= -(p->oomkilladj); } #ifdef DEBUG @@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } + if (p->oomkilladj == OOM_DISABLE) + continue; + points = badness(p, uptime.tv_sec); - if (points > *ppoints) { + if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } @@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) + if (!p->mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); return; + } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; - task_lock(p); mm = p->mm; - if (!mm || mm->oom_adj == OOM_DISABLE) { - task_unlock(p); + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + + if (mm == NULL) return 1; - } - task_unlock(p); + + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + __oom_kill_task(p, 1); /* @@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - task_lock(current); printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->mm ? current->mm->oom_adj : OOM_DISABLE); + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly - * if its mm is still attached. */ - if (p->mm && (p->flags & PF_EXITING)) { + if (p->flags & PF_EXITING) { __oom_kill_task(p, 0); return 0; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7687879253b9..25e7770309b8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,15 +36,6 @@ #include <linux/pagevec.h> /* - * The maximum number of pages to writeout in a single bdflush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 - -/* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ @@ -117,8 +108,6 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -static void background_writeout(unsigned long _min_pages); - /* * Scale the writeback cache size proportional to the relative writeout speeds. * @@ -320,15 +309,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) /* * */ -static DEFINE_SPINLOCK(bdi_lock); static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; - unsigned long flags; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { @@ -340,27 +327,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) ret = -EINVAL; } } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock(&bdi_lock); return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { - unsigned long flags; int ret = 0; if (max_ratio > 100) return -EINVAL; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock(&bdi_lock); return ret; } @@ -546,7 +532,7 @@ static void balance_dirty_pages(struct address_space *mapping) * up. */ if (bdi_nr_reclaimable > bdi_thresh) { - writeback_inodes(&wbc); + writeback_inodes_wbc(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -575,7 +561,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - congestion_wait(WRITE, HZ/10); + schedule_timeout(1); } if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && @@ -594,10 +580,18 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && (global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS) - > background_thresh))) - pdflush_operation(background_writeout, 0); + (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) + > background_thresh))) { + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .nr_to_write = nr_writeback, + }; + + + bdi_start_writeback(&wbc); + } } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -669,7 +663,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The caller might hold locks which can prevent IO completion @@ -681,153 +675,35 @@ void throttle_vm_writeout(gfp_t gfp_mask) } } -/* - * writeback at least _min_pages, and keep writing until the amount of dirty - * memory is less than the background threshold, or until we're all clean. - */ -static void background_writeout(unsigned long _min_pages) -{ - long min_pages = _min_pages; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = 0, - .nonblocking = 1, - .range_cyclic = 1, - }; - - for ( ; ; ) { - unsigned long background_thresh; - unsigned long dirty_thresh; - - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) < background_thresh - && min_pages <= 0) - break; - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - wbc.pages_skipped = 0; - writeback_inodes(&wbc); - min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - /* Wrote less than expected */ - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; - } - } -} - -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. Returns 0 if a pdflush thread was dispatched. Returns - * -1 if all pdflush threads were busy. - */ -int wakeup_pdflush(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - return pdflush_operation(background_writeout, nr_pages); -} - -static void wb_timer_fn(unsigned long unused); static void laptop_timer_fn(unsigned long unused); -static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* - * Periodic writeback of "old" data. - * - * Define "old": the first time one of an inode's pages is dirtied, we mark the - * dirtying-time in the inode's address_space. So this periodic writeback code - * just walks the superblock inode list, writing back any inodes which are - * older than a specific point in time. - * - * Try to run once per dirty_writeback_interval. But if a writeback event - * takes longer than a dirty_writeback_interval interval, then leave a - * one-second gap. - * - * older_than_this takes precedence over nr_to_write. So we'll only write back - * all dirty pages if they are all attached to "old" mappings. - */ -static void wb_kupdate(unsigned long arg) -{ - unsigned long oldest_jif; - unsigned long start_jif; - unsigned long next_jif; - long nr_to_write; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = &oldest_jif, - .nr_to_write = 0, - .nonblocking = 1, - .for_kupdate = 1, - .range_cyclic = 1, - }; - - sync_supers(); - - oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - start_jif = jiffies; - next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); - nr_to_write = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - while (nr_to_write > 0) { - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - writeback_inodes(&wbc); - if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; /* All the old data is written */ - } - nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - } - if (time_before(next_jif, jiffies + HZ)) - next_jif = jiffies + HZ; - if (dirty_writeback_interval) - mod_timer(&wb_timer, next_jif); -} - -/* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + - msecs_to_jiffies(dirty_writeback_interval * 10)); - else - del_timer(&wb_timer); return 0; } -static void wb_timer_fn(unsigned long unused) -{ - if (pdflush_operation(wb_kupdate, 0) < 0) - mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ -} - -static void laptop_flush(unsigned long unused) +static void do_laptop_sync(struct work_struct *work) { - sys_sync(); + wakeup_flusher_threads(0); + kfree(work); } static void laptop_timer_fn(unsigned long unused) { - pdflush_operation(laptop_flush, 0); + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_laptop_sync); + schedule_work(work); + } } /* @@ -910,8 +786,6 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, - jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad7cd1c56b07..a0de15f46987 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -817,13 +817,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * agressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE) { + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { unsigned long pages; pages = move_freepages_block(zone, page, start_migratetype); /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1))) + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_migratetype); @@ -882,7 +884,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, int cold) { int i; @@ -901,7 +903,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - list_add(&page->lru, list); + if (likely(cold == 0)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); set_page_private(page, migratetype); list = &page->lru; } @@ -1119,7 +1124,8 @@ again: local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); if (unlikely(!pcp->count)) goto failed; } @@ -1138,7 +1144,8 @@ again: /* Allocate more to the pcp list if necessary */ if (unlikely(&page->lru == &pcp->list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); page = list_entry(pcp->list.next, struct page, lru); } @@ -1666,7 +1673,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1740,8 +1747,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * be using allocators in order of preference for an area that is * too large. */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) + if (order >= MAX_ORDER) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); return NULL; + } /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and @@ -1789,6 +1798,10 @@ rebalance: if (p->flags & PF_MEMALLOC) goto nopage; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -1831,7 +1844,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto rebalance; } @@ -2533,7 +2546,6 @@ static void build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); - memset(node_load, 0, sizeof(node_load)); memset(node_order, 0, sizeof(node_order)); j = 0; @@ -2642,6 +2654,9 @@ static int __build_all_zonelists(void *dummy) { int nid; +#ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +#endif for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -4745,8 +4760,10 @@ void *__init alloc_large_system_hash(const char *tablename, * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - if (get_order(size) < MAX_ORDER) + if (get_order(size) < MAX_ORDER) { table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); + } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -4764,16 +4781,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; - /* - * If hashdist is set, the table allocation is done with __vmalloc() - * which invokes the kmemleak_alloc() callback. This function may also - * be called before the slab and kmemleak are initialised when - * kmemleak simply buffers the request to be executed later - * (GFP_ATOMIC flag ignored in this case). - */ - if (!hashdist) - kmemleak_alloc(table, size, 1, GFP_ATOMIC); - return table; } diff --git a/mm/pdflush.c b/mm/pdflush.c deleted file mode 100644 index 235ac440c44e..000000000000 --- a/mm/pdflush.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * mm/pdflush.c - worker threads for writing back filesystem data - * - * Copyright (C) 2002, Linus Torvalds. - * - * 09Apr2002 Andrew Morton - * Initial version - * 29Feb2004 kaos@sgi.com - * Move worker thread creation to kthread to avoid chewing - * up stack space with nested calls to kernel_thread. - */ - -#include <linux/sched.h> -#include <linux/list.h> -#include <linux/signal.h> -#include <linux/spinlock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/fs.h> /* Needed by writeback.h */ -#include <linux/writeback.h> /* Prototypes pdflush_operation() */ -#include <linux/kthread.h> -#include <linux/cpuset.h> -#include <linux/freezer.h> - - -/* - * Minimum and maximum number of pdflush instances - */ -#define MIN_PDFLUSH_THREADS 2 -#define MAX_PDFLUSH_THREADS 8 - -static void start_one_pdflush_thread(void); - - -/* - * The pdflush threads are worker threads for writing back dirty data. - * Ideally, we'd like one thread per active disk spindle. But the disk - * topology is very hard to divine at this level. Instead, we take - * care in various places to prevent more than one pdflush thread from - * performing writeback against a single filesystem. pdflush threads - * have the PF_FLUSHER flag set in current->flags to aid in this. - */ - -/* - * All the pdflush threads. Protected by pdflush_lock - */ -static LIST_HEAD(pdflush_list); -static DEFINE_SPINLOCK(pdflush_lock); - -/* - * The count of currently-running pdflush threads. Protected - * by pdflush_lock. - * - * Readable by sysctl, but not writable. Published to userspace at - * /proc/sys/vm/nr_pdflush_threads. - */ -int nr_pdflush_threads = 0; - -/* - * The time at which the pdflush thread pool last went empty - */ -static unsigned long last_empty_jifs; - -/* - * The pdflush thread. - * - * Thread pool management algorithm: - * - * - The minimum and maximum number of pdflush instances are bound - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. - * - * - If there have been no idle pdflush instances for 1 second, create - * a new one. - * - * - If the least-recently-went-to-sleep pdflush thread has been asleep - * for more than one second, terminate a thread. - */ - -/* - * A structure for passing work to a pdflush thread. Also for passing - * state information between pdflush threads. Protected by pdflush_lock. - */ -struct pdflush_work { - struct task_struct *who; /* The thread */ - void (*fn)(unsigned long); /* A callback function */ - unsigned long arg0; /* An argument to the callback */ - struct list_head list; /* On pdflush_list, when idle */ - unsigned long when_i_went_to_sleep; -}; - -static int __pdflush(struct pdflush_work *my_work) -{ - current->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - my_work->fn = NULL; - my_work->who = current; - INIT_LIST_HEAD(&my_work->list); - - spin_lock_irq(&pdflush_lock); - for ( ; ; ) { - struct pdflush_work *pdf; - - set_current_state(TASK_INTERRUPTIBLE); - list_move(&my_work->list, &pdflush_list); - my_work->when_i_went_to_sleep = jiffies; - spin_unlock_irq(&pdflush_lock); - schedule(); - try_to_freeze(); - spin_lock_irq(&pdflush_lock); - if (!list_empty(&my_work->list)) { - /* - * Someone woke us up, but without removing our control - * structure from the global list. swsusp will do this - * in try_to_freeze()->refrigerator(). Handle it. - */ - my_work->fn = NULL; - continue; - } - if (my_work->fn == NULL) { - printk("pdflush: bogus wakeup\n"); - continue; - } - spin_unlock_irq(&pdflush_lock); - - (*my_work->fn)(my_work->arg0); - - spin_lock_irq(&pdflush_lock); - - /* - * Thread creation: For how long have there been zero - * available threads? - * - * To throttle creation, we reset last_empty_jifs. - */ - if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list)) { - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); - } - } - } - - my_work->fn = NULL; - - /* - * Thread destruction: For how long has the sleepiest - * thread slept? - */ - if (list_empty(&pdflush_list)) - continue; - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) - continue; - pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); - if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { - /* Limit exit rate */ - pdf->when_i_went_to_sleep = jiffies; - break; /* exeunt */ - } - } - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - return 0; -} - -/* - * Of course, my_work wants to be just a local in __pdflush(). It is - * separated out in this manner to hopefully prevent the compiler from - * performing unfortunate optimisations against the auto variables. Because - * these are visible to other tasks and CPUs. (No problem has actually - * been observed. This is just paranoia). - */ -static int pdflush(void *dummy) -{ - struct pdflush_work my_work; - cpumask_var_t cpus_allowed; - - /* - * Since the caller doesn't even check kthread_run() worked, let's not - * freak out too much if this fails. - */ - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - printk(KERN_WARNING "pdflush failed to allocate cpumask\n"); - return 0; - } - - /* - * pdflush can spend a lot of time doing encryption via dm-crypt. We - * don't want to do that at keventd's priority. - */ - set_user_nice(current, 0); - - /* - * Some configs put our parent kthread in a limited cpuset, - * which kthread() overrides, forcing cpus_allowed == cpu_all_mask. - * Our needs are more modest - cut back to our cpusets cpus_allowed. - * This is needed as pdflush's are dynamically created and destroyed. - * The boottime pdflush's are easily placed w/o these 2 lines. - */ - cpuset_cpus_allowed(current, cpus_allowed); - set_cpus_allowed_ptr(current, cpus_allowed); - free_cpumask_var(cpus_allowed); - - return __pdflush(&my_work); -} - -/* - * Attempt to wake up a pdflush thread, and get it to do some work for you. - * Returns zero if it indeed managed to find a worker thread, and passed your - * payload to it. - */ -int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) -{ - unsigned long flags; - int ret = 0; - - BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ - - spin_lock_irqsave(&pdflush_lock, flags); - if (list_empty(&pdflush_list)) { - ret = -1; - } else { - struct pdflush_work *pdf; - - pdf = list_entry(pdflush_list.next, struct pdflush_work, list); - list_del_init(&pdf->list); - if (list_empty(&pdflush_list)) - last_empty_jifs = jiffies; - pdf->fn = fn; - pdf->arg0 = arg0; - wake_up_process(pdf->who); - } - spin_unlock_irqrestore(&pdflush_lock, flags); - - return ret; -} - -static void start_one_pdflush_thread(void) -{ - struct task_struct *k; - - k = kthread_run(pdflush, NULL, "pdflush"); - if (unlikely(IS_ERR(k))) { - spin_lock_irq(&pdflush_lock); - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - } -} - -static int __init pdflush_init(void) -{ - int i; - - /* - * Pre-set nr_pdflush_threads... If we fail to create, - * the count will be decremented. - */ - nr_pdflush_threads = MIN_PDFLUSH_THREADS; - - for (i = 0; i < MIN_PDFLUSH_THREADS; i++) - start_one_pdflush_thread(); - return 0; -} - -module_init(pdflush_init); diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd8853..3311c8919f37 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,12 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of nr_cpu_ids units and the first chunk is used + * for static percpu variables in the kernel image (special boot time + * alloc/init handling necessary as these areas need to be brought up + * before allocation services are running). Unit grows as necessary + * and all units grow or shrink in unison. When a chunk is filled up, + * another chunk is allocated. ie. in vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -197,7 +197,12 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, int page_idx) { - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; + /* + * Any possible cpu id can be used here, so there's no need to + * worry about preemption or cpu hotplug. + */ + return *pcpu_chunk_pagep(chunk, raw_smp_processor_id(), + page_idx) != NULL; } /* set the pointer to a chunk in a page struct */ @@ -297,6 +302,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_first_chunk; } + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. + */ + addr += raw_smp_processor_id() * pcpu_unit_size; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -558,7 +571,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, bool flush_tlb) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; /* unmap must not be done on immutable chunk */ @@ -643,7 +656,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, */ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; int err; @@ -749,7 +762,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->page = chunk->page_ar; - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); if (!chunk->vm) { free_pcpu_chunk(chunk); return NULL; @@ -1067,9 +1080,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@ -1248,7 +1261,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } else pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - chunk_size = pcpue_unit_size * num_possible_cpus(); + chunk_size = pcpue_unit_size * nr_cpu_ids; pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); @@ -1259,12 +1272,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /* return the leftover and copy */ - for_each_possible_cpu(cpu) { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); + if (cpu_possible(cpu)) { + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } else + free_bootmem(__pa(ptr), pcpue_unit_size); } /* we're ready, commit */ diff --git a/mm/rmap.c b/mm/rmap.c index 836c6c63e1f2..0895b5c7cbff 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -358,6 +358,7 @@ static int page_referenced_one(struct page *page, */ if (vma->vm_flags & VM_LOCKED) { *mapcount = 1; /* break early from loop */ + *vm_flags |= VM_LOCKED; goto out_unmap; } diff --git a/mm/shmem.c b/mm/shmem.c index d713239ce2ce..5a0b3d4055f3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2446,7 +2446,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2469,7 +2469,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2480,7 +2480,7 @@ static const struct inode_operations shmem_special_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 606a8e757a42..df2c87fdae50 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir) /** * shmem_check_acl - check_acl() callback for generic_permission() */ -static int +int shmem_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); @@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask) } return -EAGAIN; } - -/** - * shmem_permission - permission() inode operation - */ -int -shmem_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, shmem_check_acl); -} diff --git a/mm/slob.c b/mm/slob.c index 9641da3d5e58..837ebd64cc34 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -692,3 +692,8 @@ void __init kmem_cache_init(void) { slob_ready = 1; } + +void __init kmem_cache_init_late(void) +{ + /* Nothing to do */ +} diff --git a/mm/slub.c b/mm/slub.c index b1cb2dfa109a..417ed843b251 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -21,7 +21,6 @@ #include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> -#include <linux/kmemleak.h> #include <linux/mempolicy.h> #include <linux/ctype.h> #include <linux/debugobjects.h> @@ -1127,8 +1126,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) } if (kmemcheck_enabled - && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) - { + && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); @@ -2023,7 +2021,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects --; + min_objects--; } /* @@ -2629,8 +2627,6 @@ static inline int kmem_cache_close(struct kmem_cache *s) */ void kmem_cache_destroy(struct kmem_cache *s) { - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); down_write(&slub_lock); s->refcount--; if (!s->refcount) { @@ -2641,6 +2637,8 @@ void kmem_cache_destroy(struct kmem_cache *s) "still has objects.\n", s->name, __func__); dump_stack(); } + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); sysfs_slab_remove(s); } else up_write(&slub_lock); @@ -2874,13 +2872,15 @@ EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; + void *ptr = NULL; flags |= __GFP_COMP | __GFP_NOTRACK; page = alloc_pages_node(node, flags, get_order(size)); if (page) - return page_address(page); - else - return NULL; + ptr = page_address(page); + + kmemleak_alloc(ptr, size, 1, flags); + return ptr; } #ifdef CONFIG_NUMA @@ -2965,6 +2965,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); + kmemleak_free(x); put_page(page); return; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 42cd38eba79f..5ae6b8b78c80 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { + .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; diff --git a/mm/swapfile.c b/mm/swapfile.c index d1ade1a48ee7..8ffdc0d23c53 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) if (!bdev) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); return i; @@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); bdput(bdev); diff --git a/mm/vmscan.c b/mm/vmscan.c index 54155268dfca..ba8228e0a806 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -630,9 +630,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, referenced = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); - /* In active use or really unfreeable? Activate it. */ + /* + * In active use or really unfreeable? Activate it. + * If page which have PG_mlocked lost isoltation race, + * try_to_unmap moves it to unevictable list + */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + referenced && page_mapping_inuse(page) + && !(vm_flags & VM_LOCKED)) goto activate_locked; /* @@ -1104,7 +1109,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, */ if (nr_freed < nr_taken && !current_is_kswapd() && lumpy_reclaim) { - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The attempt at page out may have made some @@ -1715,13 +1720,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ if (total_scanned > sc->swap_cluster_max + sc->swap_cluster_max / 2) { - wakeup_pdflush(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scanning_global_lru(sc)) @@ -1960,7 +1965,7 @@ loop_again: * another pass across the zones. */ if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * We do this so kswapd doesn't build up large priorities for @@ -2233,7 +2238,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); } } |