From 4b0ece6fa0167b22c004ff69e137dc94ee2e469e Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 31 Mar 2017 15:11:44 -0700 Subject: mm: migrate: fix remove_migration_pte() for ksm pages I found that calling page migration for ksm pages causes the following bug: page:ffffea0004d51180 count:2 mapcount:2 mapping:ffff88013c785141 index:0x913 flags: 0x57ffffc0040068(uptodate|lru|active|swapbacked) raw: 0057ffffc0040068 ffff88013c785141 0000000000000913 0000000200000001 raw: ffffea0004d5f9e0 ffffea0004d53f60 0000000000000000 ffff88007d81b800 page dumped because: VM_BUG_ON_PAGE(!PageLocked(page)) page->mem_cgroup:ffff88007d81b800 ------------[ cut here ]------------ kernel BUG at /src/linux-dev/mm/rmap.c:1086! invalid opcode: 0000 [#1] SMP Modules linked in: ppdev parport_pc virtio_balloon i2c_piix4 pcspkr parport i2c_core acpi_cpufreq ip_tables xfs libcrc32c ata_generic pata_acpi ata_piix 8139too libata virtio_blk 8139cp crc32c_intel mii virtio_pci virtio_ring serio_raw virtio floppy dm_mirror dm_region_hash dm_log dm_mod CPU: 0 PID: 3162 Comm: bash Not tainted 4.11.0-rc2-mm1+ #1 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:do_page_add_anon_rmap+0x1ba/0x260 RSP: 0018:ffffc90002473b30 EFLAGS: 00010282 RAX: 0000000000000021 RBX: ffffea0004d51180 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000000082 RDI: ffff88007dc0dfe0 RBP: ffffc90002473b58 R08: 00000000fffffffe R09: 00000000000001c1 R10: 0000000000000005 R11: 00000000000001c0 R12: ffff880139ab3d80 R13: 0000000000000000 R14: 0000700000000200 R15: 0000160000000000 FS: 00007f5195f50740(0000) GS:ffff88007dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fd450287000 CR3: 000000007a08e000 CR4: 00000000001406f0 Call Trace: page_add_anon_rmap+0x18/0x20 remove_migration_pte+0x220/0x2c0 rmap_walk_ksm+0x143/0x220 rmap_walk+0x55/0x60 remove_migration_ptes+0x53/0x80 migrate_pages+0x8ed/0xb60 soft_offline_page+0x309/0x8d0 store_soft_offline_page+0xaf/0xf0 dev_attr_store+0x18/0x30 sysfs_kf_write+0x3a/0x50 kernfs_fop_write+0xff/0x180 __vfs_write+0x37/0x160 vfs_write+0xb2/0x1b0 SyS_write+0x55/0xc0 do_syscall_64+0x67/0x180 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7f51956339e0 RSP: 002b:00007ffcfa0dffc8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 000000000000000c RCX: 00007f51956339e0 RDX: 000000000000000c RSI: 00007f5195f53000 RDI: 0000000000000001 RBP: 00007f5195f53000 R08: 000000000000000a R09: 00007f5195f50740 R10: 000000000000000b R11: 0000000000000246 R12: 00007f5195907400 R13: 000000000000000c R14: 0000000000000001 R15: 0000000000000000 Code: fe ff ff 48 81 c2 00 02 00 00 48 89 55 d8 e8 2e c3 fd ff 48 8b 55 d8 e9 42 ff ff ff 48 c7 c6 e0 52 a1 81 48 89 df e8 46 ad fe ff <0f> 0b 48 83 e8 01 e9 7f fe ff ff 48 83 e8 01 e9 96 fe ff ff 48 RIP: do_page_add_anon_rmap+0x1ba/0x260 RSP: ffffc90002473b30 ---[ end trace a679d00f4af2df48 ]--- Kernel panic - not syncing: Fatal exception Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception The problem is in the following lines: new = page - pvmw.page->index + linear_page_index(vma, pvmw.address); The 'new' is calculated with 'page' which is given by the caller as a destination page and some offset adjustment for thp. But this doesn't properly work for ksm pages because pvmw.page->index doesn't change for each address but linear_page_index() changes, which means that 'new' points to different pages for each addresses backed by the ksm page. As a result, we try to set totally unrelated pages as destination pages, and that causes kernel crash. This patch fixes the miscalculation and makes ksm page migration work fine. Fixes: 3fe87967c536 ("mm: convert remove_migration_pte() to use page_vma_mapped_walk()") Link: http://lkml.kernel.org/r/1489717683-29905-1-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Mel Gorman Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 9a0897a14d37..ed97c2c14fa8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -209,8 +209,11 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageTail(page), page); while (page_vma_mapped_walk(&pvmw)) { - new = page - pvmw.page->index + - linear_page_index(vma, pvmw.address); + if (PageKsm(page)) + new = page; + else + new = page - pvmw.page->index + + linear_page_index(vma, pvmw.address); get_page(new); pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); -- cgit v1.2.3 From 597b7305dd8bafdb3aef4957d97128bc90af8e9f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 31 Mar 2017 15:11:47 -0700 Subject: mm: move mm_percpu_wq initialization earlier Yang Li has reported that drain_all_pages triggers a WARN_ON which means that this function is called earlier than the mm_percpu_wq is initialized on arm64 with CMA configured: WARNING: CPU: 2 PID: 1 at mm/page_alloc.c:2423 drain_all_pages+0x244/0x25c Modules linked in: CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.11.0-rc1-next-20170310-00027-g64dfbc5 #127 Hardware name: Freescale Layerscape 2088A RDB Board (DT) task: ffffffc07c4a6d00 task.stack: ffffffc07c4a8000 PC is at drain_all_pages+0x244/0x25c LR is at start_isolate_page_range+0x14c/0x1f0 [...] drain_all_pages+0x244/0x25c start_isolate_page_range+0x14c/0x1f0 alloc_contig_range+0xec/0x354 cma_alloc+0x100/0x1fc dma_alloc_from_contiguous+0x3c/0x44 atomic_pool_init+0x7c/0x208 arm64_dma_init+0x44/0x4c do_one_initcall+0x38/0x128 kernel_init_freeable+0x1a0/0x240 kernel_init+0x10/0xfc ret_from_fork+0x10/0x20 Fix this by moving the whole setup_vmstat which is an initcall right now to init_mm_internals which will be called right after the WQ subsystem is initialized. Link: http://lkml.kernel.org/r/20170315164021.28532-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Yang Li Tested-by: Yang Li Tested-by: Xiaolong Ye Cc: Mel Gorman Cc: Vlastimil Babka Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ init/main.c | 2 ++ mm/vmstat.c | 4 +--- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f01c88f0800..00a8fa7e366a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,8 @@ struct user_struct; struct writeback_control; struct bdi_writeback; +void init_mm_internals(void); + #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; diff --git a/init/main.c b/init/main.c index f9c9d9948203..b0c11cbf5ddf 100644 --- a/init/main.c +++ b/init/main.c @@ -1022,6 +1022,8 @@ static noinline void __init kernel_init_freeable(void) workqueue_init(); + init_mm_internals(); + do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/mm/vmstat.c b/mm/vmstat.c index b1947f0cbee2..89f95396ec46 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1764,7 +1764,7 @@ static int vmstat_cpu_dead(unsigned int cpu) #endif -static int __init setup_vmstat(void) +void __init init_mm_internals(void) { #ifdef CONFIG_SMP int ret; @@ -1792,9 +1792,7 @@ static int __init setup_vmstat(void) proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #endif - return 0; } -module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) -- cgit v1.2.3 From 553af430e7c981e6e8fa5007c5b7b5773acc63dd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 31 Mar 2017 15:11:50 -0700 Subject: mm: rmap: fix huge file mmap accounting in the memcg stats Huge pages are accounted as single units in the memcg's "file_mapped" counter. Account the correct number of base pages, like we do in the corresponding node counter. Link: http://lkml.kernel.org/r/20170322005111.3156-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/rmap.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5af377303880..bb7250c45cb8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -740,6 +740,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, + int nr) +{ +} + static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_stat_index idx) { diff --git a/mm/rmap.c b/mm/rmap.c index 49ed681ccc7b..f6838015810f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1159,7 +1159,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1199,7 +1199,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); -- cgit v1.2.3 From 0cefabdaf757a6455d75f00cb76874e62703ed18 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 31 Mar 2017 15:11:52 -0700 Subject: mm: workingset: fix premature shadow node shrinking with cgroups Commit 0a6b76dd23fa ("mm: workingset: make shadow node shrinker memcg aware") enabled cgroup-awareness in the shadow node shrinker, but forgot to also enable cgroup-awareness in the list_lru the shadow nodes sit on. Consequently, all shadow nodes are sitting on a global (per-NUMA node) list, while the shrinker applies the limits according to the amount of cache in the cgroup its shrinking. The result is excessive pressure on the shadow nodes from cgroups that have very little cache. Enable memcg-mode on the shadow node LRUs, such that per-cgroup limits are applied to per-cgroup lists. Fixes: 0a6b76dd23fa ("mm: workingset: make shadow node shrinker memcg aware") Link: http://lkml.kernel.org/r/20170322005320.8165-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/workingset.c b/mm/workingset.c index ac839fca0e76..eda05c71fa49 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -532,7 +532,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key); + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); -- cgit v1.2.3 From c9d398fa237882ea07167e23bcfc5e6847066518 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 31 Mar 2017 15:11:55 -0700 Subject: mm, hugetlb: use pte_present() instead of pmd_present() in follow_huge_pmd() I found the race condition which triggers the following bug when move_pages() and soft offline are called on a single hugetlb page concurrently. Soft offlining page 0x119400 at 0x700000000000 BUG: unable to handle kernel paging request at ffffea0011943820 IP: follow_huge_pmd+0x143/0x190 PGD 7ffd2067 PUD 7ffd1067 PMD 0 [61163.582052] Oops: 0000 [#1] SMP Modules linked in: binfmt_misc ppdev virtio_balloon parport_pc pcspkr i2c_piix4 parport i2c_core acpi_cpufreq ip_tables xfs libcrc32c ata_generic pata_acpi virtio_blk 8139too crc32c_intel ata_piix serio_raw libata virtio_pci 8139cp virtio_ring virtio mii floppy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: cap_check] CPU: 0 PID: 22573 Comm: iterate_numa_mo Tainted: P OE 4.11.0-rc2-mm1+ #2 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:follow_huge_pmd+0x143/0x190 RSP: 0018:ffffc90004bdbcd0 EFLAGS: 00010202 RAX: 0000000465003e80 RBX: ffffea0004e34d30 RCX: 00003ffffffff000 RDX: 0000000011943800 RSI: 0000000000080001 RDI: 0000000465003e80 RBP: ffffc90004bdbd18 R08: 0000000000000000 R09: ffff880138d34000 R10: ffffea0004650000 R11: 0000000000c363b0 R12: ffffea0011943800 R13: ffff8801b8d34000 R14: ffffea0000000000 R15: 000077ff80000000 FS: 00007fc977710740(0000) GS:ffff88007dc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffea0011943820 CR3: 000000007a746000 CR4: 00000000001406f0 Call Trace: follow_page_mask+0x270/0x550 SYSC_move_pages+0x4ea/0x8f0 SyS_move_pages+0xe/0x10 do_syscall_64+0x67/0x180 entry_SYSCALL64_slow_path+0x25/0x25 RIP: 0033:0x7fc976e03949 RSP: 002b:00007ffe72221d88 EFLAGS: 00000246 ORIG_RAX: 0000000000000117 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc976e03949 RDX: 0000000000c22390 RSI: 0000000000001400 RDI: 0000000000005827 RBP: 00007ffe72221e00 R08: 0000000000c2c3a0 R09: 0000000000000004 R10: 0000000000c363b0 R11: 0000000000000246 R12: 0000000000400650 R13: 00007ffe72221ee0 R14: 0000000000000000 R15: 0000000000000000 Code: 81 e4 ff ff 1f 00 48 21 c2 49 c1 ec 0c 48 c1 ea 0c 4c 01 e2 49 bc 00 00 00 00 00 ea ff ff 48 c1 e2 06 49 01 d4 f6 45 bc 04 74 90 <49> 8b 7c 24 20 40 f6 c7 01 75 2b 4c 89 e7 8b 47 1c 85 c0 7e 2a RIP: follow_huge_pmd+0x143/0x190 RSP: ffffc90004bdbcd0 CR2: ffffea0011943820 ---[ end trace e4f81353a2d23232 ]--- Kernel panic - not syncing: Fatal exception Kernel Offset: disabled This bug is triggered when pmd_present() returns true for non-present hugetlb, so fixing the present check in follow_huge_pmd() prevents it. Using pmd_present() to determine present/non-present for hugetlb is not correct, because pmd_present() checks multiple bits (not only _PAGE_PRESENT) for historical reason and it can misjudge hugetlb state. Fixes: e66f17ff7177 ("mm/hugetlb: take page table lock in follow_huge_pmd()") Link: http://lkml.kernel.org/r/1490149898-20231-1-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Acked-by: Hillf Danton Cc: Hugh Dickins Cc: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Christian Borntraeger Cc: Gerald Schaefer Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3d0aab9ee80d..f501f14f14ce 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4651,6 +4651,7 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, { struct page *page = NULL; spinlock_t *ptl; + pte_t pte; retry: ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); @@ -4660,12 +4661,13 @@ retry: */ if (!pmd_huge(*pmd)) goto out; - if (pmd_present(*pmd)) { + pte = huge_ptep_get((pte_t *)pmd); + if (pte_present(pte)) { page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); if (flags & FOLL_GET) get_page(page); } else { - if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { + if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); __migration_entry_wait(mm, (pte_t *)pmd, ptl); goto retry; -- cgit v1.2.3 From 906f2a51c941e251ca196d5128953d9899a608ef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 31 Mar 2017 15:11:58 -0700 Subject: mm: fix section name for .data..ro_after_init A section name for .data..ro_after_init was added by both: commit d07a980c1b8d ("s390: add proper __ro_after_init support") and commit d7c19b066dcf ("mm: kmemleak: scan .data.ro_after_init") The latter adds incorrect wrapping around the existing s390 section, and came later. I'd prefer the s390 naming, so this moves the s390-specific name up to the asm-generic/sections.h and renames the section as used by kmemleak (and in the future, kernel/extable.c). Link: http://lkml.kernel.org/r/20170327192213.GA129375@beast Signed-off-by: Kees Cook Acked-by: Heiko Carstens [s390 parts] Acked-by: Jakub Kicinski Cc: Eddie Kovsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/sections.h | 1 - arch/s390/kernel/vmlinux.lds.S | 2 -- include/asm-generic/sections.h | 6 +++--- include/asm-generic/vmlinux.lds.h | 4 ++-- mm/kmemleak.c | 2 +- 5 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 5ce29fe100ba..fbd9116eb17b 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -4,6 +4,5 @@ #include extern char _eshared[], _ehead[]; -extern char __start_ro_after_init[], __end_ro_after_init[]; #endif diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 5ccf95396251..72307f108c40 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -63,11 +63,9 @@ SECTIONS . = ALIGN(PAGE_SIZE); __start_ro_after_init = .; - __start_data_ro_after_init = .; .data..ro_after_init : { *(.data..ro_after_init) } - __end_data_ro_after_init = .; EXCEPTION_TABLE(16) . = ALIGN(PAGE_SIZE); __end_ro_after_init = .; diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 4df64a1fc09e..532372c6cf15 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -14,8 +14,8 @@ * [_sdata, _edata]: contains .data.* sections, may also contain .rodata.* * and/or .init.* sections. * [__start_rodata, __end_rodata]: contains .rodata.* sections - * [__start_data_ro_after_init, __end_data_ro_after_init]: - * contains data.ro_after_init section + * [__start_ro_after_init, __end_ro_after_init]: + * contains .data..ro_after_init section * [__init_begin, __init_end]: contains .init.* sections, but .init.text.* * may be out of this range on some architectures. * [_sinittext, _einittext]: contains .init.text.* sections @@ -33,7 +33,7 @@ extern char _data[], _sdata[], _edata[]; extern char __bss_start[], __bss_stop[]; extern char __init_begin[], __init_end[]; extern char _sinittext[], _einittext[]; -extern char __start_data_ro_after_init[], __end_data_ro_after_init[]; +extern char __start_ro_after_init[], __end_ro_after_init[]; extern char _end[]; extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0968d13b3885..f9f21e2c59f3 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -260,9 +260,9 @@ */ #ifndef RO_AFTER_INIT_DATA #define RO_AFTER_INIT_DATA \ - __start_data_ro_after_init = .; \ + __start_ro_after_init = .; \ *(.data..ro_after_init) \ - __end_data_ro_after_init = .; + __end_ro_after_init = .; #endif /* diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 26c874e90b12..20036d4f9f13 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1416,7 +1416,7 @@ static void kmemleak_scan(void) /* data/bss scanning */ scan_large_block(_sdata, _edata); scan_large_block(__bss_start, __bss_stop); - scan_large_block(__start_data_ro_after_init, __end_data_ro_after_init); + scan_large_block(__start_ro_after_init, __end_ro_after_init); #ifdef CONFIG_SMP /* per-cpu sections scanning */ -- cgit v1.2.3 From 4742a35d9de745e867405b4311e1aac412f0ace1 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 31 Mar 2017 15:12:01 -0700 Subject: hugetlbfs: initialize shared policy as part of inode allocation Any time after inode allocation, destroy_inode can be called. The hugetlbfs inode contains a shared_policy structure, and mpol_free_shared_policy is unconditionally called as part of hugetlbfs_destroy_inode. Initialize the policy as part of inode allocation so that any quick (error path) calls to destroy_inode will be handed an initialized policy. syzkaller fuzzer found this bug, that resulted in the following: BUG: KASAN: user-memory-access in atomic_inc include/asm-generic/atomic-instrumented.h:87 [inline] at addr 000000131730bd7a BUG: KASAN: user-memory-access in __lock_acquire+0x21a/0x3a80 kernel/locking/lockdep.c:3239 at addr 000000131730bd7a Write of size 4 by task syz-executor6/14086 CPU: 3 PID: 14086 Comm: syz-executor6 Not tainted 4.11.0-rc3+ #364 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: atomic_inc include/asm-generic/atomic-instrumented.h:87 [inline] __lock_acquire+0x21a/0x3a80 kernel/locking/lockdep.c:3239 lock_acquire+0x1ee/0x590 kernel/locking/lockdep.c:3762 __raw_write_lock include/linux/rwlock_api_smp.h:210 [inline] _raw_write_lock+0x33/0x50 kernel/locking/spinlock.c:295 mpol_free_shared_policy+0x43/0xb0 mm/mempolicy.c:2536 hugetlbfs_destroy_inode+0xca/0x120 fs/hugetlbfs/inode.c:952 alloc_inode+0x10d/0x180 fs/inode.c:216 new_inode_pseudo+0x69/0x190 fs/inode.c:889 new_inode+0x1c/0x40 fs/inode.c:918 hugetlbfs_get_inode+0x40/0x420 fs/hugetlbfs/inode.c:734 hugetlb_file_setup+0x329/0x9f0 fs/hugetlbfs/inode.c:1282 newseg+0x422/0xd30 ipc/shm.c:575 ipcget_new ipc/util.c:285 [inline] ipcget+0x21e/0x580 ipc/util.c:639 SYSC_shmget ipc/shm.c:673 [inline] SyS_shmget+0x158/0x230 ipc/shm.c:657 entry_SYSCALL_64_fastpath+0x1f/0xc2 Analysis provided by Tetsuo Handa Link: http://lkml.kernel.org/r/1490477850-7944-1-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Dmitry Vyukov Acked-by: Hillf Danton Cc: Tetsuo Handa Cc: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8f96461236f6..7163fe014b57 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -695,14 +695,11 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode = new_inode(sb); if (inode) { - struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode->i_mode = S_IFDIR | config->mode; inode->i_uid = config->uid; inode->i_gid = config->gid; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - info = HUGETLBFS_I(inode); - mpol_shared_policy_init(&info->policy, NULL); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -733,7 +730,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode = new_inode(sb); if (inode) { - struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, @@ -741,15 +737,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->private_data = resv_map; - info = HUGETLBFS_I(inode); - /* - * The policy is initialized here even if we are creating a - * private inode because initialization simply creates an - * an empty rb tree and calls rwlock_init(), later when we - * call mpol_free_shared_policy() it will just return because - * the rb tree will still be empty. - */ - mpol_shared_policy_init(&info->policy, NULL); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -937,6 +924,18 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } + + /* + * Any time after allocation, hugetlbfs_destroy_inode can be called + * for the inode. mpol_free_shared_policy is unconditionally called + * as part of hugetlbfs_destroy_inode. So, initialize policy here + * in case of a quick call to destroy. + * + * Note that the policy is initialized even if we are creating a + * private inode. This simplifies hugetlbfs_destroy_inode. + */ + mpol_shared_policy_init(&p->policy, NULL); + return &p->vfs_inode; } -- cgit v1.2.3 From b0845ce58379d11dcad4cdb6824a6410de260216 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 31 Mar 2017 15:12:04 -0700 Subject: kasan: report only the first error by default Disable kasan after the first report. There are several reasons for this: - Single bug quite often has multiple invalid memory accesses causing storm in the dmesg. - Write OOB access might corrupt metadata so the next report will print bogus alloc/free stacktraces. - Reports after the first easily could be not bugs by itself but just side effects of the first one. Given that multiple reports usually only do harm, it makes sense to disable kasan after the first one. If user wants to see all the reports, the boot-time parameter kasan_multi_shot must be used. [aryabinin@virtuozzo.com: wrote changelog and doc, added missing include] Link: http://lkml.kernel.org/r/20170323154416.30257-1-aryabinin@virtuozzo.com Signed-off-by: Mark Rutland Signed-off-by: Andrey Ryabinin Cc: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 6 +++++ include/linux/kasan.h | 3 +++ lib/test_kasan.c | 10 +++++++ mm/kasan/kasan.h | 5 ---- mm/kasan/report.c | 36 +++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2ba45caabada..facc20a3f962 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1725,6 +1725,12 @@ kernel and module base offset ASLR (Address Space Layout Randomization). + kasan_multi_shot + [KNL] Enforce KASAN (Kernel Address Sanitizer) to print + report on every invalid memory access. Without this + parameter KASAN will print report only for the first + invalid access. + keepinitrd [HW,ARM] kernelcore= [KNL,X86,IA-64,PPC] diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5734480c9590..a5c7046f26b4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -76,6 +76,9 @@ size_t ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } size_t kasan_metadata_size(struct kmem_cache *cache); +bool kasan_save_enable_multi_shot(void); +void kasan_restore_multi_shot(bool enabled); + #else /* CONFIG_KASAN */ static inline void kasan_unpoison_shadow(const void *address, size_t size) {} diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 0b1d3140fbb8..a25c9763fce1 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * Note: test functions are marked noinline so that their names appear in @@ -474,6 +475,12 @@ static noinline void __init use_after_scope_test(void) static int __init kmalloc_tests_init(void) { + /* + * Temporarily enable multi-shot mode. Otherwise, we'd only get a + * report for the first case. + */ + bool multishot = kasan_save_enable_multi_shot(); + kmalloc_oob_right(); kmalloc_oob_left(); kmalloc_node_oob_right(); @@ -499,6 +506,9 @@ static int __init kmalloc_tests_init(void) ksize_unpoisons_memory(); copy_user_test(); use_after_scope_test(); + + kasan_restore_multi_shot(multishot); + return -EAGAIN; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1c260e6b3b3c..dd2dea8eb077 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -96,11 +96,6 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool kasan_report_enabled(void) -{ - return !current->kasan_depth; -} - void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, diff --git a/mm/kasan/report.c b/mm/kasan/report.c index f479365530b6..ab42a0803f16 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,7 +13,9 @@ * */ +#include #include +#include #include #include #include @@ -293,6 +295,40 @@ static void kasan_report_error(struct kasan_access_info *info) kasan_end_report(&flags); } +static unsigned long kasan_flags; + +#define KASAN_BIT_REPORTED 0 +#define KASAN_BIT_MULTI_SHOT 1 + +bool kasan_save_enable_multi_shot(void) +{ + return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); + +void kasan_restore_multi_shot(bool enabled) +{ + if (!enabled) + clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); + +static int __init kasan_set_multi_shot(char *str) +{ + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; +} +__setup("kasan_multi_shot", kasan_set_multi_shot); + +static inline bool kasan_report_enabled(void) +{ + if (current->kasan_depth) + return false; + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); +} + void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { -- cgit v1.2.3 From ff8c0c53c47530ffea82c22a0a6df6332b56c957 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 31 Mar 2017 15:12:07 -0700 Subject: mm/hugetlb.c: don't call region_abort if region_chg fails Changes to hugetlbfs reservation maps is a two step process. The first step is a call to region_chg to determine what needs to be changed, and prepare that change. This should be followed by a call to call to region_add to commit the change, or region_abort to abort the change. The error path in hugetlb_reserve_pages called region_abort after a failed call to region_chg. As a result, the adds_in_progress counter in the reservation map is off by 1. This is caught by a VM_BUG_ON in resv_map_release when the reservation map is freed. syzkaller fuzzer (when using an injected kmalloc failure) found this bug, that resulted in the following: kernel BUG at mm/hugetlb.c:742! Call Trace: hugetlbfs_evict_inode+0x7b/0xa0 fs/hugetlbfs/inode.c:493 evict+0x481/0x920 fs/inode.c:553 iput_final fs/inode.c:1515 [inline] iput+0x62b/0xa20 fs/inode.c:1542 hugetlb_file_setup+0x593/0x9f0 fs/hugetlbfs/inode.c:1306 newseg+0x422/0xd30 ipc/shm.c:575 ipcget_new ipc/util.c:285 [inline] ipcget+0x21e/0x580 ipc/util.c:639 SYSC_shmget ipc/shm.c:673 [inline] SyS_shmget+0x158/0x230 ipc/shm.c:657 entry_SYSCALL_64_fastpath+0x1f/0xc2 RIP: resv_map_release+0x265/0x330 mm/hugetlb.c:742 Link: http://lkml.kernel.org/r/1490821682-23228-1-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Dmitry Vyukov Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f501f14f14ce..e5828875f7bb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4403,7 +4403,9 @@ int hugetlb_reserve_pages(struct inode *inode, return 0; out_err: if (!vma || vma->vm_flags & VM_MAYSHARE) - region_abort(resv_map, from, to); + /* Don't call region_abort if region_chg failed */ + if (chg >= 0) + region_abort(resv_map, from, to); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_put(&resv_map->refs, resv_map_release); return ret; -- cgit v1.2.3 From 4785603bd05b0b029c647080937674d9991600f9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 31 Mar 2017 15:12:10 -0700 Subject: drivers/rapidio/devices/tsi721.c: make module parameter variable name unique kbuild test robot reported a non-static variable name collision between a staging driver and a RapidIO driver, with a generic variable name of 'dbg_level'. Both drivers should be changed so that they don't use this generic public variable name. This patch fixes the RapidIO driver but does not change the user interface (name) for the module parameter. drivers/staging/built-in.o:(.bss+0x109d0): multiple definition of `dbg_level' drivers/rapidio/built-in.o:(.bss+0x16c): first defined here Link: http://lkml.kernel.org/r/ab527fc5-aa3c-4b07-5d48-eef5de703192@infradead.org Signed-off-by: Randy Dunlap Reported-by: kbuild test robot Cc: Greg Kroah-Hartman Cc: Matt Porter Cc: Alexandre Bounine Cc: Jérémy Lefaure Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/devices/tsi721.c | 4 ++-- drivers/rapidio/devices/tsi721.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/rapidio/devices/tsi721.c b/drivers/rapidio/devices/tsi721.c index 9d19b9a62011..315a4be8dc1e 100644 --- a/drivers/rapidio/devices/tsi721.c +++ b/drivers/rapidio/devices/tsi721.c @@ -37,8 +37,8 @@ #include "tsi721.h" #ifdef DEBUG -u32 dbg_level; -module_param(dbg_level, uint, S_IWUSR | S_IRUGO); +u32 tsi_dbg_level; +module_param_named(dbg_level, tsi_dbg_level, uint, S_IWUSR | S_IRUGO); MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)"); #endif diff --git a/drivers/rapidio/devices/tsi721.h b/drivers/rapidio/devices/tsi721.h index 5941437cbdd1..957eadc58150 100644 --- a/drivers/rapidio/devices/tsi721.h +++ b/drivers/rapidio/devices/tsi721.h @@ -40,11 +40,11 @@ enum { }; #ifdef DEBUG -extern u32 dbg_level; +extern u32 tsi_dbg_level; #define tsi_debug(level, dev, fmt, arg...) \ do { \ - if (DBG_##level & dbg_level) \ + if (DBG_##level & tsi_dbg_level) \ dev_dbg(dev, "%s: " fmt "\n", __func__, ##arg); \ } while (0) #else -- cgit v1.2.3 From 13a6798e4a03096b11bf402a063786a7be55d426 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 31 Mar 2017 15:12:12 -0700 Subject: kasan: do not sanitize kexec purgatory Fixes this: kexec: Undefined symbol: __asan_load8_noabort kexec-bzImage64: Loading purgatory failed Link: http://lkml.kernel.org/r/1489672155.4458.7.camel@gmx.de Signed-off-by: Mike Galbraith Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/purgatory/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 555b9fa0ad43..7dbdb780264d 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -8,6 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib targets += purgatory.ro +KASAN_SANITIZE := n KCOV_INSTRUMENT := n # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That -- cgit v1.2.3