From a83836dbc53e96f13fec248ecc201d18e1e3111d Mon Sep 17 00:00:00 2001
From: Libor Pechacek
Date: Fri, 31 Jan 2020 14:28:29 +0100
Subject: powerpc/pseries: Avoid NULL pointer dereference when drmem is unavailable

In guests without hotpluggable memory the drmem structure is only
zero-initialized. Trying to manipulate DLPAR parameters results in a crash.

$ echo "memory add count 1" > /sys/kernel/dlpar
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
...
NIP: c0000000000ff294 LR: c0000000000ff248 CTR: 0000000000000000
REGS: c0000000fb9d3880 TRAP: 0300 Tainted: G E (5.5.0-rc6-2-default)
MSR: 8000000000009033 CR: 28242428 XER: 20000000
CFAR: c0000000009a6c10 DAR: 0000000000000010 DSISR: 40000000 IRQMASK: 0
...
NIP dlpar_memory+0x6e4/0xd00
LR dlpar_memory+0x698/0xd00
Call Trace:
dlpar_memory+0x698/0xd00 (unreliable)
handle_dlpar_errorlog+0xc0/0x190
dlpar_store+0x198/0x4a0
kobj_attr_store+0x30/0x50
sysfs_kf_write+0x64/0x90
kernfs_fop_write+0x1b0/0x290
__vfs_write+0x3c/0x70
vfs_write+0xd0/0x260
ksys_write+0xdc/0x130
system_call+0x5c/0x68

Taking a closer look at the code, I can see that for_each_drmem_lmb is a
macro expanding into `for (lmb = &drmem_info->lmbs[0]; lmb <=
&drmem_info->lmbs[drmem_info->n_lmbs - 1]; lmb++)`. When
drmem_info->lmbs is NULL, the loop would iterate through the whole
address range if it weren't stopped by the NULL pointer dereference on
the next line.

This patch aligns for_each_drmem_lmb and for_each_drmem_lmb_in_range
macro behavior with the common C semantics, where the end marker does
not belong to the scanned range, and alters get_lmb_range() semantics.
As a side effect, the wraparound observed in the crash is prevented.

Fixes: 6c6ea53725b3 ("powerpc/mm: Separate ibm,dynamic-memory data from DT format")
Cc: stable@vger.kernel.org # v4.16+
Signed-off-by: Libor Pechacek
Signed-off-by: Michal Suchanek
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200131132829.10281-1-msuchanek@suse.de
---
 arch/powerpc/include/asm/drmem.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h
index 3d76e1c388c2..28c3d936fdf3 100644
--- a/arch/powerpc/include/asm/drmem.h
+++ b/arch/powerpc/include/asm/drmem.h
@@ -27,12 +27,12 @@ struct drmem_lmb_info {
 extern struct drmem_lmb_info *drmem_info;
 
 #define for_each_drmem_lmb_in_range(lmb, start, end)	\
-	for ((lmb) = (start); (lmb) <= (end); (lmb)++)
+	for ((lmb) = (start); (lmb) < (end); (lmb)++)
 
 #define for_each_drmem_lmb(lmb)				\
	for_each_drmem_lmb_in_range((lmb),		\
		&drmem_info->lmbs[0],			\
-		&drmem_info->lmbs[drmem_info->n_lmbs - 1])
+		&drmem_info->lmbs[drmem_info->n_lmbs])
 
 /*
  * The of_drconf_cell_v1 struct defines the layout of the LMB data
--
cgit v1.2.3


From 0b1c524caaae2428b20e714297243e5551251eb5 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 9 Jan 2020 08:25:25 +0000
Subject: powerpc/32: refactor pmd_offset(pud_offset(pgd_offset...

At several places the pmd pointer is retrieved through the same action:

	pmd = pmd_offset(pud_offset(pgd_offset(mm, addr), addr), addr);

or

	pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr);

Refactor this by implementing two helpers, pmd_ptr() and pmd_ptr_k().

This will help when adding the p4d level.
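As an illustration of that last point: once a p4d level is folded in,
only the two helpers need to grow the extra step, while all callers stay
unchanged (hypothetical sketch, not part of this patch):

	static inline pmd_t *pmd_ptr(struct mm_struct *mm, unsigned long va)
	{
		return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
	}

	static inline pmd_t *pmd_ptr_k(unsigned long va)
	{
		return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
	}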
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7b065c5be35726af4066cab238ee35cabceda1fa.1578558199.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/pgtable.h | 12 ++++++++++++ arch/powerpc/mm/book3s32/mmu.c | 2 +- arch/powerpc/mm/book3s32/tlb.c | 4 ++-- arch/powerpc/mm/kasan/kasan_init_32.c | 8 ++++---- arch/powerpc/mm/mem.c | 3 +-- arch/powerpc/mm/nohash/40x.c | 4 ++-- arch/powerpc/mm/pgtable_32.c | 2 +- 7 files changed, 23 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 8cc543ed114c..22bf7bb666a7 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -41,6 +41,18 @@ struct mm_struct; #ifndef __ASSEMBLY__ +#ifdef CONFIG_PPC32 +static inline pmd_t *pmd_ptr(struct mm_struct *mm, unsigned long va) +{ + return pmd_offset(pud_offset(pgd_offset(mm, va), va), va); +} + +static inline pmd_t *pmd_ptr_k(unsigned long va) +{ + return pmd_offset(pud_offset(pgd_offset_k(va), va), va); +} +#endif + #include /* Keep these as a macros to avoid include dependency mess */ diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 0a1c65a2c565..c225b0397ffd 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -312,7 +312,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea) if (!Hash) return; - pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); + pmd = pmd_ptr(mm, ea); if (!pmd_none(*pmd)) add_hash_page(mm->context.id, ea, pmd_val(*pmd)); } diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c index 724c0490fb17..dc9039a170aa 100644 --- a/arch/powerpc/mm/book3s32/tlb.c +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -90,7 +90,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start, if (start >= end) return; end = (end - 1) | ~PAGE_MASK; - pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start); + pmd = pmd_ptr(mm, start); for (;;) { pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; if (pmd_end > end) @@ -148,7 +148,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) return; } mm = (vmaddr < TASK_SIZE)? 
vma->vm_mm: &init_mm; - pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); + pmd = pmd_ptr(mm, vmaddr); if (!pmd_none(*pmd)) flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); } diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/kasan_init_32.c index 16dd95bd0749..b195d085970a 100644 --- a/arch/powerpc/mm/kasan/kasan_init_32.c +++ b/arch/powerpc/mm/kasan/kasan_init_32.c @@ -36,7 +36,7 @@ static int __init kasan_init_shadow_page_tables(unsigned long k_start, unsigned unsigned long k_cur, k_next; pte_t *new = NULL; - pmd = pmd_offset(pud_offset(pgd_offset_k(k_start), k_start), k_start); + pmd = pmd_ptr_k(k_start); for (k_cur = k_start; k_cur != k_end; k_cur = k_next, pmd++) { k_next = pgd_addr_end(k_cur, k_end); @@ -78,7 +78,7 @@ static int __init kasan_init_region(void *start, size_t size) block = memblock_alloc(k_end - k_start, PAGE_SIZE); for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { - pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pmd_t *pmd = pmd_ptr_k(k_cur); void *va = block + k_cur - k_start; pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); @@ -102,7 +102,7 @@ static void __init kasan_remap_early_shadow_ro(void) kasan_populate_pte(kasan_early_shadow_pte, prot); for (k_cur = k_start & PAGE_MASK; k_cur < k_end; k_cur += PAGE_SIZE) { - pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(k_cur), k_cur), k_cur); + pmd_t *pmd = pmd_ptr_k(k_cur); pte_t *ptep = pte_offset_kernel(pmd, k_cur); if ((pte_val(*ptep) & PTE_RPN_MASK) != pa) @@ -202,7 +202,7 @@ void __init kasan_early_init(void) unsigned long addr = KASAN_SHADOW_START; unsigned long end = KASAN_SHADOW_END; unsigned long next; - pmd_t *pmd = pmd_offset(pud_offset(pgd_offset_k(addr), addr), addr); + pmd_t *pmd = pmd_ptr_k(addr); BUILD_BUG_ON(KASAN_SHADOW_START & ~PGDIR_MASK); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ef7b1119b2e2..b7325bb4c890 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -69,8 +69,7 @@ EXPORT_SYMBOL(kmap_prot); static inline pte_t *virt_to_kpte(unsigned long vaddr) { - return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), - vaddr), vaddr), vaddr); + return pte_offset_kernel(pmd_ptr_k(vaddr), vaddr); } #endif diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c index f348104eb461..82862723ab42 100644 --- a/arch/powerpc/mm/nohash/40x.c +++ b/arch/powerpc/mm/nohash/40x.c @@ -104,7 +104,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) pmd_t *pmdp; unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; - pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + pmdp = pmd_ptr_k(v); *pmdp++ = __pmd(val); *pmdp++ = __pmd(val); *pmdp++ = __pmd(val); @@ -119,7 +119,7 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) pmd_t *pmdp; unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; - pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + pmdp = pmd_ptr_k(v); *pmdp = __pmd(val); v += LARGE_PAGE_SIZE_4M; diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 5fb90edd865e..d90c166bb6e5 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -63,7 +63,7 @@ int __ref map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot) int err = -ENOMEM; /* Use upper 10 bits of VA to index the first level map */ - pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va); + pd = pmd_ptr_k(va); /* Use middle 10 bits of VA to index the 
second-level map */ if (likely(slab_is_available())) pg = pte_alloc_kernel(pd, va); -- cgit v1.2.3 From 2efc7c085f05870eda6f29ac71eeb83f3bd54415 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 9 Jan 2020 08:25:26 +0000 Subject: powerpc/32: drop get_pteptr() Commit 8d30c14cab30 ("powerpc/mm: Rework I$/D$ coherency (v3)") and commit 90ac19a8b21b ("[POWERPC] Abolish iopa(), mm_ptov(), io_block_mapping() from arch/powerpc") removed the use of get_pteptr() outside of mm/pgtable_32.c In mm/pgtable_32.c, the only user of get_pteptr() is change_page_attr() which operates on kernel context and on lowmem pages only. Make virt_to_kpte() available outside of mm/mem.c and use it instead of get_pteptr(), and drop get_pteptr() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/788378c6c3ba5c5298caab7c7f95e6c3c88244b8.1578558199.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/pgtable.h | 5 +++++ arch/powerpc/mm/mem.c | 5 ----- arch/powerpc/mm/pgtable_32.c | 39 ++------------------------------------ 3 files changed, 7 insertions(+), 42 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 22bf7bb666a7..b80bfd41828d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -51,6 +51,11 @@ static inline pmd_t *pmd_ptr_k(unsigned long va) { return pmd_offset(pud_offset(pgd_offset_k(va), va), va); } + +static inline pte_t *virt_to_kpte(unsigned long vaddr) +{ + return pte_offset_kernel(pmd_ptr_k(vaddr), vaddr); +} #endif #include diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index b7325bb4c890..4f852d2a62f5 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -66,11 +66,6 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); - -static inline pte_t *virt_to_kpte(unsigned long vaddr) -{ - return pte_offset_kernel(pmd_ptr_k(vaddr), vaddr); -} #endif pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index d90c166bb6e5..f62de06e3d07 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -121,44 +121,9 @@ void __init mapin_ram(void) } } -/* Scan the real Linux page tables and return a PTE pointer for - * a virtual address in a context. - * Returns true (1) if PTE was found, zero otherwise. The pointer to - * the PTE pointer is unmodified if PTE is not found. 
- */ -static int -get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int retval = 0; - - pgd = pgd_offset(mm, addr & PAGE_MASK); - if (pgd) { - pud = pud_offset(pgd, addr & PAGE_MASK); - if (pud && pud_present(*pud)) { - pmd = pmd_offset(pud, addr & PAGE_MASK); - if (pmd_present(*pmd)) { - pte = pte_offset_map(pmd, addr & PAGE_MASK); - if (pte) { - retval = 1; - *ptep = pte; - if (pmdp) - *pmdp = pmd; - /* XXX caller needs to do pte_unmap, yuck */ - } - } - } - } - return(retval); -} - static int __change_page_attr_noflush(struct page *page, pgprot_t prot) { pte_t *kpte; - pmd_t *kpmd; unsigned long address; BUG_ON(PageHighMem(page)); @@ -166,10 +131,10 @@ static int __change_page_attr_noflush(struct page *page, pgprot_t prot) if (v_block_mapped(address)) return 0; - if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) + kpte = virt_to_kpte(address); + if (!kpte) return -EINVAL; __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0); - pte_unmap(kpte); return 0; } -- cgit v1.2.3 From c4fd527f52ecb135018655c7f56f87800872c5bc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 9 Feb 2020 11:58:57 +0100 Subject: powerpc/kvm: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Because of this cleanup, we get to remove a few fields in struct kvm_arch that are now unused. Signed-off-by: Greg Kroah-Hartman [mpe: Fix build error in kvm/timing.c, adapt kvmppc_remove_cpu_debugfs()] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200209105901.1620958-2-gregkh@linuxfoundation.org --- arch/powerpc/include/asm/kvm_host.h | 3 --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 5 ++--- arch/powerpc/kvm/book3s_64_mmu_radix.c | 5 ++--- arch/powerpc/kvm/book3s_hv.c | 9 ++------- arch/powerpc/kvm/timing.c | 17 ++++------------- 5 files changed, 10 insertions(+), 29 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 6e8b8ffd06ad..877f8aa2bc1e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -308,8 +308,6 @@ struct kvm_arch { pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; - struct dentry *htab_dentry; - struct dentry *radix_dentry; struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */ #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -830,7 +828,6 @@ struct kvm_vcpu_arch { struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */ struct dentry *debugfs_dir; - struct dentry *debugfs_timings; #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ }; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6c372f5c61b6..8b4eac0c9dcd 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -2138,9 +2138,8 @@ static const struct file_operations debugfs_htab_fops = { void kvmppc_mmu_debugfs_init(struct kvm *kvm) { - kvm->arch.htab_dentry = debugfs_create_file("htab", 0400, - kvm->arch.debugfs_dir, kvm, - &debugfs_htab_fops); + debugfs_create_file("htab", 0400, kvm->arch.debugfs_dir, kvm, + &debugfs_htab_fops); } void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 803940d79b73..1d75ed684b53 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -1376,9 +1376,8 @@ static const struct file_operations debugfs_radix_fops = {
 
 void kvmhv_radix_debugfs_init(struct kvm *kvm)
 {
-	kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
-						     kvm->arch.debugfs_dir, kvm,
-						     &debugfs_radix_fops);
+	debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
+			    &debugfs_radix_fops);
 }
 
 int kvmppc_radix_init(void)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2cefd071b848..33be4d93248a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2258,14 +2258,9 @@ static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
 	struct kvm *kvm = vcpu->kvm;
 
 	snprintf(buf, sizeof(buf), "vcpu%u", id);
-	if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
-		return;
 	vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
-	if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
-		return;
-	vcpu->arch.debugfs_timings =
-		debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
-				    vcpu, &debugfs_timings_ops);
+	debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, vcpu,
+			    &debugfs_timings_ops);
 }
 
 #else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
index bfe4f106cffc..ba56a5cbba97 100644
--- a/arch/powerpc/kvm/timing.c
+++ b/arch/powerpc/kvm/timing.c
@@ -211,23 +211,14 @@ void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id)
 	snprintf(dbg_fname, sizeof(dbg_fname), "vm%u_vcpu%u_timing",
		 current->pid, id);
-	debugfs_file = debugfs_create_file(dbg_fname, 0666,
-					   kvm_debugfs_dir, vcpu,
-					   &kvmppc_exit_timing_fops);
-
-	if (!debugfs_file) {
-		printk(KERN_ERR"%s: error creating debugfs file %s\n",
-		       __func__, dbg_fname);
-		return;
-	}
+	debugfs_file = debugfs_create_file(dbg_fname, 0666, kvm_debugfs_dir,
+					   vcpu, &kvmppc_exit_timing_fops);
 
 	vcpu->arch.debugfs_exit_timing = debugfs_file;
 }
 
 void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.debugfs_exit_timing) {
-		debugfs_remove(vcpu->arch.debugfs_exit_timing);
-		vcpu->arch.debugfs_exit_timing = NULL;
-	}
+	debugfs_remove(vcpu->arch.debugfs_exit_timing);
+	vcpu->arch.debugfs_exit_timing = NULL;
 }
--
cgit v1.2.3


From 6453f9ed9d4e4b4cdf201bf34bf460c436bf50ea Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 17 Feb 2020 09:41:35 +0000
Subject: powerpc/mm: Don't kmap_atomic() in pte_offset_map() on PPC32

On PPC32, pte_offset_map() does a kmap_atomic() in order to support
page tables allocated in high memory, just like ARM and x86/32.

But since at least 2008 and commit 8054a3428fbe ("powerpc: Remove dead
CONFIG_HIGHPTE"), page tables are never allocated in high memory.

When the page is in low mem, kmap_atomic() just returns the page
address but still disables preemption and page faults. And it is not an
inlined function, so we suffer a function call for no reason.

Make pte_offset_map() the same as pte_offset_kernel() and make
pte_unmap() void, in the same way as PPC64, which doesn't have HIGHMEM.
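For reference, on a lowmem page the kmap_atomic() path reduces to
roughly the following (simplified sketch, not the exact source; the
highmem stand-in do_highmem_mapping() is hypothetical):

	void *kmap_atomic(struct page *page)
	{
		preempt_disable();
		pagefault_disable();
		if (!PageHighMem(page))
			return page_address(page);	/* lowmem: nothing to map */

		return do_highmem_mapping(page);	/* hypothetical highmem slow path,
							 * never taken for PPC32 page tables */
	}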
Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/03c97f0f6b3790d164822563be80f2fd4713a955.1581932480.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/book3s/32/pgtable.h | 6 ++----
 arch/powerpc/include/asm/nohash/32/pgtable.h | 6 ++----
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 5b39c11e884a..7549393c4c43 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -366,10 +366,8 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
	(((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 #define pte_offset_kernel(dir, addr)	\
	((pte_t *) pmd_page_vaddr(*(dir)) + pte_index(addr))
-#define pte_offset_map(dir, addr)		\
-	((pte_t *)(kmap_atomic(pmd_page(*(dir))) + \
-		   (pmd_page_vaddr(*(dir)) & ~PAGE_MASK)) + pte_index(addr))
-#define pte_unmap(pte)		kunmap_atomic(pte)
+#define pte_offset_map(dir, addr)	pte_offset_kernel((dir), (addr))
+static inline void pte_unmap(pte_t *pte) { }
 
 /*
  * Encode and decode a swap entry.
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 60c4d829152e..b04ba257fddb 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -372,10 +372,8 @@ static inline int pte_young(pte_t pte)
 #define pte_offset_kernel(dir, addr)	\
	(pmd_bad(*(dir)) ? NULL : (pte_t *)pmd_page_vaddr(*(dir)) + \
				  pte_index(addr))
-#define pte_offset_map(dir, addr)		\
-	((pte_t *)(kmap_atomic(pmd_page(*(dir))) + \
-		   (pmd_page_vaddr(*(dir)) & ~PAGE_MASK)) + pte_index(addr))
-#define pte_unmap(pte)		kunmap_atomic(pte)
+#define pte_offset_map(dir, addr)	pte_offset_kernel((dir), (addr))
+static inline void pte_unmap(pte_t *pte) { }
 
 /*
  * Encode and decode a swap entry.
--
cgit v1.2.3


From 672e480aa21023fc8e4b6ab8635d8898822b97e7 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran
Date: Mon, 17 Feb 2020 13:48:33 +1100
Subject: powerpc/powernv: Add explicit fast-reboot support

Add a way to manually invoke a fast-reboot rather than setting the NVRAM
flag. The idea is to allow userspace to invoke a fast-reboot using the
optional string argument to the reboot() system call, or using the xmon
zr command, so we don't need to leave persistent changes on a system to
use the feature.
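As an illustration (not from this patch), a minimal userspace trigger
could look like this, using the raw reboot(2) syscall (requires
CAP_SYS_BOOT):

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/reboot.h>

	int main(void)
	{
		/* LINUX_REBOOT_CMD_RESTART2 passes the string down to the
		 * platform restart hook, pnv_restart() on powernv. */
		return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
			       LINUX_REBOOT_MAGIC2,
			       LINUX_REBOOT_CMD_RESTART2, "fast");
	}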
Signed-off-by: Oliver O'Halloran
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200217024833.30580-2-oohall@gmail.com
---
 arch/powerpc/include/asm/opal-api.h    | 1 +
 arch/powerpc/platforms/powernv/setup.c | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index c1f25a760eb1..1dffa3cb16ba 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -1067,6 +1067,7 @@ enum {
	OPAL_REBOOT_PLATFORM_ERROR	= 1,
	OPAL_REBOOT_FULL_IPL		= 2,
	OPAL_REBOOT_MPIPL		= 3,
+	OPAL_REBOOT_FAST		= 4,
 };
 
 /* Argument to OPAL_PCI_TCE_KILL */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index a8fe630cf7cc..3bc188da82ba 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -237,6 +237,8 @@ static void __noreturn pnv_restart(char *cmd)
			rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, NULL);
		else if (strcmp(cmd, "error") == 0)
			rc = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, NULL);
+		else if (strcmp(cmd, "fast") == 0)
+			rc = opal_cec_reboot2(OPAL_REBOOT_FAST, NULL);
		else
			rc = OPAL_UNSUPPORTED;
 
--
cgit v1.2.3


From 3d13e839e801e081bdece0127c2affa33d0f77cf Mon Sep 17 00:00:00 2001
From: Michael Ellerman
Date: Thu, 20 Feb 2020 22:51:37 +1100
Subject: powerpc: Rename current_stack_pointer() to current_stack_frame()

current_stack_pointer(), which was called __get_SP(), used to just
return the value in r1.

But that caused problems in some cases, so it was turned into a
function in commit bfe9a2cfe91a ("powerpc: Reimplement __get_SP() as a
function not a define").

Because it's a function in a separate compilation unit from all its
callers, it has the effect of causing a stack frame to be created, and
then returns the address of that frame. This is good in some cases like
those described in the above commit, but in other cases it's overkill;
we just need to know what stack page we're on.

On some other arches, current_stack_pointer is just a register global
giving the stack pointer, and we'd like to do that too. So rename our
current_stack_pointer() to current_stack_frame() to make that possible.
Signed-off-by: Michael Ellerman Reviewed-by: Christophe Leroy Link: https://lore.kernel.org/r/20200220115141.2707-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/perf_event.h | 2 +- arch/powerpc/include/asm/reg.h | 2 +- arch/powerpc/kernel/irq.c | 4 ++-- arch/powerpc/kernel/misc.S | 4 ++-- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/stacktrace.c | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h index 7426d7a90e1e..eed3954082fa 100644 --- a/arch/powerpc/include/asm/perf_event.h +++ b/arch/powerpc/include/asm/perf_event.h @@ -32,7 +32,7 @@ do { \ (regs)->result = 0; \ (regs)->nip = __ip; \ - (regs)->gpr[1] = current_stack_pointer(); \ + (regs)->gpr[1] = current_stack_frame(); \ asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ } while (0) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 1aa46dff0957..1b1ffdba6097 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1448,7 +1448,7 @@ static inline void mtsrin(u32 val, u32 idx) #define proc_trap() asm volatile("trap") -extern unsigned long current_stack_pointer(void); +extern unsigned long current_stack_frame(void); extern unsigned long scom970_read(unsigned int address); extern void scom970_write(unsigned int address, unsigned long value); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c9b11878555..02118c18434d 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -602,7 +602,7 @@ static inline void check_stack_overflow(void) #ifdef CONFIG_DEBUG_STACKOVERFLOW long sp; - sp = current_stack_pointer() & (THREAD_SIZE-1); + sp = current_stack_frame() & (THREAD_SIZE-1); /* check for stack overflow: is there less than 2KB free? 
*/ if (unlikely(sp < 2048)) { @@ -647,7 +647,7 @@ void do_IRQ(struct pt_regs *regs) void *cursp, *irqsp, *sirqsp; /* Switch to the irq stack to handle this */ - cursp = (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + cursp = (void *)(current_stack_frame() & ~(THREAD_SIZE - 1)); irqsp = hardirq_ctx[raw_smp_processor_id()]; sirqsp = softirq_ctx[raw_smp_processor_id()]; diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 974f65f79a8e..65f9f731c229 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -110,7 +110,7 @@ _GLOBAL(longjmp) li r3, 1 blr -_GLOBAL(current_stack_pointer) +_GLOBAL(current_stack_frame) PPC_LL r3,0(r1) blr -EXPORT_SYMBOL(current_stack_pointer) +EXPORT_SYMBOL(current_stack_frame) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index e730b8e522b0..110db94cdf3c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2051,7 +2051,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) sp = (unsigned long) stack; if (sp == 0) { if (tsk == current) - sp = current_stack_pointer(); + sp = current_stack_frame(); else sp = tsk->thread.ksp; } diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index e2a46cfed5fd..c477b8585a29 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -57,7 +57,7 @@ void save_stack_trace(struct stack_trace *trace) { unsigned long sp; - sp = current_stack_pointer(); + sp = current_stack_frame(); save_context_stack(trace, sp, current, 1); } @@ -71,7 +71,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) return; if (tsk == current) - sp = current_stack_pointer(); + sp = current_stack_frame(); else sp = tsk->thread.ksp; @@ -131,7 +131,7 @@ static int __save_stack_trace_tsk_reliable(struct task_struct *tsk, } if (tsk == current) - sp = current_stack_pointer(); + sp = current_stack_frame(); else sp = tsk->thread.ksp; -- cgit v1.2.3 From 0e63f0151719ee4cb90d85e60c98045099c995e2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 20 Feb 2020 22:51:38 +1100 Subject: powerpc: Add current_stack_pointer as a register global current_stack_frame() doesn't return the stack pointer, but the caller's stack frame. See commit bfe9a2cfe91a ("powerpc: Reimplement __get_SP() as a function not a define") and commit acf620ecf56c ("powerpc: Rename __get_SP() to current_stack_pointer()") for details. In some cases this is overkill or incorrect, as it doesn't return the current value of r1. So add a current_stack_pointer register global to get the value of r1 directly. 
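To illustrate the difference between the two (hypothetical caller, not
from this series):

	unsigned long frame, sp;

	frame = current_stack_frame();	/* address of the stack frame created for the call */
	sp = current_stack_pointer;	/* live contents of r1: no call, no extra frame */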
Signed-off-by: Christophe Leroy
[mpe: Split out of other patch, tweak change log]
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200220115141.2707-2-mpe@ellerman.id.au
---
 arch/powerpc/include/asm/reg.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 1b1ffdba6097..da5cab038e25 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1450,6 +1450,8 @@ static inline void mtsrin(u32 val, u32 idx)
 
 extern unsigned long current_stack_frame(void);
 
+register unsigned long current_stack_pointer asm("r1");
+
 extern unsigned long scom970_read(unsigned int address);
 extern void scom970_write(unsigned int address, unsigned long value);
 
--
cgit v1.2.3


From a05f0e5be4e81e4977d3f92aaf7688ee0cb7d5db Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Wed, 29 Jan 2020 19:21:21 +0530
Subject: powerpc/smp: Use nid as fallback for package_id

package_id is used to match cores that are part of the same chip. On
PowerNV machines, package_id defaults to chip_id. However, the
ibm,chip-id property is not present in the device tree of PowerVM
LPARs. Hence lscpu output shows one core per socket and multiple
sockets.

To overcome this, use nid as the package_id on PowerVM LPARs.

Before the patch:

Architecture:        ppc64le
Byte Order:          Little Endian
CPU(s):              128
On-line CPU(s) list: 0-127
Thread(s) per core:  8
Core(s) per socket:  1    <----------------------
Socket(s):           16   <----------------------
NUMA node(s):        2
Model:               2.2 (pvr 004e 0202)
Model name:          POWER9 (architected), altivec supported
Hypervisor vendor:   pHyp
Virtualization type: para
L1d cache:           32K
L1i cache:           32K
L2 cache:            512K
L3 cache:            10240K
NUMA node0 CPU(s):   0-63
NUMA node1 CPU(s):   64-127
#
# cat /sys/devices/system/cpu/cpu0/topology/physical_package_id
-1

After the patch:

Architecture:        ppc64le
Byte Order:          Little Endian
CPU(s):              128
On-line CPU(s) list: 0-127
Thread(s) per core:  8
Core(s) per socket:  8    <---------------------
Socket(s):           2    <---------------------
NUMA node(s):        2
Model:               2.2 (pvr 004e 0202)
Model name:          POWER9 (architected), altivec supported
Hypervisor vendor:   pHyp
Virtualization type: para
L1d cache:           32K
L1i cache:           32K
L2 cache:            512K
L3 cache:            10240K
NUMA node0 CPU(s):   0-63
NUMA node1 CPU(s):   64-127
#
# cat /sys/devices/system/cpu/cpu0/topology/physical_package_id
0

Now lscpu output is more in line with the system configuration.
Signed-off-by: Srikar Dronamraju
[mpe: Use pkg_id instead of ppid, tweak change log and comment]
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200129135121.24617-1-srikar@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/topology.h |  6 ++++++
 arch/powerpc/kernel/smp.c           | 30 +++++++++++++++++++++++++++---
 2 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 2f7e1ea5089e..e2e1ccd4a18d 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -134,7 +134,13 @@ static inline void shared_proc_topology_init(void) {}
 #ifdef CONFIG_PPC64
 #include 
 
+#ifdef CONFIG_PPC_SPLPAR
+int get_physical_package_id(int cpu);
+#define topology_physical_package_id(cpu) (get_physical_package_id(cpu))
+#else
 #define topology_physical_package_id(cpu)	(cpu_to_chip_id(cpu))
+#endif
+
 #define topology_sibling_cpumask(cpu)	(per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu)	(per_cpu(cpu_core_map, cpu))
 #define topology_core_id(cpu)		(cpu_to_core_id(cpu))
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ea6adbf6a221..f68cde82bdf3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1185,10 +1185,34 @@ static inline void add_cpu_to_smallcore_masks(int cpu)
	}
 }
 
+int get_physical_package_id(int cpu)
+{
+	int pkg_id = cpu_to_chip_id(cpu);
+
+#ifdef CONFIG_PPC_SPLPAR
+	/*
+	 * If the platform is PowerNV or Guest on KVM, ibm,chip-id is
+	 * defined. Hence we would return the chip-id as the result of
+	 * get_physical_package_id.
+	 */
+	if (pkg_id == -1 && firmware_has_feature(FW_FEATURE_LPAR)) {
+		struct device_node *np = of_get_cpu_node(cpu, NULL);
+
+		if (np) {
+			pkg_id = of_node_to_nid(np);
+			of_node_put(np);
+		}
+	}
+#endif /* CONFIG_PPC_SPLPAR */
+
+	return pkg_id;
+}
+EXPORT_SYMBOL_GPL(get_physical_package_id);
+
 static void add_cpu_to_masks(int cpu)
 {
	int first_thread = cpu_first_thread_sibling(cpu);
-	int chipid = cpu_to_chip_id(cpu);
+	int pkg_id = get_physical_package_id(cpu);
	int i;
 
	/*
@@ -1217,11 +1241,11 @@ static void add_cpu_to_masks(int cpu)
	for_each_cpu(i, cpu_l2_cache_mask(cpu))
		set_cpus_related(cpu, i, cpu_core_mask);
 
-	if (chipid == -1)
+	if (pkg_id == -1)
		return;
 
	for_each_cpu(i, cpu_online_mask)
-		if (cpu_to_chip_id(i) == chipid)
+		if (get_physical_package_id(i) == pkg_id)
			set_cpus_related(cpu, i, cpu_core_mask);
 }
 
--
cgit v1.2.3


From 247257b03b04398ca07da4bce3d17bee25d623cb Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Wed, 29 Jan 2020 19:23:01 +0530
Subject: powerpc/numa: Remove late request for home node associativity

With commit ("powerpc/numa: Early request for home node associativity")
in place, commit 2ea626306810 ("powerpc/topology: Get topology for
shared processors at boot"), which was requesting home node
associativity, becomes redundant.

Hence remove the late request for home node associativity.
Signed-off-by: Srikar Dronamraju
Reported-by: Abdul Haleem
Reviewed-by: Nathan Lynch
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200129135301.24739-6-srikar@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/topology.h | 4 ----
 arch/powerpc/kernel/smp.c           | 5 -----
 arch/powerpc/mm/numa.c              | 9 ---------
 3 files changed, 18 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index e2e1ccd4a18d..2db7ba789720 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -98,7 +98,6 @@ extern int stop_topology_update(void);
 extern int prrn_is_enabled(void);
 extern int find_and_online_cpu_nid(int cpu);
 extern int timed_topology_update(int nsecs);
-extern void __init shared_proc_topology_init(void);
 #else
 static inline int start_topology_update(void)
 {
@@ -121,9 +120,6 @@ static inline int timed_topology_update(int nsecs)
	return 0;
 }
 
-#ifdef CONFIG_SMP
-static inline void shared_proc_topology_init(void) {}
-#endif
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index f68cde82bdf3..37c12e3bab9e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1383,11 +1383,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
	if (smp_ops && smp_ops->bringup_done)
		smp_ops->bringup_done();
 
-	/*
-	 * On a shared LPAR, associativity needs to be requested.
-	 * Hence, get numa topology before dumping cpu topology
-	 */
-	shared_proc_topology_init();
	dump_numa_cpu_topology();
 
 #ifdef CONFIG_SCHED_SMT
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 5a8abf0165d7..9fcf2d195830 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1632,15 +1632,6 @@ int prrn_is_enabled(void)
	return prrn_enabled;
 }
 
-void __init shared_proc_topology_init(void)
-{
-	if (lppaca_shared_proc(get_lppaca())) {
-		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
-			    nr_cpumask_bits);
-		numa_update_cpu_topology(false);
-	}
-}
-
 static int topology_read(struct seq_file *file, void *v)
 {
	if (vphn_enabled || prrn_enabled)
--
cgit v1.2.3


From cc6f0e39000900e5dd1448103a9571f0eccd7d4e Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Sat, 7 Mar 2020 10:09:15 +0000
Subject: powerpc/32: Fix missing NULL pmd check in virt_to_kpte()

Commit 2efc7c085f05 ("powerpc/32: drop get_pteptr()") replaced
get_pteptr() by virt_to_kpte(). But virt_to_kpte() lacks a NULL pmd
check and returns an invalid non-NULL pointer when there is no page
table.

Reported-by: Nick Desaulniers
Fixes: 2efc7c085f05 ("powerpc/32: drop get_pteptr()")
Signed-off-by: Christophe Leroy
Tested-by: Nathan Chancellor
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/b1177cdfc6af74a3e277bba5d9e708c4b3315ebe.1583575707.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/pgtable.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index b80bfd41828d..b1f1d5339735 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -54,7 +54,9 @@ static inline pmd_t *pmd_ptr_k(unsigned long va)
 
 static inline pte_t *virt_to_kpte(unsigned long vaddr)
 {
-	return pte_offset_kernel(pmd_ptr_k(vaddr), vaddr);
+	pmd_t *pmd = pmd_ptr_k(vaddr);
+
+	return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
 }
 #endif
 
--
cgit v1.2.3


From ffd3eaf178b0f616a071e510e289d937330b0b35 Mon Sep 17 00:00:00 2001
From: Joe Lawrence
Date: Mon, 24 Feb 2020 16:18:48 -0500
Subject: powerpc/vdso: remove deprecated VDS64_HAS_DESCRIPTORS references

The original 2005 patch that introduced the powerpc vdso, pre-git
("ppc64: Implement a vDSO and use it for signal trampoline") notes
that:

  ... symbols exposed by the vDSO aren't "normal" function symbols, apps
  can't be expected to link against them directly, the vDSO's are both
  seen as if they were linked at 0 and the symbols just contain offsets
  to the various functions.  This is done on purpose to avoid a
  relocation step (ppc64 functions normally have descriptors with abs
  addresses in them).  When glibc uses those functions, it's expected to
  use it's own trampolines that know how to reach them.

Despite that explanation, dead #ifdef VDS64_HAS_DESCRIPTORS code blocks
remain, providing alternate function definitions that set up function
descriptors.

Since VDS64_HAS_DESCRIPTORS has been unused for all these years, we
might as well finally remove it from the codebase.

Signed-off-by: Joe Lawrence
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200224211848.26087-1-joe.lawrence@redhat.com
---
 arch/powerpc/include/asm/vdso.h | 24 ------------------------
 arch/powerpc/kernel/vdso.c      |  5 -----
 2 files changed, 29 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index b5e1f8f8a05c..2ff884853f97 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -13,9 +13,6 @@
 
 #define VDSO_VERSION_STRING	LINUX_2.6.15
 
-/* Define if 64 bits VDSO has procedure descriptors */
-#undef VDS64_HAS_DESCRIPTORS
-
 #ifndef __ASSEMBLY__
 
 /* Offsets relative to thread->vdso_base */
@@ -28,25 +25,6 @@ int vdso_getcpu_init(void);
 #else /* __ASSEMBLY__ */
 
 #ifdef __VDSO64__
-#ifdef VDS64_HAS_DESCRIPTORS
-#define V_FUNCTION_BEGIN(name)		\
-	.globl name;			\
-	.section ".opd","a";		\
-	.align 3;			\
-	name:				\
-	.quad .name,.TOC.@tocbase,0;	\
-	.previous;			\
-	.globl .name;			\
-	.type .name,@function;		\
-	.name:				\
-
-#define V_FUNCTION_END(name)		\
-	.size .name,.-.name;
-
-#define V_LOCAL_FUNC(name) (.name)
-
-#else /* VDS64_HAS_DESCRIPTORS */
-
 #define V_FUNCTION_BEGIN(name)		\
	.globl name;			\
	name:				\
@@ -55,8 +33,6 @@ int vdso_getcpu_init(void);
	.size name,.-name;
 
 #define V_LOCAL_FUNC(name) (name)
-
-#endif /* VDS64_HAS_DESCRIPTORS */
 #endif /* __VDSO64__ */
 
 #ifdef __VDSO32__
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index b9a108411c0d..d3b77c15f9ce 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -391,12 +391,7 @@ static unsigned long __init find_function64(struct lib64_elfinfo *lib,
			symname);
		return 0;
	}
-#ifdef VDS64_HAS_DESCRIPTORS
-	return *((u64 *)(vdso64_kbase + sym->st_value -
-			 VDSO64_LBASE)) - VDSO64_LBASE;
-#else
	return sym->st_value - VDSO64_LBASE;
-#endif
 }
 
 static int __init vdso_do_func_patch64(struct lib32_elfinfo *v32,
--
cgit v1.2.3


From 697ece78f8f749aeea40f2711389901f0974017a Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Tue, 10 Mar 2020 17:29:12 +0000
Subject: powerpc/32s: reorder Linux PTE bits to better match Hash PTE bits.

Reorder Linux PTE bits to (almost) match Hash PTE bits.
RW Kernel : PP = 00
RO Kernel : PP = 00
RW User   : PP = 01
RO User   : PP = 11

So naturally, we should have

_PAGE_USER = 0x001
_PAGE_RW   = 0x002

Today 0x001 and 0x002 are _PAGE_PRESENT and _PAGE_HASHPTE, which both
are software-only bits.

Switch _PAGE_USER and _PAGE_PRESENT
Switch _PAGE_RW and _PAGE_HASHPTE

This allows removing a few insns.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/c4d6c18a7f8d9d3b899bc492f55fbc40ef38896a.1583861325.git.christophe.leroy@c-s.fr
---
 arch/powerpc/include/asm/book3s/32/hash.h |  8 ++++----
 arch/powerpc/kernel/head_32.S             |  9 +++------
 arch/powerpc/mm/book3s32/hash_low.S       | 14 ++++++--------
 3 files changed, 13 insertions(+), 18 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/book3s/32/hash.h b/arch/powerpc/include/asm/book3s/32/hash.h
index 2a0a467d2985..34a7215ae81e 100644
--- a/arch/powerpc/include/asm/book3s/32/hash.h
+++ b/arch/powerpc/include/asm/book3s/32/hash.h
@@ -17,9 +17,9 @@
  * updating the accessed and modified bits in the page table tree.
  */
 
-#define _PAGE_PRESENT	0x001	/* software: pte contains a translation */
-#define _PAGE_HASHPTE	0x002	/* hash_page has made an HPTE for this pte */
-#define _PAGE_USER	0x004	/* usermode access allowed */
+#define _PAGE_USER	0x001	/* usermode access allowed */
+#define _PAGE_RW	0x002	/* software: user write access allowed */
+#define _PAGE_PRESENT	0x004	/* software: pte contains a translation */
 #define _PAGE_GUARDED	0x008	/* G: prohibit speculative access */
 #define _PAGE_COHERENT	0x010	/* M: enforce memory coherence (SMP systems) */
 #define _PAGE_NO_CACHE	0x020	/* I: cache inhibit */
@@ -27,7 +27,7 @@
 #define _PAGE_DIRTY	0x080	/* C: page changed */
 #define _PAGE_ACCESSED	0x100	/* R: page referenced */
 #define _PAGE_EXEC	0x200	/* software: exec allowed */
-#define _PAGE_RW	0x400	/* software: user write access allowed */
+#define _PAGE_HASHPTE	0x400	/* hash_page has made an HPTE for this pte */
 #define _PAGE_SPECIAL	0x800	/* software: Special page */
 
 #ifdef CONFIG_PTE_64BIT
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 97c887950c3c..daaa153950c2 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -348,7 +348,7 @@ BEGIN_MMU_FTR_SECTION
	andis.	r0, r5, (DSISR_BAD_FAULT_32S | DSISR_DABRMATCH)@h
 #endif
	bne	handle_page_fault_tramp_2	/* if not, try to put a PTE */
-	rlwinm	r3, r5, 32 - 15, 21, 21		/* DSISR_STORE -> _PAGE_RW */
+	rlwinm	r3, r5, 32 - 24, 30, 30		/* DSISR_STORE -> _PAGE_RW */
	bl	hash_page
	b	handle_page_fault_tramp_1
 FTR_SECTION_ELSE
@@ -497,7 +497,6 @@ InstructionTLBMiss:
	andc.	r1,r1,r0		/* check access & ~permission */
	bne-	InstructionAddressInvalid /* return if access not permitted */
	/* Convert linux-style PTE to low word of PPC-style PTE */
-	rlwimi	r0,r0,32-2,31,31	/* _PAGE_USER -> PP lsb */
	ori	r1, r1, 0xe06		/* clear out reserved bits */
	andc	r1, r0, r1		/* PP = user? 1 : 0 */
 BEGIN_FTR_SECTION
@@ -565,9 +564,8 @@ DataLoadTLBMiss:
	 * we would need to update the pte atomically with lwarx/stwcx.
	 */
	/* Convert linux-style PTE to low word of PPC-style PTE */
-	rlwinm	r1,r0,32-9,30,30	/* _PAGE_RW -> PP msb */
-	rlwimi	r0,r0,32-1,30,30	/* _PAGE_USER -> PP msb */
-	rlwimi	r0,r0,32-1,31,31	/* _PAGE_USER -> PP lsb */
+	rlwinm	r1,r0,0,30,30		/* _PAGE_RW -> PP msb */
+	rlwimi	r0,r0,1,30,30		/* _PAGE_USER -> PP msb */
	ori	r1,r1,0xe04		/* clear out reserved bits */
	andc	r1,r0,r1		/* PP = user? rw? 1: 3: 0 */
 BEGIN_FTR_SECTION
@@ -645,7 +643,6 @@ DataStoreTLBMiss:
	 * we would need to update the pte atomically with lwarx/stwcx.
	 */
	/* Convert linux-style PTE to low word of PPC-style PTE */
-	rlwimi	r0,r0,32-2,31,31	/* _PAGE_USER -> PP lsb */
	li	r1,0xe06		/* clear out reserved bits & PP msb */
	andc	r1,r0,r1		/* PP = user? 1: 0 */
 BEGIN_FTR_SECTION
diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S
index 877d880890fe..6d236080cb1a 100644
--- a/arch/powerpc/mm/book3s32/hash_low.S
+++ b/arch/powerpc/mm/book3s32/hash_low.S
@@ -35,7 +35,7 @@ mmu_hash_lock:
 /*
  * Load a PTE into the hash table, if possible.
  * The address is in r4, and r3 contains an access flag:
- * _PAGE_RW (0x400) if a write.
+ * _PAGE_RW (0x002) if a write.
  * r9 contains the SRR1 value, from which we use the MSR_PR bit.
  * SPRG_THREAD contains the physical address of the current task's thread.
  *
@@ -69,7 +69,7 @@ _GLOBAL(hash_page)
	blt+	112f			/* assume user more likely */
	lis	r5, (swapper_pg_dir - PAGE_OFFSET)@ha	/* if kernel address, use */
	addi	r5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l	/* kernel page table */
-	rlwimi	r3,r9,32-12,29,29	/* MSR_PR -> _PAGE_USER */
+	rlwimi	r3,r9,32-14,31,31	/* MSR_PR -> _PAGE_USER */
 112:
 #ifndef CONFIG_PTE_64BIT
	rlwimi	r5,r4,12,20,29		/* insert top 10 bits of address */
@@ -94,7 +94,7 @@ _GLOBAL(hash_page)
 #else
	rlwimi	r8,r4,23,20,28		/* compute pte address */
 #endif
-	rlwinm	r0,r3,32-3,24,24	/* _PAGE_RW access -> _PAGE_DIRTY */
+	rlwinm	r0,r3,6,24,24		/* _PAGE_RW access -> _PAGE_DIRTY */
	ori	r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE
 
 /*
@@ -310,11 +310,9 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64)
 _GLOBAL(create_hpte)
	/* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */
-	rlwinm	r8,r5,32-9,30,30	/* _PAGE_RW -> PP msb */
	rlwinm	r0,r5,32-6,30,30	/* _PAGE_DIRTY -> PP msb */
-	and	r8,r8,r0		/* writable if _RW & _DIRTY */
-	rlwimi	r5,r5,32-1,30,30	/* _PAGE_USER -> PP msb */
-	rlwimi	r5,r5,32-2,31,31	/* _PAGE_USER -> PP lsb */
+	and	r8,r5,r0		/* writable if _RW & _DIRTY */
+	rlwimi	r5,r5,1,30,30		/* _PAGE_USER -> PP msb */
	ori	r8,r8,0xe04		/* clear out reserved bits */
	andc	r8,r5,r8		/* PP = user? (rw&dirty? 1: 3): 0 */
 BEGIN_FTR_SECTION
@@ -566,7 +564,7 @@ _GLOBAL(flush_hash_pages)
 33:	lwarx	r8,0,r5			/* fetch the pte flags word */
	andi.	r0,r8,_PAGE_HASHPTE
	beq	8f			/* done if HASHPTE is already clear */
-	rlwinm	r8,r8,0,31,29		/* clear HASHPTE bit */
+	rlwinm	r8,r8,0,~_PAGE_HASHPTE	/* clear HASHPTE bit */
	stwcx.	r8,0,r5			/* update the pte */
	bne-	33b
--
cgit v1.2.3


From 36b78402d97a3b9aeab136feb9b00d8647ec2c20 Mon Sep 17 00:00:00 2001
From: Aneesh Kumar K.V
Date: Fri, 13 Mar 2020 15:18:42 +0530
Subject: powerpc/hash64/devmap: Use H_PAGE_THP_HUGE when setting up huge devmap PTE entries

H_PAGE_THP_HUGE is used to differentiate between a THP hugepage and a
hugetlb hugepage entry. The difference is WRT how we handle a hash
fault on these addresses. A THP address enables MPSS in segments. We
want to manage devmap hugepage entries similarly to THP pt entries.
Hence use H_PAGE_THP_HUGE for devmap huge PTE entries.

With the current code, while handling a hash PTE fault, we do set
is_thp = true when finding huge devmap PTE entries. The current code
also does the below sequence when setting up huge devmap entries:

	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
	if (pfn_t_devmap(pfn))
		entry = pmd_mkdevmap(entry);

In that case we would find both H_PAGE_THP_HUGE and _PAGE_DEVMAP set
for huge devmap PTE entries. This results in a false positive error
like the one below.
kernel BUG at /home/kvaneesh/src/linux/mm/memory.c:4321!
Oops: Exception in kernel mode, sig: 5 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in:
CPU: 56 PID: 67996 Comm: t_mmap_dio Not tainted 5.6.0-rc4-59640-g371c804dedbc #128
....
NIP [c00000000044c9e4] __follow_pte_pmd+0x264/0x900
LR [c0000000005d45f8] dax_writeback_one+0x1a8/0x740
Call Trace:
str_spec.74809+0x22ffb4/0x2d116c (unreliable)
dax_writeback_one+0x1a8/0x740
dax_writeback_mapping_range+0x26c/0x700
ext4_dax_writepages+0x150/0x5a0
do_writepages+0x68/0x180
__filemap_fdatawrite_range+0x138/0x180
file_write_and_wait_range+0xa4/0x110
ext4_sync_file+0x370/0x6e0
vfs_fsync_range+0x70/0xf0
sys_msync+0x220/0x2e0
system_call+0x5c/0x68

This is because our pmd_trans_huge check doesn't exclude _PAGE_DEVMAP.

To make this all consistent, update pmd_mkdevmap() to set
H_PAGE_THP_HUGE, so that the pmd_trans_huge() check now correctly
excludes _PAGE_DEVMAP.

Fixes: ebd31197931d ("powerpc/mm: Add devmap support for ppc64")
Cc: stable@vger.kernel.org # v4.13+
Signed-off-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200313094842.351830-1-aneesh.kumar@linux.ibm.com
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  | 6 ++++++
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 8 +++++++-
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 4 +++-
 arch/powerpc/include/asm/book3s/64/radix.h    | 5 +++++
 4 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 8fd8599c9395..3f9ae3585ab9 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -156,6 +156,12 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 extern int hash__has_transparent_hugepage(void);
 #endif
 
+static inline pmd_t hash__pmd_mkdevmap(pmd_t pmd)
+{
+	BUG();
+	return pmd;
+}
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_4K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index d1d9177d9ebd..0729c034e56f 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -246,7 +246,7 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
  */
 static inline int hash__pmd_trans_huge(pmd_t pmd)
 {
-	return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) ==
+	return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP)) ==
		  (_PAGE_PTE | H_PAGE_THP_HUGE));
 }
 
@@ -272,6 +272,12 @@ extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
				       unsigned long addr, pmd_t *pmdp);
 extern int hash__has_transparent_hugepage(void);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static inline pmd_t hash__pmd_mkdevmap(pmd_t pmd)
+{
+	return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE | _PAGE_DEVMAP));
+}
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_BOOK3S_64_HASH_64K_H */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 201a69e6a355..368b136517e0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1303,7 +1303,9 @@ extern void serialize_against_pte_lookup(struct mm_struct *mm);
 
 static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 {
-	return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
+	if (radix_enabled())
+		return radix__pmd_mkdevmap(pmd);
+	return hash__pmd_mkdevmap(pmd);
 }
 
 static inline int pmd_devmap(pmd_t pmd)
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..a1c60d5b50af 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -263,6 +263,11 @@ static inline int radix__has_transparent_hugepage(void)
 }
 #endif
 
+static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd)
+{
+	return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP));
+}
+
 extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
					     unsigned long page_size,
					     unsigned long phys);
--
cgit v1.2.3


From 8645aaa87963439007773ed8862ae6a29ea15eae Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran
Date: Fri, 6 Mar 2020 18:38:59 +1100
Subject: powerpc/eeh: Add sysfs files in late probe

Move creating the EEH-specific sysfs files into eeh_add_device_late()
rather than open-coding it all over the place. The call is generally
made immediately after calling eeh_add_device_late() anyway.

This is also a correctness fix since currently the sysfs files will be
added even if the EEH probe happens to fail.

Similarly, on pseries we currently add the sysfs files before calling
eeh_add_device_late(). This is flat-out broken since the sysfs files
require the pci_dev->dev.archdata.edev pointer to be set, and that is
done in eeh_add_device_late().

Reviewed-by: Sam Bobroff
Signed-off-by: Oliver O'Halloran
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200306073904.4737-1-oohall@gmail.com
---
 arch/powerpc/include/asm/eeh.h               |  3 ---
 arch/powerpc/kernel/eeh.c                    | 24 +-----------------------
 arch/powerpc/kernel/of_platform.c            |  3 ---
 arch/powerpc/kernel/pci-common.c             |  3 ---
 arch/powerpc/platforms/powernv/eeh-powernv.c |  1 -
 arch/powerpc/platforms/pseries/eeh_pseries.c |  3 +--
 6 files changed, 2 insertions(+), 35 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 6f9b2a12540a..5a349079057d 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -305,7 +305,6 @@ void eeh_add_device_early(struct pci_dn *);
 void eeh_add_device_tree_early(struct pci_dn *);
 void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
-void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
 int eeh_unfreeze_pe(struct eeh_pe *pe);
 int eeh_pe_reset_and_recover(struct eeh_pe *pe);
@@ -368,8 +367,6 @@ static inline void eeh_add_device_late(struct pci_dev *dev) { }
 
 static inline void eeh_add_device_tree_late(struct pci_bus *bus) { }
 
-static inline void eeh_add_sysfs_files(struct pci_bus *bus) { }
-
 static inline void eeh_remove_device(struct pci_dev *dev) { }
 
 #define EEH_POSSIBLE_ERROR(val, type) (0)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 17cb3e9b5697..087891214739 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1210,6 +1210,7 @@ void eeh_add_device_late(struct pci_dev *dev)
	dev->dev.archdata.edev = edev;
 
	eeh_addr_cache_insert_dev(dev);
+	eeh_sysfs_add_device(dev);
 }
 
 /**
@@ -1237,29 +1238,6 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
 }
 EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
 
-/**
- * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus
- * @bus: PCI bus
- *
- * This routine must be used to add EEH sysfs files for PCI
- * devices which are attached to the indicated PCI bus. The PCI bus
- * is added after system boot through hotplug or dlpar.
- */
-void eeh_add_sysfs_files(struct pci_bus *bus)
-{
-	struct pci_dev *dev;
-
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		eeh_sysfs_add_device(dev);
-		if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-			struct pci_bus *subbus = dev->subordinate;
-			if (subbus)
-				eeh_add_sysfs_files(subbus);
-		}
-	}
-}
-EXPORT_SYMBOL_GPL(eeh_add_sysfs_files);
-
 /**
  * eeh_remove_device - Undo EEH setup for the indicated pci device
  * @dev: pci device to be removed
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index 427fc22f72b6..cb6880092dd7 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -86,9 +86,6 @@ static int of_pci_phb_probe(struct platform_device *dev)
	/* Add probed PCI devices to the device model */
	pci_bus_add_devices(phb->bus);
 
-	/* sysfs files should only be added after devices are added */
-	eeh_add_sysfs_files(phb->bus);
-
	return 0;
 }
 
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index c6c03416a151..3d2b1cf30b80 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1404,9 +1404,6 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus)
 
	/* Add new devices to global lists.  Register in proc, sysfs. */
	pci_bus_add_devices(bus);
-
-	/* sysfs files should only be added after devices are added */
-	eeh_add_sysfs_files(bus);
 }
 EXPORT_SYMBOL_GPL(pcibios_finish_adding_to_bus);
 
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6f300ab7f0e9..ef727ecd99cd 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -48,7 +48,6 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
	dev_dbg(&pdev->dev, "EEH: Setting up device\n");
	eeh_add_device_early(pdn);
	eeh_add_device_late(pdev);
-	eeh_sysfs_add_device(pdev);
 }
 
 static int pnv_eeh_init(void)
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 893ba3f562c4..95bbf9102584 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -68,7 +68,6 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
	}
 #endif
	eeh_add_device_early(pdn);
-	eeh_add_device_late(pdev);
 #ifdef CONFIG_PCI_IOV
	if (pdev->is_virtfn) {
		struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
@@ -78,7 +77,7 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
		eeh_add_to_parent_pe(edev);   /* Add as VF PE type */
	}
 #endif
-	eeh_sysfs_add_device(pdev);
+	eeh_add_device_late(pdev);
 }
 
 /*
--
cgit v1.2.3


From 2d0953f7d5acc53a97d0dbfab8b1827e9897a7e6 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran
Date: Fri, 6 Mar 2020 18:39:00 +1100
Subject: powerpc/eeh: Remove eeh_add_device_tree_late()

On pseries and PowerNV pcibios_bus_add_device() calls
eeh_add_device_late(), so there's no need to do a separate tree
traversal to bind the eeh_dev and pci_dev together when setting up the
PHB at boot. As a result we can remove eeh_add_device_tree_late().
Reviewed-by: Sam Bobroff Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200306073904.4737-2-oohall@gmail.com --- arch/powerpc/include/asm/eeh.h | 3 --- arch/powerpc/kernel/eeh.c | 25 ------------------------- arch/powerpc/kernel/of_platform.c | 3 --- arch/powerpc/kernel/pci-common.c | 3 --- 4 files changed, 34 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 5a349079057d..5d1078166417 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -304,7 +304,6 @@ void eeh_addr_cache_init(void); void eeh_add_device_early(struct pci_dn *); void eeh_add_device_tree_early(struct pci_dn *); void eeh_add_device_late(struct pci_dev *); -void eeh_add_device_tree_late(struct pci_bus *); void eeh_remove_device(struct pci_dev *); int eeh_unfreeze_pe(struct eeh_pe *pe); int eeh_pe_reset_and_recover(struct eeh_pe *pe); @@ -365,8 +364,6 @@ static inline void eeh_add_device_tree_early(struct pci_dn *pdn) { } static inline void eeh_add_device_late(struct pci_dev *dev) { } -static inline void eeh_add_device_tree_late(struct pci_bus *bus) { } - static inline void eeh_remove_device(struct pci_dev *dev) { } #define EEH_POSSIBLE_ERROR(val, type) (0) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 087891214739..9cb33706ef80 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1213,31 +1213,6 @@ void eeh_add_device_late(struct pci_dev *dev) eeh_sysfs_add_device(dev); } -/** - * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus - * @bus: PCI bus - * - * This routine must be used to perform EEH initialization for PCI - * devices which are attached to the indicated PCI bus. The PCI bus - * is added after system boot through hotplug or dlpar. - */ -void eeh_add_device_tree_late(struct pci_bus *bus) -{ - struct pci_dev *dev; - - if (eeh_has_flag(EEH_FORCE_DISABLED)) - return; - list_for_each_entry(dev, &bus->devices, bus_list) { - eeh_add_device_late(dev); - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { - struct pci_bus *subbus = dev->subordinate; - if (subbus) - eeh_add_device_tree_late(subbus); - } - } -} -EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); - /** * eeh_remove_device - Undo EEH setup for the indicated pci device * @dev: pci device to be removed diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index cb6880092dd7..64edac81c633 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -80,9 +80,6 @@ static int of_pci_phb_probe(struct platform_device *dev) */ pcibios_claim_one_bus(phb->bus); - /* Finish EEH setup */ - eeh_add_device_tree_late(phb->bus); - /* Add probed PCI devices to the device model */ pci_bus_add_devices(phb->bus); diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 3d2b1cf30b80..8983afa6d62a 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1399,9 +1399,6 @@ void pcibios_finish_adding_to_bus(struct pci_bus *bus) pci_assign_unassigned_bus_resources(bus); } - /* Fixup EEH */ - eeh_add_device_tree_late(bus); - /* Add new devices to global lists. Register in proc, sysfs. 
*/ pci_bus_add_devices(bus); } -- cgit v1.2.3 From b6eebb093cad0feb56c717611ee0d2d7c66b4ec7 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 6 Mar 2020 18:39:03 +1100 Subject: powerpc/eeh: Make early EEH init pseries specific The eeh_ops->probe() function is called from two different contexts: 1. On pseries, where we set EEH_PROBE_MODE_DEVTREE, it's called in eeh_add_device_early() which is supposed to run before we create a pci_dev. 2. On PowerNV, where we set EEH_PROBE_MODE_DEV, it's called in eeh_add_device_late() which is supposed to run *after* the pci_dev is created. The "early" probe is required because PAPR requires that we perform an RTAS call to enable EEH support on a device before we start interacting with it via config space or MMIO. This requirement doesn't exist on PowerNV, and shoehorning two completely separate initialisation paths into a common interface just results in convoluted code everywhere. Additionally, the early probe requires the probe function to take a pci_dn rather than a pci_dev argument. We'd like to make pci_dn a pseries-specific data structure since there's no real requirement for them on PowerNV. To help both goals, move the early probe into the pseries containment zone so the platform dependence is more explicit. Reviewed-by: Sam Bobroff Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200306073904.4737-5-oohall@gmail.com --- arch/powerpc/include/asm/eeh.h | 14 +++--- arch/powerpc/kernel/eeh.c | 46 -------------------- arch/powerpc/kernel/of_platform.c | 6 +-- arch/powerpc/platforms/powernv/eeh-powernv.c | 6 --- arch/powerpc/platforms/pseries/eeh_pseries.c | 65 ++++++++++++++++++++++------ arch/powerpc/platforms/pseries/pci_dlpar.c | 2 +- drivers/pci/hotplug/rpadlpar_core.c | 2 +- drivers/pci/hotplug/rpaphp_core.c | 2 +- drivers/pci/hotplug/rpaphp_pci.c | 2 +- 9 files changed, 64 insertions(+), 81 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 5d1078166417..8580238e4852 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -301,8 +301,6 @@ int __exit eeh_ops_unregister(const char *name); int eeh_check_failure(const volatile void __iomem *token); int eeh_dev_check_failure(struct eeh_dev *edev); void eeh_addr_cache_init(void); -void eeh_add_device_early(struct pci_dn *); -void eeh_add_device_tree_early(struct pci_dn *); void eeh_add_device_late(struct pci_dev *); void eeh_remove_device(struct pci_dev *); int eeh_unfreeze_pe(struct eeh_pe *pe); @@ -358,10 +356,6 @@ static inline int eeh_check_failure(const volatile void __iomem *token) static inline void eeh_addr_cache_init(void) { } -static inline void eeh_add_device_early(struct pci_dn *pdn) { } - -static inline void eeh_add_device_tree_early(struct pci_dn *pdn) { } - static inline void eeh_add_device_late(struct pci_dev *dev) { } static inline void eeh_remove_device(struct pci_dev *dev) { } @@ -370,6 +364,14 @@ static inline void eeh_remove_device(struct pci_dev *dev) { } #define EEH_IO_ERROR_VALUE(size) (-1UL) #endif /* CONFIG_EEH */ +#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_EEH) +void pseries_eeh_init_edev(struct pci_dn *pdn); +void pseries_eeh_init_edev_recursive(struct pci_dn *pdn); +#else +static inline void pseries_eeh_init_edev(struct pci_dn *pdn) { } +static inline void pseries_eeh_init_edev_recursive(struct pci_dn *pdn) { } +#endif + #ifdef CONFIG_PPC64 /* * MMIO read/write operations with EEH
support. diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index a9e4ca7b5e09..55d3ef6e5b9c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1106,52 +1106,6 @@ static int eeh_init(void) core_initcall_sync(eeh_init); -/** - * eeh_add_device_early - Enable EEH for the indicated device node - * @pdn: PCI device node for which to set up EEH - * - * This routine must be used to perform EEH initialization for PCI - * devices that were added after system boot (e.g. hotplug, dlpar). - * This routine must be called before any i/o is performed to the - * adapter (inluding any config-space i/o). - * Whether this actually enables EEH or not for this device depends - * on the CEC architecture, type of the device, on earlier boot - * command-line arguments & etc. - */ -void eeh_add_device_early(struct pci_dn *pdn) -{ - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); - - if (!edev) - return; - - if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) - return; - - eeh_ops->probe(pdn, NULL); -} - -/** - * eeh_add_device_tree_early - Enable EEH for the indicated device - * @pdn: PCI device node - * - * This routine must be used to perform EEH initialization for the - * indicated PCI device that was added after system boot (e.g. - * hotplug, dlpar). - */ -void eeh_add_device_tree_early(struct pci_dn *pdn) -{ - struct pci_dn *n; - - if (!pdn) - return; - - list_for_each_entry(n, &pdn->child_list, list) - eeh_add_device_tree_early(n); - eeh_add_device_early(pdn); -} -EXPORT_SYMBOL_GPL(eeh_add_device_tree_early); - /** * eeh_add_device_late - Perform EEH initialization for the indicated pci device * @dev: pci device for which to set up EEH diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 64edac81c633..71a3f97dc988 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -62,13 +62,9 @@ static int of_pci_phb_probe(struct platform_device *dev) /* Init pci_dn data structures */ pci_devs_phb_init_dynamic(phb); - /* Create EEH devices for the PHB */ + /* Create EEH PEs for the PHB */ eeh_dev_phb_init_dynamic(phb); - /* Register devices with EEH */ - if (dev->dev.of_node->child) - eeh_add_device_tree_early(PCI_DN(dev->dev.of_node)); - /* Scan the bus */ pcibios_scan_phb(phb); if (phb->bus == NULL) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index ef727ecd99cd..eaa8dfefa124 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -40,13 +40,7 @@ static int eeh_event_irq = -EINVAL; void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { - struct pci_dn *pdn = pci_get_pdn(pdev); - - if (!pdn || eeh_has_flag(EEH_FORCE_DISABLED)) - return; - dev_dbg(&pdev->dev, "EEH: Setting up device\n"); - eeh_add_device_early(pdn); eeh_add_device_late(pdev); } diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 95bbf9102584..1ca7cf0fc0d8 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -67,7 +67,7 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev) pdn->pe_number = physfn_pdn->pe_num_map[pdn->vf_index]; } #endif - eeh_add_device_early(pdn); + pseries_eeh_init_edev(pdn); #ifdef CONFIG_PCI_IOV if (pdev->is_virtfn) { struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -221,15 +221,16 @@ static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap) } /** - * pseries_eeh_probe - EEH probe on the given 
device + * pseries_eeh_init_edev - initialise the eeh_dev and eeh_pe for a pci_dn + * * @pdn: PCI device node - * @data: Unused * - * When EEH module is installed during system boot, all PCI devices - * are checked one by one to see if it supports EEH. The function - * is introduced for the purpose. + * When we discover a new PCI device via the device-tree we create a + * corresponding pci_dn and we allocate, but don't initialise, an eeh_dev. + * This function takes care of the initialisation and inserts the eeh_dev + * into the correct eeh_pe. If no eeh_pe exists we'll allocate one. */ -static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) +void pseries_eeh_init_edev(struct pci_dn *pdn) { struct eeh_dev *edev; struct eeh_pe pe; @@ -237,18 +238,35 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) int enable = 0; int ret; - /* Retrieve OF node and eeh device */ + if (WARN_ON_ONCE(!eeh_has_flag(EEH_PROBE_MODE_DEVTREE))) + return; + + /* + * Find the eeh_dev for this pdn. The storage for the eeh_dev was + * allocated at the same time as the pci_dn. + * + * XXX: We should probably re-visit that. + */ edev = pdn_to_eeh_dev(pdn); - if (!edev || edev->pe) - return NULL; + if (!edev) + return; + + /* + * If ->pe is set then we've already probed this device. We hit + * this path when a pci_dev is removed and rescanned while recovering + * a PE (i.e. for devices where the driver doesn't support error + * recovery). + */ + if (edev->pe) + return; /* Check class/vendor/device IDs */ if (!pdn->vendor_id || !pdn->device_id || !pdn->class_code) - return NULL; + return; /* Skip for PCI-ISA bridge */ if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) - return NULL; + return; eeh_edev_dbg(edev, "Probing device\n"); @@ -315,9 +333,29 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data) /* Save memory bars */ eeh_save_bars(edev); +} + +/** + * pseries_eeh_init_edev_recursive - Enable EEH for the indicated device + * @pdn: PCI device node + * + * This routine must be used to perform EEH initialization for the + * indicated PCI device that was added after system boot (e.g. + * hotplug, dlpar). 
+ */ +void pseries_eeh_init_edev_recursive(struct pci_dn *pdn) +{ + struct pci_dn *n; + + if (!pdn) + return; + + list_for_each_entry(n, &pdn->child_list, list) + pseries_eeh_init_edev_recursive(n); - return NULL; + pseries_eeh_init_edev(pdn); } +EXPORT_SYMBOL_GPL(pseries_eeh_init_edev_recursive); /** * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable @@ -775,7 +813,6 @@ static int pseries_notify_resume(struct pci_dn *pdn) static struct eeh_ops pseries_eeh_ops = { .name = "pseries", .init = pseries_eeh_init, - .probe = pseries_eeh_probe, .set_option = pseries_eeh_set_option, .get_pe_addr = pseries_eeh_get_pe_addr, .get_state = pseries_eeh_get_state, diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 361986e4354e..b3a38f5a6b68 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -37,7 +37,7 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) eeh_dev_phb_init_dynamic(phb); if (dn->child) - eeh_add_device_tree_early(PCI_DN(dn)); + pseries_eeh_init_edev_recursive(PCI_DN(dn)); pcibios_scan_phb(phb); pcibios_finish_adding_to_bus(phb->bus); diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c index 977946e4e613..c5eb509c72f0 100644 --- a/drivers/pci/hotplug/rpadlpar_core.c +++ b/drivers/pci/hotplug/rpadlpar_core.c @@ -140,7 +140,7 @@ static void dlpar_pci_add_bus(struct device_node *dn) struct pci_controller *phb = pdn->phb; struct pci_dev *dev = NULL; - eeh_add_device_tree_early(pdn); + pseries_eeh_init_edev_recursive(pdn); /* Add EADS device to PHB bus, adding new entry to bus->devices */ dev = of_create_pci_dev(dn, phb->bus, pdn->devfn); diff --git a/drivers/pci/hotplug/rpaphp_core.c b/drivers/pci/hotplug/rpaphp_core.c index 392a936c17de..6504869efabc 100644 --- a/drivers/pci/hotplug/rpaphp_core.c +++ b/drivers/pci/hotplug/rpaphp_core.c @@ -493,7 +493,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot) return retval; if (state == PRESENT) { - eeh_add_device_tree_early(PCI_DN(slot->dn)); + pseries_eeh_init_edev_recursive(PCI_DN(slot->dn)); pci_lock_rescan_remove(); pci_hp_add_devices(slot->bus); diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c index 61ebbd832afb..c380bdacd146 100644 --- a/drivers/pci/hotplug/rpaphp_pci.c +++ b/drivers/pci/hotplug/rpaphp_pci.c @@ -96,7 +96,7 @@ int rpaphp_enable_slot(struct slot *slot) } if (list_empty(&bus->devices)) { - eeh_add_device_tree_early(PCI_DN(slot->dn)); + pseries_eeh_init_edev_recursive(PCI_DN(slot->dn)); pci_hp_add_devices(bus); } -- cgit v1.2.3 From e86350f70a02e5b4e26b0eccedb575a7490bc834 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Fri, 6 Mar 2020 18:39:04 +1100 Subject: powerpc/eeh: Rework eeh_ops->probe() With the EEH early probe now being pseries-specific, there's no need for eeh_ops->probe() to take a pci_dn. Instead, we can make it take a pci_dev and use the probe function to map a pci_dev to an eeh_dev. This allows the platform to implement its own method for finding (or creating) an eeh_dev for a given pci_dev, which also removes a use of pci_dn in generic EEH code. This patch also renames eeh_add_device_late() to eeh_probe_device(). This better reflects what it does and removes the last vestiges of the early/late EEH probe split.
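As a sketch of the interface change (abridged from the eeh.h hunk below), the ops hook goes from a pci_dn-based callback to a pci_dev-based probe that returns the bound eeh_dev:

	/* before: early/late split, keyed on the pci_dn */
	void *(*probe)(struct pci_dn *pdn, void *data);

	/* after: a single probe, keyed on the pci_dev */
	struct eeh_dev *(*probe)(struct pci_dev *pdev);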
Reviewed-by: Sam Bobroff Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200306073904.4737-6-oohall@gmail.com --- arch/powerpc/include/asm/eeh.h | 6 ++-- arch/powerpc/kernel/eeh.c | 44 +++++++++++++++------------- arch/powerpc/platforms/powernv/eeh-powernv.c | 30 +++++++++---------- arch/powerpc/platforms/pseries/eeh_pseries.c | 23 ++++++++++++++- 4 files changed, 62 insertions(+), 41 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 8580238e4852..964a54292b36 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -215,7 +215,7 @@ enum { struct eeh_ops { char *name; int (*init)(void); - void* (*probe)(struct pci_dn *pdn, void *data); + struct eeh_dev *(*probe)(struct pci_dev *pdev); int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); int (*get_state)(struct eeh_pe *pe, int *delay); @@ -301,7 +301,7 @@ int __exit eeh_ops_unregister(const char *name); int eeh_check_failure(const volatile void __iomem *token); int eeh_dev_check_failure(struct eeh_dev *edev); void eeh_addr_cache_init(void); -void eeh_add_device_late(struct pci_dev *); +void eeh_probe_device(struct pci_dev *pdev); void eeh_remove_device(struct pci_dev *); int eeh_unfreeze_pe(struct eeh_pe *pe); int eeh_pe_reset_and_recover(struct eeh_pe *pe); @@ -356,7 +356,7 @@ static inline int eeh_check_failure(const volatile void __iomem *token) static inline void eeh_addr_cache_init(void) { } -static inline void eeh_add_device_late(struct pci_dev *dev) { } +static inline void eeh_probe_device(struct pci_dev *dev) { } static inline void eeh_remove_device(struct pci_dev *dev) { } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 55d3ef6e5b9c..7cdcb413bb44 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1107,35 +1107,43 @@ static int eeh_init(void) core_initcall_sync(eeh_init); /** - * eeh_add_device_late - Perform EEH initialization for the indicated pci device + * eeh_probe_device() - Perform EEH initialization for the indicated pci device * @dev: pci device for which to set up EEH * * This routine must be used to complete EEH initialization for PCI * devices that were added after system boot (e.g. hotplug, dlpar). */ -void eeh_add_device_late(struct pci_dev *dev) +void eeh_probe_device(struct pci_dev *dev) { - struct pci_dn *pdn; struct eeh_dev *edev; - if (!dev) + pr_debug("EEH: Adding device %s\n", pci_name(dev)); + + /* + * pci_dev_to_eeh_dev() can only work if eeh_probe_dev() was + * already called for this device. + */ + if (WARN_ON_ONCE(pci_dev_to_eeh_dev(dev))) { + pci_dbg(dev, "Already bound to an eeh_dev!\n"); return; + } - pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); - edev = pdn_to_eeh_dev(pdn); - eeh_edev_dbg(edev, "Adding device\n"); - if (edev->pdev == dev) { - eeh_edev_dbg(edev, "Device already referenced!\n"); + edev = eeh_ops->probe(dev); + if (!edev) { + pr_debug("EEH: Adding device failed\n"); return; } /* - * The EEH cache might not be removed correctly because of - * unbalanced kref to the device during unplug time, which - * relies on pcibios_release_device(). So we have to remove - * that here explicitly. + * FIXME: We rely on pcibios_release_device() to remove the + * existing EEH state. The release function is only called if + * the pci_dev's refcount drops to zero so if something is + * keeping a ref to a device (e.g. 
a filesystem) we need to + * remove the old EEH state. + * + * FIXME: HEY MA, LOOK AT ME, NO LOCKING! */ - if (edev->pdev) { + if (edev->pdev && edev->pdev != dev) { eeh_rmv_from_parent_pe(edev); eeh_addr_cache_rmv_dev(edev->pdev); eeh_sysfs_remove_device(edev->pdev); @@ -1146,17 +1154,11 @@ void eeh_add_device_late(struct pci_dev *dev) * into error handler afterwards. */ edev->mode |= EEH_DEV_NO_HANDLER; - - edev->pdev = NULL; - dev->dev.archdata.edev = NULL; } - if (eeh_has_flag(EEH_PROBE_MODE_DEV)) - eeh_ops->probe(pdn, NULL); - + /* bind the pdev and the edev together */ edev->pdev = dev; dev->dev.archdata.edev = edev; - eeh_addr_cache_insert_dev(dev); eeh_sysfs_add_device(dev); } diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index eaa8dfefa124..79409e005fcd 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -41,7 +41,7 @@ static int eeh_event_irq = -EINVAL; void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { dev_dbg(&pdev->dev, "EEH: Setting up device\n"); - eeh_add_device_late(pdev); + eeh_probe_device(pdev); } static int pnv_eeh_init(void) @@ -340,23 +340,13 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap) /** * pnv_eeh_probe - Do probe on PCI device - * @pdn: PCI device node - * @data: unused + * @pdev: pci_dev to probe * - * When EEH module is installed during system boot, all PCI devices - * are checked one by one to see if it supports EEH. The function - * is introduced for the purpose. By default, EEH has been enabled - * on all PCI devices. That's to say, we only need do necessary - * initialization on the corresponding eeh device and create PE - * accordingly. - * - * It's notable that's unsafe to retrieve the EEH device through - * the corresponding PCI device. During the PCI device hotplug, which - * was possiblly triggered by EEH core, the binding between EEH device - * and the PCI device isn't built yet. + * Create, or find the existing, eeh_dev for this pci_dev. */ -static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) +static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev) { + struct pci_dn *pdn = pci_get_pdn(pdev); struct pci_controller *hose = pdn->phb; struct pnv_phb *phb = hose->private_data; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -373,6 +363,14 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if (!edev || edev->pe) return NULL; + /* already configured? 
*/ + if (edev->pdev) { + pr_debug("%s: found existing edev for %04x:%02x:%02x.%01x\n", + __func__, hose->global_number, config_addr >> 8, + PCI_SLOT(config_addr), PCI_FUNC(config_addr)); + return edev; + } + /* Skip for PCI-ISA bridge */ if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; @@ -464,7 +462,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) eeh_edev_dbg(edev, "EEH enabled on device\n"); - return NULL; + return edev; } /** diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 1ca7cf0fc0d8..845342814edc 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -77,7 +77,7 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev) eeh_add_to_parent_pe(edev); /* Add as VF PE type */ } #endif - eeh_add_device_late(pdev); + eeh_probe_device(pdev); } /* @@ -335,6 +335,26 @@ void pseries_eeh_init_edev(struct pci_dn *pdn) eeh_save_bars(edev); } +static struct eeh_dev *pseries_eeh_probe(struct pci_dev *pdev) +{ + struct eeh_dev *edev; + struct pci_dn *pdn; + + pdn = pci_get_pdn_by_devfn(pdev->bus, pdev->devfn); + if (!pdn) + return NULL; + + /* + * If the system supports EEH on this device then the eeh_dev was + * configured and inserted into a PE in pseries_eeh_init_edev() + */ + edev = pdn_to_eeh_dev(pdn); + if (!edev || !edev->pe) + return NULL; + + return edev; +} + /** * pseries_eeh_init_edev_recursive - Enable EEH for the indicated device * @pdn: PCI device node @@ -813,6 +833,7 @@ static int pseries_notify_resume(struct pci_dn *pdn) static struct eeh_ops pseries_eeh_ops = { .name = "pseries", .init = pseries_eeh_init, + .probe = pseries_eeh_probe, .set_option = pseries_eeh_set_option, .get_pe_addr = pseries_eeh_get_pe_addr, .get_state = pseries_eeh_get_state, -- cgit v1.2.3 From a7032637b54186e5649917679727d7feaec932b1 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Mon, 12 Aug 2019 14:50:43 -0700 Subject: powerpc: Prefer __section and __printf from compiler_attributes.h Reported-by: Sedat Dilek Suggested-by: Josh Poimboeuf Signed-off-by: Nick Desaulniers [mpe: Drop changes to a/p/boot which doesn't use linux headers] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190812215052.71840-10-ndesaulniers@google.com --- arch/powerpc/include/asm/cache.h | 2 +- arch/powerpc/kernel/btext.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h index 72b81015cebe..609cab1d58f2 100644 --- a/arch/powerpc/include/asm/cache.h +++ b/arch/powerpc/include/asm/cache.h @@ -97,7 +97,7 @@ static inline u32 l1_icache_bytes(void) #endif -#define __read_mostly __attribute__((__section__(".data..read_mostly"))) +#define __read_mostly __section(.data..read_mostly) #ifdef CONFIG_PPC_BOOK3S_32 extern long _get_L2CR(void); diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 6dfceaa820e4..f57712a55815 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -26,7 +26,7 @@ static void scrollscreen(void); #endif -#define __force_data __attribute__((__section__(".data"))) +#define __force_data __section(.data) static int g_loc_X __force_data; static int g_loc_Y __force_data; -- cgit v1.2.3 From efbc4303b255bb80ab1283794b36dd5fe1fb0ec3 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Fri, 27 Mar 2020 00:19:16 +0530 Subject: powerpc/pseries: Handle UE event for memcpy_mcsafe 
memcpy_mcsafe has been implemented for power machines and is used by the pmem infrastructure, so that a UE encountered during a memcpy from pmem devices does not result in a panic; instead, the right error code is returned. The implementation expects the machine check handler to ignore the event and set nip so that execution continues from the fixup code. Appropriate changes are already made to the powernv machine check handler; make similar changes to the pseries machine check handler to ignore the event and set nip to continue execution at the fixup entry if we hit a UE at an instruction with a fixup entry. While we are at it, have a common function which searches the exception table and updates nip with the fixup address, so that any future common changes valid for both platforms can be made in this function. The powernv changes were made by commit 895e3dceeb97 ("powerpc/mce: Handle UE event for memcpy_mcsafe") Reviewed-by: Mahesh Salgaonkar Reviewed-by: Santosh S Signed-off-by: Ganesh Goudar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200326184916.31172-1-ganeshgr@linux.ibm.com --- arch/powerpc/include/asm/mce.h | 2 ++ arch/powerpc/kernel/mce.c | 14 ++++++++++++++ arch/powerpc/kernel/mce_power.c | 8 ++------ arch/powerpc/platforms/pseries/ras.c | 3 +++ 4 files changed, 21 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 6a6ddaabdb34..376a395daf32 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -218,6 +218,8 @@ extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode, bool in_guest); unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr); +extern void mce_common_process_ue(struct pt_regs *regs, + struct mce_error_info *mce_err); #ifdef CONFIG_PPC_BOOK3S_64 void flush_and_reload_slb(void); #endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 34c1001e9e8b..8077b5fb18a7 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -251,6 +252,19 @@ void machine_check_queue_event(void) /* Queue irq work to process this event later. */ irq_work_queue(&mce_event_process_work); } + +void mce_common_process_ue(struct pt_regs *regs, + struct mce_error_info *mce_err) +{ + const struct exception_table_entry *entry; + + entry = search_kernel_exception_table(regs->nip); + if (entry) { + mce_err->ignore_event = true; + regs->nip = extable_fixup(entry); + } +} + /* * process pending MCE event from the mce event queue. This function will be * called during syscall exit.
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 1cbf7f1a4e3d..067b094bfeff 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -579,14 +579,10 @@ static long mce_handle_ue_error(struct pt_regs *regs, struct mce_error_info *mce_err) { long handled = 0; - const struct exception_table_entry *entry; - entry = search_kernel_exception_table(regs->nip); - if (entry) { - mce_err->ignore_event = true; - regs->nip = extable_fixup(entry); + mce_common_process_ue(regs, mce_err); + if (mce_err->ignore_event) return 1; - } /* * On specific SCOM read via MMIO we may get a machine check diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 1d7f973c647b..aa6208c8d4f0 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -558,6 +558,9 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp) switch (mce_log->error_type) { case MC_ERROR_TYPE_UE: mce_err.error_type = MCE_ERROR_TYPE_UE; + mce_common_process_ue(regs, &mce_err); + if (mce_err.ignore_event) + disposition = RTAS_DISP_FULLY_RECOVERED; switch (err_sub_type) { case MC_ERROR_UE_IFETCH: mce_err.u.ue_error_type = MCE_UE_ERROR_IFETCH; -- cgit v1.2.3 From 233ba5461838a56c19600216f0919e7cd3aec40e Mon Sep 17 00:00:00 2001 From: Aneesh Kumar K.V Date: Fri, 20 Mar 2020 16:02:42 +0530 Subject: powerpc/64: Avoid isync in flush_dcache_range() As per the ISA, an isync is only needed after an instruction cache block invalidate. Remove it from the dcache invalidate path. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200320103242.229223-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/cacheflush.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index 4a1c9f0200e1..e92191b390f3 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -65,17 +65,13 @@ static inline void flush_dcache_range(unsigned long start, unsigned long stop) unsigned long size = stop - (unsigned long)addr + (bytes - 1); unsigned long i; - if (IS_ENABLED(CONFIG_PPC64)) { + if (IS_ENABLED(CONFIG_PPC64)) mb(); /* sync */ - isync(); - } for (i = 0; i < size >> shift; i++, addr += bytes) dcbf(addr); mb(); /* sync */ - if (IS_ENABLED(CONFIG_PPC64)) - isync(); } /* -- cgit v1.2.3 From 8729c26e675c356de4179d587af6cd1f16147a39 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 26 Feb 2020 03:35:19 +1000 Subject: powerpc/64s/exception: Move real to virt switch into the common handler The real mode interrupt entry points currently use rfid to branch to the common handler in virtual mode. This is a significant amount of code, and forces other code (notably the KVM test) to live in the real mode handler. In the interest of minimising the amount of code that runs unrelocated, move the switch to virt mode into the common code, and do it with mtmsrd, which avoids clobbering SRRs (although the post-KVMTEST performance of real-mode interrupt handlers is not a big concern these days). This requires CTR to always be saved (real-mode needs to reach 0xc...) but that's not a huge impact these days. It could be optimized away in future.
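A minimal sketch of the idea (condensed from the __GEN_COMMON_ENTRY macro in the diff below): the common handler turns on translation with mtmsrd rather than rfid, leaving SRR0/SRR1 untouched:

	ld	r10,PACAKMSR(r13)	/* MSR value for the kernel (IR/DR on) */
	mtmsrd	r10			/* switch to virtual mode; SRRs preserved */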
mpe: Incorporate fix from Nick: It's possible for interrupts to be replayed when TM is enabled and suspended, for example rt_sigreturn, where the mtmsrd MSR_KERNEL in the real-mode entry point to the common handler causes a TM Bad Thing exception (due to attempting to clear suspended). The fix for this is to have replay interrupts go to the _virt entry point and skip the mtmsrd, which matches what happens before this patch. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200225173541.1549955-11-npiggin@gmail.com --- arch/powerpc/include/asm/exception-64s.h | 4 - arch/powerpc/kernel/exceptions-64s.S | 265 ++++++++++++++----------------- 2 files changed, 116 insertions(+), 153 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 33f4f72eb035..47bd4ea0837d 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -33,11 +33,7 @@ #include /* PACA save area size in u64 units (exgen, exmc, etc) */ -#if defined(CONFIG_RELOCATABLE) #define EX_SIZE 10 -#else -#define EX_SIZE 9 -#endif /* * maximum recursive depth of MCE exceptions diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4eb099046f9d..42fced32c8af 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -32,16 +32,10 @@ #define EX_CCR 52 #define EX_CFAR 56 #define EX_PPR 64 -#if defined(CONFIG_RELOCATABLE) #define EX_CTR 72 .if EX_SIZE != 10 .error "EX_SIZE is wrong" .endif -#else -.if EX_SIZE != 9 - .error "EX_SIZE is wrong" -.endif -#endif /* * Following are fixed section helper macros. @@ -124,22 +118,6 @@ name: #define EXC_HV 1 #define EXC_STD 0 -#if defined(CONFIG_RELOCATABLE) -/* - * If we support interrupts with relocation on AND we're a relocatable kernel, - * we need to use CTR to get to the 2nd level handler. So, save/restore it - * when required. - */ -#define SAVE_CTR(reg, area) mfctr reg ; std reg,area+EX_CTR(r13) -#define GET_CTR(reg, area) ld reg,area+EX_CTR(r13) -#define RESTORE_CTR(reg, area) ld reg,area+EX_CTR(r13) ; mtctr reg -#else -/* ...else CTR is unused and in register. */ -#define SAVE_CTR(reg, area) -#define GET_CTR(reg, area) mfctr reg -#define RESTORE_CTR(reg, area) -#endif - /* * PPR save/restore macros used in exceptions-64s.S * Used for P7 or later processors @@ -199,6 +177,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define IVEC .L_IVEC_\name\() #define IHSRR .L_IHSRR_\name\() #define IAREA .L_IAREA_\name\() +#define IVIRT .L_IVIRT_\name\() #define IISIDE .L_IISIDE_\name\() #define IDAR .L_IDAR_\name\() #define IDSISR .L_IDSISR_\name\() @@ -232,6 +211,9 @@ do_define_int n .ifndef IAREA IAREA=PACA_EXGEN .endif + .ifndef IVIRT + IVIRT=1 + .endif .ifndef IISIDE IISIDE=0 .endif @@ -325,7 +307,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) * outside the head section. CONFIG_RELOCATABLE KVM expects CTR * to be saved in HSTATE_SCRATCH1. */ - mfctr r9 + ld r9,IAREA+EX_CTR(r13) std r9,HSTATE_SCRATCH1(r13) __LOAD_FAR_HANDLER(r9, kvmppc_interrupt) mtctr r9 @@ -362,101 +344,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .endm #endif -.macro INT_SAVE_SRR_AND_JUMP label, hsrr, set_ri - ld r10,PACAKMSR(r13) /* get MSR value for kernel */ - .if ! 
\set_ri - xori r10,r10,MSR_RI /* Clear MSR_RI */ - .endif - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - mtspr SPRN_HSRR1,r10 - FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mtspr SPRN_SRR1,r10 - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - mtspr SPRN_HSRR1,r10 - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - mtspr SPRN_SRR1,r10 - .endif - LOAD_HANDLER(r10, \label\()) - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mtspr SPRN_HSRR0,r10 - HRFI_TO_KERNEL - FTR_SECTION_ELSE - mtspr SPRN_SRR0,r10 - RFI_TO_KERNEL - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mtspr SPRN_HSRR0,r10 - HRFI_TO_KERNEL - .else - mtspr SPRN_SRR0,r10 - RFI_TO_KERNEL - .endif - b . /* prevent speculative execution */ -.endm - -/* INT_SAVE_SRR_AND_JUMP works for real or virt, this is faster but virt only */ -.macro INT_VIRT_SAVE_SRR_AND_JUMP label, hsrr -#ifdef CONFIG_RELOCATABLE - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - .endif - LOAD_HANDLER(r12, \label\()) - mtctr r12 - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - FTR_SECTION_ELSE - mfspr r12,SPRN_SRR1 /* and HSRR1 */ - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - .else - mfspr r12,SPRN_SRR1 /* and HSRR1 */ - .endif - li r10,MSR_RI - mtmsrd r10,1 /* Set RI (EE=0) */ - bctr -#else - .if \hsrr == EXC_HV_OR_STD - BEGIN_FTR_SECTION - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - FTR_SECTION_ELSE - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) - .elseif \hsrr - mfspr r11,SPRN_HSRR0 /* save HSRR0 */ - mfspr r12,SPRN_HSRR1 /* and HSRR1 */ - .else - mfspr r11,SPRN_SRR0 /* save SRR0 */ - mfspr r12,SPRN_SRR1 /* and SRR1 */ - .endif - li r10,MSR_RI - mtmsrd r10,1 /* Set RI (EE=0) */ - b \label -#endif -.endm - /* * This is the BOOK3S interrupt entry code macro. * @@ -477,6 +364,23 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) * - Fall through and continue executing in real, unrelocated mode. * This is done if early=2. 
*/ + +.macro GEN_BRANCH_TO_COMMON name, virt + .if \virt +#ifndef CONFIG_RELOCATABLE + b \name\()_common_virt +#else + LOAD_HANDLER(r10, \name\()_common_virt) + mtctr r10 + bctr +#endif + .else + LOAD_HANDLER(r10, \name\()_common_real) + mtctr r10 + bctr + .endif +.endm + .macro GEN_INT_ENTRY name, virt, ool=0 SET_SCRATCH0(r13) /* save r13 */ GET_PACA(r13) @@ -500,8 +404,10 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) OPT_SAVE_REG_TO_PACA(IAREA+EX_PPR, r9, CPU_FTR_HAS_PPR) OPT_SAVE_REG_TO_PACA(IAREA+EX_CFAR, r10, CPU_FTR_CFAR) INTERRUPT_TO_KERNEL - SAVE_CTR(r10, IAREA) + mfctr r10 + std r10,IAREA+EX_CTR(r13) mfcr r9 + .if (!\virt && IKVM_REAL) || (\virt && IKVM_VIRT) KVMTEST \name IHSRR IVEC .endif @@ -566,27 +472,58 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) .if IEARLY == 2 /* nothing more */ .elseif IEARLY - mfctr r10 /* save ctr, even for !RELOCATABLE */ BRANCH_TO_C000(r11, \name\()_common) - .elseif !\virt - INT_SAVE_SRR_AND_JUMP \name\()_common, IHSRR, ISET_RI .else - INT_VIRT_SAVE_SRR_AND_JUMP \name\()_common, IHSRR + .if IHSRR == EXC_HV_OR_STD + BEGIN_FTR_SECTION + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + FTR_SECTION_ELSE + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) + .elseif IHSRR + mfspr r11,SPRN_HSRR0 /* save HSRR0 */ + mfspr r12,SPRN_HSRR1 /* and HSRR1 */ + .else + mfspr r11,SPRN_SRR0 /* save SRR0 */ + mfspr r12,SPRN_SRR1 /* and SRR1 */ .endif + GEN_BRANCH_TO_COMMON \name \virt + .endif + .if \ool .popsection .endif .endm /* - * On entry r13 points to the paca, r9-r13 are saved in the paca, - * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and - * SRR1, and relocation is on. - * - * If stack=0, then the stack is already set in r1, and r1 is saved in r10. - * PPR save and CPU accounting is not done for the !stack case (XXX why not?) + * __GEN_COMMON_ENTRY is required to receive the branch from interrupt + * entry, except in the case of the IEARLY handlers. + * This switches to virtual mode and sets MSR[RI]. */ -.macro GEN_COMMON name +.macro __GEN_COMMON_ENTRY name +DEFINE_FIXED_SYMBOL(\name\()_common_real) +\name\()_common_real: + ld r10,PACAKMSR(r13) /* get MSR value for kernel */ + /* MSR[RI] is clear iff using SRR regs */ + .if IHSRR == EXC_HV_OR_STD + BEGIN_FTR_SECTION + xori r10,r10,MSR_RI + END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) + .elseif ! IHSRR + xori r10,r10,MSR_RI + .endif + mtmsrd r10 + + .if IVIRT + .balign IFETCH_ALIGN_BYTES +DEFINE_FIXED_SYMBOL(\name\()_common_virt) +\name\()_common_virt: + .endif /* IVIRT */ +.endm + +.macro __GEN_COMMON_BODY name .if ISTACK andi. 
r10,r12,MSR_PR /* See if coming from user */ mr r10,r1 /* Save r1 */ @@ -604,6 +541,11 @@ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948) std r0,GPR0(r1) /* save r0 in stackframe */ std r10,GPR1(r1) /* save r1 in stackframe */ + .if ISET_RI + li r10,MSR_RI + mtmsrd r10,1 /* Set MSR_RI */ + .endif + .if ISTACK .if IKUAP kuap_save_amr_and_lock r9, r10, cr1, cr0 @@ -654,7 +596,7 @@ BEGIN_FTR_SECTION_NESTED(66) ld r10,IAREA+EX_CFAR(r13) std r10,ORIG_GPR3(r1) END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) - GET_CTR(r10, IAREA) + ld r10,IAREA+EX_CTR(r13) std r10,_CTR(r1) std r2,GPR2(r1) /* save r2 in stackframe */ SAVE_4GPRS(3, r1) /* save r3 - r6 in stackframe */ @@ -682,6 +624,19 @@ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66) .endif .endm +/* + * On entry r13 points to the paca, r9-r13 are saved in the paca, + * r9 contains the saved CR, r11 and r12 contain the saved SRR0 and + * SRR1, and relocation is on. + * + * If stack=0, then the stack is already set in r1, and r1 is saved in r10. + * PPR save and CPU accounting is not done for the !stack case (XXX why not?) + */ +.macro GEN_COMMON name + __GEN_COMMON_ENTRY \name + __GEN_COMMON_BODY \name +.endm + /* * Restore all registers including H/SRR0/1 saved in a stack frame of a * standard exception. @@ -834,6 +789,7 @@ EXC_VIRT_NONE(0x4000, 0x100) INT_DEFINE_BEGIN(system_reset) IVEC=0x100 IAREA=PACA_EXNMI + IVIRT=0 /* no virt entry point */ /* * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is * being used, so a nested NMI exception would corrupt it. @@ -913,6 +869,7 @@ TRAMP_REAL_BEGIN(system_reset_fwnmi) #endif /* CONFIG_PPC_PSERIES */ EXC_COMMON_BEGIN(system_reset_common) + __GEN_COMMON_ENTRY system_reset /* * Increment paca->in_nmi then enable MSR_RI. SLB or MCE will be able * to recover, but nested NMI will notice in_nmi and not recover @@ -928,7 +885,7 @@ EXC_COMMON_BEGIN(system_reset_common) mr r10,r1 ld r1,PACA_NMI_EMERG_SP(r13) subi r1,r1,INT_FRAME_SIZE - GEN_COMMON system_reset + __GEN_COMMON_BODY system_reset bl save_nvgprs /* * Set IRQS_ALL_DISABLED unconditionally so arch_irqs_disabled does @@ -973,6 +930,7 @@ EXC_COMMON_BEGIN(system_reset_common) INT_DEFINE_BEGIN(machine_check_early) IVEC=0x200 IAREA=PACA_EXMC + IVIRT=0 /* no virt entry point */ /* * MSR_RI is not enabled, because PACA_EXMC is being used, so a * nested machine check corrupts it. 
machine_check_common enables @@ -990,6 +948,7 @@ INT_DEFINE_END(machine_check_early) INT_DEFINE_BEGIN(machine_check) IVEC=0x200 IAREA=PACA_EXMC + IVIRT=0 /* no virt entry point */ ISET_RI=0 IDAR=1 IDSISR=1 @@ -1022,7 +981,6 @@ TRAMP_KVM_BEGIN(machine_check_kvm) EXCEPTION_RESTORE_REGS EXC_STD EXC_COMMON_BEGIN(machine_check_early_common) - mtctr r10 /* Restore ctr */ mfspr r11,SPRN_SRR0 mfspr r12,SPRN_SRR1 @@ -1061,7 +1019,7 @@ EXC_COMMON_BEGIN(machine_check_early_common) bgt cr1,unrecoverable_mce /* Check if we hit limit of 4 */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - GEN_COMMON machine_check_early + __GEN_COMMON_BODY machine_check_early BEGIN_FTR_SECTION bl enable_machine_check @@ -1448,6 +1406,8 @@ EXC_VIRT_END(program_check, 0x4700, 0x100) TRAMP_KVM_BEGIN(program_check_kvm) GEN_KVM program_check EXC_COMMON_BEGIN(program_check_common) + __GEN_COMMON_ENTRY program_check + /* * It's possible to receive a TM Bad Thing type program check with * userspace register values (in particular r1), but with SRR1 reporting @@ -1473,11 +1433,11 @@ EXC_COMMON_BEGIN(program_check_common) ld r1,PACAEMERGSP(r13) /* Use emergency stack */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ __ISTACK(program_check)=0 - GEN_COMMON program_check + __GEN_COMMON_BODY program_check b 3f 2: __ISTACK(program_check)=1 - GEN_COMMON program_check + __GEN_COMMON_BODY program_check 3: bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD @@ -1861,14 +1821,13 @@ TRAMP_KVM_BEGIN(hmi_exception_kvm) GEN_KVM hmi_exception EXC_COMMON_BEGIN(hmi_exception_early_common) - mtctr r10 /* Restore ctr */ mfspr r11,SPRN_HSRR0 /* Save HSRR0 */ mfspr r12,SPRN_HSRR1 /* Save HSRR1 */ mr r10,r1 /* Save r1 */ ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */ subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ - GEN_COMMON hmi_exception_early + __GEN_COMMON_BODY hmi_exception_early addi r3,r1,STACK_FRAME_OVERHEAD bl hmi_exception_realmode @@ -2195,7 +2154,9 @@ EXC_REAL_BEGIN(denorm_exception, 0x1500, 0x100) bne+ denorm_assist #endif KVMTEST denorm_exception, EXC_HV, 0x1500 - INT_SAVE_SRR_AND_JUMP denorm_exception_common, EXC_HV, 1 + mfspr r11,SPRN_HSRR0 + mfspr r12,SPRN_HSRR1 + GEN_BRANCH_TO_COMMON denorm_exception, virt=0 EXC_REAL_END(denorm_exception, 0x1500, 0x100) #ifdef CONFIG_PPC_DENORMALISATION EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100) @@ -2203,7 +2164,9 @@ EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100) mfspr r10,SPRN_HSRR1 andis. r10,r10,(HSRR1_DENORM)@h /* denorm? */ bne+ denorm_assist - INT_VIRT_SAVE_SRR_AND_JUMP denorm_exception_common, EXC_HV + mfspr r11,SPRN_HSRR0 + mfspr r12,SPRN_HSRR1 + GEN_BRANCH_TO_COMMON denorm_exception, virt=1 EXC_VIRT_END(denorm_exception, 0x5500, 0x100) #else EXC_VIRT_NONE(0x5500, 0x100) @@ -2374,7 +2337,11 @@ EXC_VIRT_NONE(0x5800, 0x100) std r12,PACA_EXGEN+EX_R12(r13); \ GET_SCRATCH0(r10); \ std r10,PACA_EXGEN+EX_R13(r13); \ - INT_SAVE_SRR_AND_JUMP soft_nmi_common, _H, 1 + mfspr r11,SPRN_SRR0; /* save SRR0 */ \ + mfspr r12,SPRN_SRR1; /* and SRR1 */ \ + LOAD_HANDLER(r10, soft_nmi_common); \ + mtctr r10; \ + bctr /* * Branch to soft_nmi_interrupt using the emergency stack. 
The emergency @@ -2390,7 +2357,7 @@ EXC_COMMON_BEGIN(soft_nmi_common) ld r1,PACAEMERGSP(r13) subi r1,r1,INT_FRAME_SIZE __ISTACK(decrementer)=0 - GEN_COMMON decrementer + __GEN_COMMON_BODY decrementer bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl soft_nmi_interrupt @@ -2790,12 +2757,12 @@ handle_dabr_fault: h_doorbell_common_msgclr: LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) PPC_MSGCLR(3) - b h_doorbell_common + b h_doorbell_common_virt doorbell_super_common_msgclr: LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) PPC_MSGCLRP(3) - b doorbell_super_common + b doorbell_super_common_virt /* * Called from arch_local_irq_enable when an interrupt needs @@ -2821,20 +2788,20 @@ _GLOBAL(__replay_interrupt) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 - beq decrementer_common + beq decrementer_common_virt cmpwi r3,0x500 BEGIN_FTR_SECTION - beq h_virt_irq_common + beq h_virt_irq_common_virt FTR_SECTION_ELSE - beq hardware_interrupt_common + beq hardware_interrupt_common_virt ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_300) cmpwi r3,0xf00 - beq performance_monitor_common + beq performance_monitor_common_virt BEGIN_FTR_SECTION cmpwi r3,0xa00 beq h_doorbell_common_msgclr cmpwi r3,0xe60 - beq hmi_exception_common + beq hmi_exception_common_virt FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common_msgclr -- cgit v1.2.3 From 2babd6ea43edacfc1577432baa187a7d212f3f4f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 26 Feb 2020 03:35:25 +1000 Subject: powerpc/64s/exception: Avoid touching the stack in hdecrementer The hdec interrupt handler is reported to sometimes fire in Linux if KVM leaves it pending after a guest exists. This is harmless, so there is a no-op handler for it. The interrupt handler currently uses the regular kernel stack. Change this to avoid touching the stack entirely. This should be the last place where the regular Linux stack can be accessed with asynchronous interrupts (including PMI) soft-masked. It might be possible to take advantage of this invariant, e.g., to context switch the kernel stack SLB entry without clearing MSR[EE]. 
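Illustrative only (condensed from the hdecrementer_common hunk below): the handler restores everything from the PACA save area and returns, so the kernel stack pointer is never loaded:

	ld	r10,PACA_EXGEN+EX_CTR(r13)
	mtctr	r10
	mtcrf	0x80,r9
	ld	r9,PACA_EXGEN+EX_R9(r13)
	ld	r13,PACA_EXGEN+EX_R13(r13)	/* restore r13 last */
	HRFI_TO_KERNEL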
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200225173541.1549955-17-npiggin@gmail.com --- arch/powerpc/include/asm/time.h | 1 - arch/powerpc/kernel/exceptions-64s.S | 25 ++++++++++++++++++++----- arch/powerpc/kernel/time.c | 9 --------- 3 files changed, 20 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 08dbe3e6831c..e0107495c4de 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -24,7 +24,6 @@ extern struct clock_event_device decrementer_clockevent; extern void generic_calibrate_decr(void); -extern void hdec_interrupt(struct pt_regs *regs); /* Some sane defaults: 125 MHz timebase, 1GHz processor */ extern unsigned long ppc_proc_freq; diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index da15ec4fc8cb..146afa0922fb 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1507,6 +1507,8 @@ EXC_COMMON_BEGIN(decrementer_common) INT_DEFINE_BEGIN(hdecrementer) IVEC=0x980 IHSRR=EXC_HV + ISTACK=0 + IRECONCILE=0 IKVM_REAL=1 IKVM_VIRT=1 INT_DEFINE_END(hdecrementer) @@ -1518,11 +1520,24 @@ EXC_VIRT_BEGIN(hdecrementer, 0x4980, 0x80) GEN_INT_ENTRY hdecrementer, virt=1 EXC_VIRT_END(hdecrementer, 0x4980, 0x80) EXC_COMMON_BEGIN(hdecrementer_common) - GEN_COMMON hdecrementer - bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD - bl hdec_interrupt - b ret_from_except + __GEN_COMMON_ENTRY hdecrementer + /* + * Hypervisor decrementer interrupts not caught by the KVM test + * shouldn't occur but are sometimes left pending on exit from a KVM + * guest. We don't need to do anything to clear them, as they are + * edge-triggered. + * + * Be careful to avoid touching the kernel stack. + */ + ld r10,PACA_EXGEN+EX_CTR(r13) + mtctr r10 + mtcrf 0x80,r9 + ld r9,PACA_EXGEN+EX_R9(r13) + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + ld r12,PACA_EXGEN+EX_R12(r13) + ld r13,PACA_EXGEN+EX_R13(r13) + HRFI_TO_KERNEL GEN_KVM hdecrementer diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 1168e8b37e30..bda9cb4a0a5f 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -663,15 +663,6 @@ void timer_broadcast_interrupt(void) } #endif -/* - * Hypervisor decrementer interrupts shouldn't occur but are sometimes - * left pending on exit from a KVM guest. We don't need to do anything - * to clear them, as they are edge-triggered. - */ -void hdec_interrupt(struct pt_regs *regs) -{ -} - #ifdef CONFIG_SUSPEND static void generic_suspend_disable_irqs(void) { -- cgit v1.2.3 From 68b34588e2027f699a3c034235f21cd19356b2e6 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 26 Feb 2020 03:35:34 +1000 Subject: powerpc/64/sycall: Implement syscall entry/exit logic in C System call entry and particularly exit code is beyond the limit of what is reasonable to implement in asm. This conversion moves all conditional branches out of the asm code, except for the case that all GPRs should be restored at exit. Null syscall test is about 5% faster after this patch, because the exit work is handled under local_irq_disable, and the hard mask and pending interrupt replay is handled after that, which avoids games with MSR. mpe: Includes subsequent fixes from Nick: This fixes 4 issues caught by TM selftests. 
First was a tm-syscall bug that hit due to tabort_syscall being called after interrupts were reconciled (in a subsequent patch), which led to interrupts being enabled before tabort_syscall was called. Rather than going through un-reconciling interrupts for the return, I just go back to putting the test early in asm; the C-ification of that wasn't a big win anyway. Second, the syscall return _TIF_USER_WORK_MASK check would go into an infinite loop if _TIF_RESTORE_TM became set. The asm code uses _TIF_USER_WORK_MASK to branch to the slowpath, which includes restore_tm_state. Third, the system call return was not calling restore_tm_state; I missed this completely (although it's in the return-from-interrupt C conversion), because when the asm syscall code encountered problems it would branch to the interrupt return code. Fourth is MSR_VEC missing from restore_math, which was caught by the tm-unavailable selftest taking an unexpected facility unavailable interrupt when testing the VSX unavailable exception with MSR.FP=1 MSR.VEC=1. The fourth case also has a fixup in a subsequent patch. Signed-off-by: Nicholas Piggin Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200225173541.1549955-26-npiggin@gmail.com --- arch/powerpc/include/asm/asm-prototypes.h | 13 +- arch/powerpc/include/asm/book3s/64/kup-radix.h | 14 +- arch/powerpc/include/asm/cputime.h | 33 +++ arch/powerpc/include/asm/hw_irq.h | 4 + arch/powerpc/include/asm/ptrace.h | 3 + arch/powerpc/include/asm/signal.h | 3 + arch/powerpc/include/asm/switch_to.h | 5 + arch/powerpc/include/asm/time.h | 3 + arch/powerpc/kernel/Makefile | 3 +- arch/powerpc/kernel/entry_64.S | 326 ++++--------------------- arch/powerpc/kernel/signal.h | 2 - arch/powerpc/kernel/syscall_64.c | 214 ++++++++++++++++ arch/powerpc/kernel/systbl.S | 9 +- 13 files changed, 326 insertions(+), 306 deletions(-) create mode 100644 arch/powerpc/kernel/syscall_64.c (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 983c0084fb3f..ab59a4904254 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -97,6 +97,8 @@ ppc_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, unsigned long __init early_init(unsigned long dt_ptr); void __init machine_init(u64 dt_ptr); #endif +long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs); +notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs); long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low); @@ -104,14 +106,6 @@ long sys_switch_endian(void); notrace unsigned int __check_irq_replay(void); void notrace restore_interrupts(void); -/* ptrace */ -long do_syscall_trace_enter(struct pt_regs *regs); -void do_syscall_trace_leave(struct pt_regs *regs); - -/* process */ -void restore_math(struct pt_regs *regs); -void restore_tm_state(struct pt_regs *regs); - /* prom_init (OpenFirmware) */ unsigned long __init prom_init(unsigned long r3, unsigned long r4, unsigned long pp, @@ -122,9 +116,6 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, void __init early_setup(unsigned long dt_ptr); void early_setup_secondary(void); -/* time */ -void accumulate_stolen_time(void); - /* misc runtime */ extern u64 __bswapdi2(u64); extern s64 __lshrdi3(s64, int); diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h
b/arch/powerpc/include/asm/book3s/64/kup-radix.h index 90dd3a3fc8c7..71081d90f999 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -3,6 +3,7 @@ #define _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H #include +#include #define AMR_KUAP_BLOCK_READ UL(0x4000000000000000) #define AMR_KUAP_BLOCK_WRITE UL(0x8000000000000000) @@ -56,7 +57,14 @@ #ifdef CONFIG_PPC_KUAP -#include +#include +#include + +static inline void kuap_check_amr(void) +{ + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_RADIX_KUAP)) + WARN_ON_ONCE(mfspr(SPRN_AMR) != AMR_KUAP_BLOCKED); +} /* * We support individually allowing read or write, but we don't support nesting @@ -127,6 +135,10 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) (regs->kuap & (is_write ? AMR_KUAP_BLOCK_WRITE : AMR_KUAP_BLOCK_READ)), "Bug: %s fault blocked by AMR!", is_write ? "Write" : "Read"); } +#else /* CONFIG_PPC_KUAP */ +static inline void kuap_check_amr(void) +{ +} #endif /* CONFIG_PPC_KUAP */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 2431b4ada2fa..0fccd5ea1e9a 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -43,9 +43,12 @@ static inline unsigned long cputime_to_usecs(const cputime_t ct) */ #ifdef CONFIG_PPC64 #define get_accounting(tsk) (&get_paca()->accounting) +#define raw_get_accounting(tsk) (&local_paca->accounting) static inline void arch_vtime_task_switch(struct task_struct *tsk) { } + #else #define get_accounting(tsk) (&task_thread_info(tsk)->accounting) +#define raw_get_accounting(tsk) get_accounting(tsk) /* * Called from the context switch with interrupts disabled, to charge all * accumulated times to the current process, and to prepare accounting on @@ -60,6 +63,36 @@ static inline void arch_vtime_task_switch(struct task_struct *prev) } #endif +/* + * account_cpu_user_entry/exit runs "unreconciled", so can't trace, + * can't use get_paca() + */ +static notrace inline void account_cpu_user_entry(void) +{ + unsigned long tb = mftb(); + struct cpu_accounting_data *acct = raw_get_accounting(current); + + acct->utime += (tb - acct->starttime_user); + acct->starttime = tb; +} + +static notrace inline void account_cpu_user_exit(void) +{ + unsigned long tb = mftb(); + struct cpu_accounting_data *acct = raw_get_accounting(current); + + acct->stime += (tb - acct->starttime); + acct->starttime_user = tb; +} + + #endif /* __KERNEL__ */ +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +static inline void account_cpu_user_entry(void) +{ +} +static inline void account_cpu_user_exit(void) +{ +} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #endif /* __POWERPC_CPUTIME_H */ diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index e3a905e3d573..310583e62bd9 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -228,9 +228,13 @@ static inline bool arch_irqs_disabled(void) #ifdef CONFIG_PPC_BOOK3E #define __hard_irq_enable() wrtee(MSR_EE) #define __hard_irq_disable() wrtee(0) +#define __hard_EE_RI_disable() wrtee(0) +#define __hard_RI_enable() do { } while (0) #else #define __hard_irq_enable() __mtmsrd(MSR_EE|MSR_RI, 1) #define __hard_irq_disable() __mtmsrd(MSR_RI, 1) +#define __hard_EE_RI_disable() __mtmsrd(0, 1) +#define __hard_RI_enable() __mtmsrd(MSR_RI, 1) #endif #define hard_irq_disable() do { \ diff --git a/arch/powerpc/include/asm/ptrace.h
b/arch/powerpc/include/asm/ptrace.h index ee3ada66deb5..082a40153b94 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -138,6 +138,9 @@ extern unsigned long profile_pc(struct pt_regs *regs); #define profile_pc(regs) instruction_pointer(regs) #endif +long do_syscall_trace_enter(struct pt_regs *regs); +void do_syscall_trace_leave(struct pt_regs *regs); + #define kernel_stack_pointer(regs) ((regs)->gpr[1]) static inline int is_syscall_success(struct pt_regs *regs) { diff --git a/arch/powerpc/include/asm/signal.h b/arch/powerpc/include/asm/signal.h index 0803ca8b9149..99e1c6de27bc 100644 --- a/arch/powerpc/include/asm/signal.h +++ b/arch/powerpc/include/asm/signal.h @@ -6,4 +6,7 @@ #include #include +struct pt_regs; +void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags); + #endif /* _ASM_POWERPC_SIGNAL_H */ diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 5b03d8a82409..476008bc3d08 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -5,6 +5,7 @@ #ifndef _ASM_POWERPC_SWITCH_TO_H #define _ASM_POWERPC_SWITCH_TO_H +#include #include struct thread_struct; @@ -22,6 +23,10 @@ extern void switch_booke_debug_regs(struct debug_reg *new_debug); extern int emulate_altivec(struct pt_regs *); +void restore_math(struct pt_regs *regs); + +void restore_tm_state(struct pt_regs *regs); + extern void flush_all_to_thread(struct task_struct *); extern void giveup_all(struct task_struct *); diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index e0107495c4de..39ce95016a3a 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -194,5 +194,8 @@ DECLARE_PER_CPU(u64, decrementers_next_tb); /* Convert timebase ticks to nanoseconds */ unsigned long long tb_to_ns(unsigned long long tb_ticks); +/* SPLPAR */ +void accumulate_stolen_time(void); + #endif /* __KERNEL__ */ #endif /* __POWERPC_TIME_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 78a1b22d4fd8..5700231a8988 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -50,7 +50,8 @@ obj-y := cputable.o ptrace.o syscalls.o \ of_platform.o prom_parse.o obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ - paca.o nvram_64.o firmware.o note.o + paca.o nvram_64.o firmware.o note.o \ + syscall_64.o obj-$(CONFIG_VDSO32) += vdso32/ obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 14afe12eae8c..5f70830b5ae4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -69,6 +69,7 @@ BEGIN_FTR_SECTION bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif +_ASM_NOKPROBE_SYMBOL(system_call_common) mr r10,r1 ld r1,PACAKSAVE(r13) std r10,0(r1) @@ -76,341 +77,98 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) std r12,_MSR(r1) std r0,GPR0(r1) std r10,GPR1(r1) + std r2,GPR2(r1) #ifdef CONFIG_PPC_FSL_BOOK3E START_BTB_FLUSH_SECTION BTB_FLUSH(r10) END_BTB_FLUSH_SECTION #endif - ACCOUNT_CPU_USER_ENTRY(r13, r10, r11) - std r2,GPR2(r1) + ld r2,PACATOC(r13) + mfcr r12 + li r11,0 + /* Can we avoid saving r3-r8 in common case? 
*/ std r3,GPR3(r1) - mfcr r2 std r4,GPR4(r1) std r5,GPR5(r1) std r6,GPR6(r1) std r7,GPR7(r1) std r8,GPR8(r1) - li r11,0 + /* Zero r9-r12, this should only be required when restoring all GPRs */ std r11,GPR9(r1) std r11,GPR10(r1) std r11,GPR11(r1) std r11,GPR12(r1) - std r11,_XER(r1) - std r11,_CTR(r1) std r9,GPR13(r1) SAVE_NVGPRS(r1) + std r11,_XER(r1) + std r11,_CTR(r1) mflr r10 + /* * This clears CR0.SO (bit 28), which is the error indication on * return from this system call. */ - rldimi r2,r11,28,(63-28) + rldimi r12,r11,28,(63-28) li r11,0xc00 std r10,_LINK(r1) std r11,_TRAP(r1) + std r12,_CCR(r1) std r3,ORIG_GPR3(r1) - std r2,_CCR(r1) - ld r2,PACATOC(r13) - addi r9,r1,STACK_FRAME_OVERHEAD + addi r10,r1,STACK_FRAME_OVERHEAD ld r11,exception_marker@toc(r2) - std r11,-16(r9) /* "regshere" marker */ - - kuap_check_amr r10, r11 - -#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) -BEGIN_FW_FTR_SECTION - /* see if there are any DTL entries to process */ - ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ - ld r11,PACA_DTL_RIDX(r13) /* get log read index */ - addi r10,r10,LPPACA_DTLIDX - LDX_BE r10,0,r10 /* get log write index */ - cmpd r11,r10 - beq+ 33f - bl accumulate_stolen_time - REST_GPR(0,r1) - REST_4GPRS(3,r1) - REST_2GPRS(7,r1) - addi r9,r1,STACK_FRAME_OVERHEAD -33: -END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */ - - /* - * A syscall should always be called with interrupts enabled - * so we just unconditionally hard-enable here. When some kind - * of irq tracing is used, we additionally check that condition - * is correct - */ -#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG) - lbz r10,PACAIRQSOFTMASK(r13) -1: tdnei r10,IRQS_ENABLED - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING -#endif - -#ifdef CONFIG_PPC_BOOK3E - wrteei 1 -#else - li r11,MSR_RI - ori r11,r11,MSR_EE - mtmsrd r11,1 -#endif /* CONFIG_PPC_BOOK3E */ - -system_call: /* label this so stack traces look sane */ - /* We do need to set SOFTE in the stack frame or the return - * from interrupt will be painful - */ - li r10,IRQS_ENABLED - std r10,SOFTE(r1) - - ld r11, PACA_THREAD_INFO(r13) - ld r10,TI_FLAGS(r11) - andi. r11,r10,_TIF_SYSCALL_DOTRACE - bne .Lsyscall_dotrace /* does not return */ - cmpldi 0,r0,NR_syscalls - bge- .Lsyscall_enosys + std r11,-16(r10) /* "regshere" marker */ -.Lsyscall: -/* - * Need to vector to 32 Bit or default sys_call_table here, - * based on caller's run-mode / personality. - */ - ld r11,SYS_CALL_TABLE@toc(2) - andis. r10,r10,_TIF_32BIT@h - beq 15f - ld r11,COMPAT_SYS_CALL_TABLE@toc(2) - clrldi r3,r3,32 - clrldi r4,r4,32 - clrldi r5,r5,32 - clrldi r6,r6,32 - clrldi r7,r7,32 - clrldi r8,r8,32 -15: - slwi r0,r0,3 - - barrier_nospec_asm - /* - * Prevent the load of the handler below (based on the user-passed - * system call number) being speculatively executed until the test - * against NR_syscalls and branch to .Lsyscall_enosys above has - * committed. 
- */ - - ldx r12,r11,r0 /* Fetch system call handler [ptr] */ - mtctr r12 - bctrl /* Call handler */ + /* Calling convention has r9 = orig r0, r10 = regs */ + mr r9,r0 + bl system_call_exception - /* syscall_exit can exit to kernel mode, via ret_from_kernel_thread */ .Lsyscall_exit: - std r3,RESULT(r1) - -#ifdef CONFIG_DEBUG_RSEQ - /* Check whether the syscall is issued inside a restartable sequence */ - addi r3,r1,STACK_FRAME_OVERHEAD - bl rseq_syscall - ld r3,RESULT(r1) -#endif - - ld r12, PACA_THREAD_INFO(r13) - - ld r8,_MSR(r1) - -/* - * This is a few instructions into the actual syscall exit path (which actually - * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the - * number of visible symbols for profiling purposes. - * - * We can probe from system_call until this point as MSR_RI is set. But once it - * is cleared below, we won't be able to take a trap. - * - * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL(). - */ -system_call_exit: - /* - * Disable interrupts so current_thread_info()->flags can't change, - * and so that we don't get interrupted after loading SRR0/1. - * - * Leave MSR_RI enabled for now, because with THREAD_INFO_IN_TASK we - * could fault on the load of the TI_FLAGS below. - */ -#ifdef CONFIG_PPC_BOOK3E - wrteei 0 -#else - li r11,MSR_RI - mtmsrd r11,1 -#endif /* CONFIG_PPC_BOOK3E */ + addi r4,r1,STACK_FRAME_OVERHEAD + bl syscall_exit_prepare - ld r9,TI_FLAGS(r12) - li r11,-MAX_ERRNO - andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) - bne- .Lsyscall_exit_work + ld r2,_CCR(r1) + ld r4,_NIP(r1) + ld r5,_MSR(r1) + ld r6,_LINK(r1) - andi. r0,r8,MSR_FP - beq 2f -#ifdef CONFIG_ALTIVEC - andis. r0,r8,MSR_VEC@h - bne 3f -#endif -2: addi r3,r1,STACK_FRAME_OVERHEAD - bl restore_math - ld r8,_MSR(r1) - ld r3,RESULT(r1) - li r11,-MAX_ERRNO - -3: cmpld r3,r11 - ld r5,_CCR(r1) - bge- .Lsyscall_error -.Lsyscall_error_cont: - ld r7,_NIP(r1) BEGIN_FTR_SECTION stdcx. r0,0,r1 /* to clear the reservation */ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) - andi. r6,r8,MSR_PR - ld r4,_LINK(r1) - kuap_check_amr r10, r11 + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r5 + mtlr r6 -#ifdef CONFIG_PPC_BOOK3S - /* - * Clear MSR_RI, MSR_EE is already and remains disabled. We could do - * this later, but testing shows that doing it here causes less slow - * down than doing it closer to the rfid. - */ - li r11,0 - mtmsrd r11,1 -#endif - - beq- 1f - ACCOUNT_CPU_USER_EXIT(r13, r11, r12) + cmpdi r3,0 + bne .Lsyscall_restore_regs +.Lsyscall_restore_regs_cont: BEGIN_FTR_SECTION HMT_MEDIUM_LOW END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - std r8, PACATMSCRATCH(r13) -#endif - /* * We don't need to restore AMR on the way back to userspace for KUAP. * The value of AMR only matters while we're in the kernel. */ - ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ + mtcr r2 ld r2,GPR2(r1) + ld r3,GPR3(r1) + ld r13,GPR13(r1) ld r1,GPR1(r1) - mtlr r4 - mtcr r5 - mtspr SPRN_SRR0,r7 - mtspr SPRN_SRR1,r8 RFI_TO_USER b . /* prevent speculative execution */ -1: /* exit to kernel */ - kuap_restore_amr r2 - - ld r2,GPR2(r1) - ld r1,GPR1(r1) - mtlr r4 - mtcr r5 - mtspr SPRN_SRR0,r7 - mtspr SPRN_SRR1,r8 - RFI_TO_KERNEL - b . 
/* prevent speculative execution */ - -.Lsyscall_error: - oris r5,r5,0x1000 /* Set SO bit in CR */ - neg r3,r3 - std r5,_CCR(r1) - b .Lsyscall_error_cont - -/* Traced system call support */ -.Lsyscall_dotrace: - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_syscall_trace_enter - - /* - * We use the return value of do_syscall_trace_enter() as the syscall - * number. If the syscall was rejected for any reason do_syscall_trace_enter() - * returns an invalid syscall number and the test below against - * NR_syscalls will fail. - */ - mr r0,r3 - - /* Restore argument registers just clobbered and/or possibly changed. */ - ld r3,GPR3(r1) - ld r4,GPR4(r1) - ld r5,GPR5(r1) - ld r6,GPR6(r1) - ld r7,GPR7(r1) - ld r8,GPR8(r1) - - /* Repopulate r9 and r10 for the syscall path */ - addi r9,r1,STACK_FRAME_OVERHEAD - ld r10, PACA_THREAD_INFO(r13) - ld r10,TI_FLAGS(r10) - - cmpldi r0,NR_syscalls - blt+ .Lsyscall - - /* Return code is already in r3 thanks to do_syscall_trace_enter() */ - b .Lsyscall_exit - - -.Lsyscall_enosys: - li r3,-ENOSYS - b .Lsyscall_exit - -.Lsyscall_exit_work: - /* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr. - If TIF_NOERROR is set, just save r3 as it is. */ - - andi. r0,r9,_TIF_RESTOREALL - beq+ 0f +.Lsyscall_restore_regs: + ld r3,_CTR(r1) + ld r4,_XER(r1) REST_NVGPRS(r1) - b 2f -0: cmpld r3,r11 /* r11 is -MAX_ERRNO */ - blt+ 1f - andi. r0,r9,_TIF_NOERROR - bne- 1f - ld r5,_CCR(r1) - neg r3,r3 - oris r5,r5,0x1000 /* Set SO bit in CR */ - std r5,_CCR(r1) -1: std r3,GPR3(r1) -2: andi. r0,r9,(_TIF_PERSYSCALL_MASK) - beq 4f - - /* Clear per-syscall TIF flags if any are set. */ - - li r11,_TIF_PERSYSCALL_MASK - addi r12,r12,TI_FLAGS -3: ldarx r10,0,r12 - andc r10,r10,r11 - stdcx. r10,0,r12 - bne- 3b - subi r12,r12,TI_FLAGS - -4: /* Anything else left to do? */ -BEGIN_FTR_SECTION - lis r3,DEFAULT_PPR@highest /* Set default PPR */ - sldi r3,r3,32 /* bits 11-13 are used for ppr */ - std r3,_PPR(r1) -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - - andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP) - beq ret_from_except_lite - - /* Re-enable interrupts */ -#ifdef CONFIG_PPC_BOOK3E - wrteei 1 -#else - li r10,MSR_RI - ori r10,r10,MSR_EE - mtmsrd r10,1 -#endif /* CONFIG_PPC_BOOK3E */ - - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_syscall_trace_leave - b ret_from_except + mtctr r3 + mtspr SPRN_XER,r4 + ld r0,GPR0(r1) + REST_8GPRS(4, r1) + ld r12,GPR12(r1) + b .Lsyscall_restore_regs_cont #ifdef CONFIG_PPC_TRANSACTIONAL_MEM .Ltabort_syscall: @@ -438,8 +196,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) RFI_TO_USER b . 
/* prevent speculative execution */ #endif -_ASM_NOKPROBE_SYMBOL(system_call_common); -_ASM_NOKPROBE_SYMBOL(system_call_exit); _GLOBAL(ret_from_fork) bl schedule_tail diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h index 800433685888..d396efca4068 100644 --- a/arch/powerpc/kernel/signal.h +++ b/arch/powerpc/kernel/signal.h @@ -10,8 +10,6 @@ #ifndef _POWERPC_ARCH_SIGNAL_H #define _POWERPC_ARCH_SIGNAL_H -extern void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags); - extern void __user *get_sigframe(struct ksignal *ksig, unsigned long sp, size_t frame_size, int is_32); diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c new file mode 100644 index 000000000000..75be20fdb270 --- /dev/null +++ b/arch/powerpc/kernel/syscall_64.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef long (*syscall_fn)(long, long, long, long, long, long); + +/* Has to run notrace because it is entered "unreconciled" */ +notrace long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, + unsigned long r0, struct pt_regs *regs) +{ + unsigned long ti_flags; + syscall_fn f; + + BUG_ON(!(regs->msr & MSR_PR)); + + account_cpu_user_entry(); + +#ifdef CONFIG_PPC_SPLPAR + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && + firmware_has_feature(FW_FEATURE_SPLPAR)) { + struct lppaca *lp = local_paca->lppaca_ptr; + + if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx))) + accumulate_stolen_time(); + } +#endif + + kuap_check_amr(); + + /* + * A syscall should always be called with interrupts enabled + * so we just unconditionally hard-enable here. When some kind + * of irq tracing is used, we additionally check that condition + * is correct + */ + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { + WARN_ON(irq_soft_mask_return() != IRQS_ENABLED); + WARN_ON(local_paca->irq_happened); + } + /* + * This is not required for the syscall exit path, but makes the + * stack frame look nicer. If this was initialised in the first stack + * frame, or if the unwinder was taught the first stack frame always + * returns to user with IRQS_ENABLED, this store could be avoided! + */ + regs->softe = IRQS_ENABLED; + + __hard_irq_enable(); + + ti_flags = current_thread_info()->flags; + if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) { + /* + * We use the return value of do_syscall_trace_enter() as the + * syscall number. If the syscall was rejected for any reason + * do_syscall_trace_enter() returns an invalid syscall number + * and the test against NR_syscalls will fail and the return + * value to be used is in regs->gpr[3]. + */ + r0 = do_syscall_trace_enter(regs); + if (unlikely(r0 >= NR_syscalls)) + return regs->gpr[3]; + r3 = regs->gpr[3]; + r4 = regs->gpr[4]; + r5 = regs->gpr[5]; + r6 = regs->gpr[6]; + r7 = regs->gpr[7]; + r8 = regs->gpr[8]; + + } else if (unlikely(r0 >= NR_syscalls)) { + return -ENOSYS; + } + + /* May be faster to do array_index_nospec? 
*/ + barrier_nospec(); + + if (unlikely(ti_flags & _TIF_32BIT)) { + f = (void *)compat_sys_call_table[r0]; + + r3 &= 0x00000000ffffffffULL; + r4 &= 0x00000000ffffffffULL; + r5 &= 0x00000000ffffffffULL; + r6 &= 0x00000000ffffffffULL; + r7 &= 0x00000000ffffffffULL; + r8 &= 0x00000000ffffffffULL; + + } else { + f = (void *)sys_call_table[r0]; + } + + return f(r3, r4, r5, r6, r7, r8); +} + +/* + * This should be called after a syscall returns, with r3 the return value + * from the syscall. If this function returns non-zero, the system call + * exit assembly should additionally load all GPR registers and CTR and XER + * from the interrupt frame. + * + * The function graph tracer can not trace the return side of this function, + * because RI=0 and soft mask state is "unreconciled", so it is marked notrace. + */ +notrace unsigned long syscall_exit_prepare(unsigned long r3, + struct pt_regs *regs) +{ + unsigned long *ti_flagsp = &current_thread_info()->flags; + unsigned long ti_flags; + unsigned long ret = 0; + + regs->result = r3; + + /* Check whether the syscall is issued inside a restartable sequence */ + rseq_syscall(regs); + + ti_flags = *ti_flagsp; + + if (unlikely(r3 >= (unsigned long)-MAX_ERRNO)) { + if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) { + r3 = -r3; + regs->ccr |= 0x10000000; /* Set SO bit in CR */ + } + } + + if (unlikely(ti_flags & _TIF_PERSYSCALL_MASK)) { + if (ti_flags & _TIF_RESTOREALL) + ret = _TIF_RESTOREALL; + else + regs->gpr[3] = r3; + clear_bits(_TIF_PERSYSCALL_MASK, ti_flagsp); + } else { + regs->gpr[3] = r3; + } + + if (unlikely(ti_flags & _TIF_SYSCALL_DOTRACE)) { + do_syscall_trace_leave(regs); + ret |= _TIF_RESTOREALL; + } + +again: + local_irq_disable(); + ti_flags = READ_ONCE(*ti_flagsp); + while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { + local_irq_enable(); + if (ti_flags & _TIF_NEED_RESCHED) { + schedule(); + } else { + /* + * SIGPENDING must restore signal handler function + * argument GPRs, and some non-volatiles (e.g., r1). + * Restore all for now. This could be made lighter. + */ + if (ti_flags & _TIF_SIGPENDING) + ret |= _TIF_RESTOREALL; + do_notify_resume(regs, ti_flags); + } + local_irq_disable(); + ti_flags = READ_ONCE(*ti_flagsp); + } + + if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) { + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && + unlikely((ti_flags & _TIF_RESTORE_TM))) { + restore_tm_state(regs); + } else { + unsigned long mathflags = MSR_FP; + + if (cpu_has_feature(CPU_FTR_VSX)) + mathflags |= MSR_VEC | MSR_VSX; + else if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mathflags |= MSR_VEC; + + if ((regs->msr & mathflags) != mathflags) + restore_math(regs); + } + } + + /* This must be done with RI=1 because tracing may touch vmaps */ + trace_hardirqs_on(); + + /* This pattern matches prep_irq_for_idle */ + __hard_EE_RI_disable(); + if (unlikely(lazy_irq_pending())) { + __hard_RI_enable(); + trace_hardirqs_off(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + local_irq_enable(); + /* Took an interrupt which may have more exit work to do.
*/ + goto again; + } + local_paca->irq_happened = 0; + irq_soft_mask_set(IRQS_ENABLED); + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + local_paca->tm_scratch = regs->msr; +#endif + + kuap_check_amr(); + + account_cpu_user_exit(); + + return ret; +} diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 5b905a2f4e4d..d34276f3c495 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -16,25 +16,22 @@ #ifdef CONFIG_PPC64 .p2align 3 +#define __SYSCALL(nr, entry) .8byte entry +#else +#define __SYSCALL(nr, entry) .long entry #endif .globl sys_call_table sys_call_table: #ifdef CONFIG_PPC64 -#define __SYSCALL(nr, entry) .8byte DOTSYM(entry) #include -#undef __SYSCALL #else -#define __SYSCALL(nr, entry) .long entry #include -#undef __SYSCALL #endif #ifdef CONFIG_COMPAT .globl compat_sys_call_table compat_sys_call_table: #define compat_sys_sigsuspend sys_sigsuspend -#define __SYSCALL(nr, entry) .8byte DOTSYM(entry) #include -#undef __SYSCALL #endif -- cgit v1.2.3 From 3282a3da25bd63fdb7240bc35dbdefa4b1947005 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 26 Feb 2020 03:35:36 +1000 Subject: powerpc/64: Implement soft interrupt replay in C When local_irq_enable() finds a pending soft-masked interrupt, it "replays" it by setting up registers like the initial interrupt entry, then calls into the low level handler to set up an interrupt stack frame and process the interrupt. This is not necessary, and uses more stack than needed. The high level interrupt handler can be called directly from C, with just pt_regs set up on stack. This should be faster and use less stack. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200225173541.1549955-28-npiggin@gmail.com --- arch/powerpc/include/asm/hw_irq.h | 1 - arch/powerpc/kernel/exceptions-64e.S | 32 ------- arch/powerpc/kernel/exceptions-64s.S | 47 ---------- arch/powerpc/kernel/irq.c | 165 +++++++++++++++++++++++++++-------- 4 files changed, 130 insertions(+), 115 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 310583e62bd9..0e9a9598f91f 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -52,7 +52,6 @@ #ifndef __ASSEMBLY__ extern void replay_system_reset(void); -extern void __replay_interrupt(unsigned int vector); extern void timer_interrupt(struct pt_regs *); extern void timer_broadcast_interrupt(void); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index e4076e3c072d..4efac5490216 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -1002,38 +1002,6 @@ masked_interrupt_book3e_0x280: masked_interrupt_book3e_0x2c0: masked_interrupt_book3e PACA_IRQ_DBELL 0 -/* - * Called from arch_local_irq_enable when an interrupt needs - * to be resent. r3 contains either 0x500,0x900,0x260 or 0x280 - * to indicate the kind of interrupt. MSR:EE is already off. - * We generate a stackframe like if a real interrupt had happened. - * - * Note: While MSR:EE is off, we need to make sure that _MSR - * in the generated frame has EE set to 1 or the exception - * handler will not properly re-enable them. - */ -_GLOBAL(__replay_interrupt) - /* We are going to jump to the exception common code which - * will retrieve various register values from the PACA which - * we don't give a damn about. 
- */ - mflr r10 - mfmsr r11 - mfcr r4 - mtspr SPRN_SPRG_GEN_SCRATCH,r13; - std r1,PACA_EXGEN+EX_R1(r13); - stw r4,PACA_EXGEN+EX_CR(r13); - ori r11,r11,MSR_EE - subi r1,r1,INT_FRAME_SIZE; - cmpwi cr0,r3,0x500 - beq exc_0x500_common - cmpwi cr0,r3,0x900 - beq exc_0x900_common - cmpwi cr0,r3,0x280 - beq exc_0x280_common - blr - - /* * This is called from 0x300 and 0x400 handlers after the prologs with * r14 and r15 containing the fault address and error code, with the diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d75df223da7c..d6536a7c2a01 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -3165,50 +3165,3 @@ doorbell_super_common_msgclr: LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) PPC_MSGCLRP(3) b doorbell_super_common_virt - -/* - * Called from arch_local_irq_enable when an interrupt needs - * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate - * which kind of interrupt. MSR:EE is already off. We generate a - * stackframe like if a real interrupt had happened. - * - * Note: While MSR:EE is off, we need to make sure that _MSR - * in the generated frame has EE set to 1 or the exception - * handler will not properly re-enable them. - * - * Note that we don't specify LR as the NIP (return address) for - * the interrupt because that would unbalance the return branch - * predictor. - */ -_GLOBAL(__replay_interrupt) - /* We are going to jump to the exception common code which - * will retrieve various register values from the PACA which - * we don't give a damn about, so we don't bother storing them. - */ - mfmsr r12 - LOAD_REG_ADDR(r11, replay_interrupt_return) - mfcr r9 - ori r12,r12,MSR_EE - cmpwi r3,0x900 - beq decrementer_common_virt - cmpwi r3,0x500 -BEGIN_FTR_SECTION - beq h_virt_irq_common_virt -FTR_SECTION_ELSE - beq hardware_interrupt_common_virt -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_300) - cmpwi r3,0xf00 - beq performance_monitor_common_virt -BEGIN_FTR_SECTION - cmpwi r3,0xa00 - beq h_doorbell_common_msgclr - cmpwi r3,0xe60 - beq hmi_exception_common_virt -FTR_SECTION_ELSE - cmpwi r3,0xa00 - beq doorbell_super_common_msgclr -ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -replay_interrupt_return: - blr - -_ASM_NOKPROBE_SYMBOL(__replay_interrupt) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 1bed18b7229e..2e5dca87b936 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -70,6 +70,7 @@ #include #include #include +#include #endif #define CREATE_TRACE_POINTS #include @@ -230,10 +231,121 @@ notrace unsigned int __check_irq_replay(void) return 0; } +static void replay_soft_interrupts(void) +{ + /* + * We use local_paca rather than get_paca() to avoid all + * the debug_smp_processor_id() business in this low level + * function + */ + unsigned char happened = local_paca->irq_happened; + struct pt_regs regs; + + ppc_save_regs(&regs); + regs.softe = IRQS_ALL_DISABLED; + +again: + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(mfmsr() & MSR_EE); + + if (happened & PACA_IRQ_HARD_DIS) { + /* + * We may have missed a decrementer interrupt if hard disabled. + * Check the decrementer register in case we had a rollover + * while hard disabled. + */ + if (!(happened & PACA_IRQ_DEC)) { + if (decrementer_check_overflow()) + happened |= PACA_IRQ_DEC; + } + } + + /* + * Force the delivery of pending soft-disabled interrupts on PS3. + * Any HV call will have this side effect.
+ */ + if (firmware_has_feature(FW_FEATURE_PS3_LV1)) { + u64 tmp, tmp2; + lv1_get_version_info(&tmp, &tmp2); + } + + /* + * Check if an hypervisor Maintenance interrupt happened. + * This is a higher priority interrupt than the others, so + * replay it first. + */ + if (IS_ENABLED(CONFIG_PPC_BOOK3S) && (happened & PACA_IRQ_HMI)) { + local_paca->irq_happened &= ~PACA_IRQ_HMI; + regs.trap = 0xe60; + handle_hmi_exception(®s); + if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) + hard_irq_disable(); + } + + if (happened & PACA_IRQ_DEC) { + local_paca->irq_happened &= ~PACA_IRQ_DEC; + regs.trap = 0x900; + timer_interrupt(®s); + if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) + hard_irq_disable(); + } + + if (happened & PACA_IRQ_EE) { + local_paca->irq_happened &= ~PACA_IRQ_EE; + regs.trap = 0x500; + do_IRQ(®s); + if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) + hard_irq_disable(); + } + + /* + * Check if an EPR external interrupt happened this bit is typically + * set if we need to handle another "edge" interrupt from within the + * MPIC "EPR" handler. + */ + if (IS_ENABLED(CONFIG_PPC_BOOK3E) && (happened & PACA_IRQ_EE_EDGE)) { + local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; + regs.trap = 0x500; + do_IRQ(®s); + if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) + hard_irq_disable(); + } + + if (IS_ENABLED(CONFIG_PPC_DOORBELL) && (happened & PACA_IRQ_DBELL)) { + local_paca->irq_happened &= ~PACA_IRQ_DBELL; + if (IS_ENABLED(CONFIG_PPC_BOOK3E)) + regs.trap = 0x280; + else + regs.trap = 0xa00; + doorbell_exception(®s); + if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) + hard_irq_disable(); + } + + /* Book3E does not support soft-masking PMI interrupts */ + if (IS_ENABLED(CONFIG_PPC_BOOK3S) && (happened & PACA_IRQ_PMI)) { + local_paca->irq_happened &= ~PACA_IRQ_PMI; + regs.trap = 0xf00; + performance_monitor_exception(®s); + if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS)) + hard_irq_disable(); + } + + happened = local_paca->irq_happened; + if (happened & ~PACA_IRQ_HARD_DIS) { + /* + * We are responding to the next interrupt, so interrupt-off + * latencies should be reset here. + */ + trace_hardirqs_on(); + trace_hardirqs_off(); + goto again; + } +} + notrace void arch_local_irq_restore(unsigned long mask) { unsigned char irq_happened; - unsigned int replay; /* Write the new soft-enabled value */ irq_soft_mask_set(mask); @@ -255,24 +367,16 @@ notrace void arch_local_irq_restore(unsigned long mask) */ irq_happened = get_irq_happened(); if (!irq_happened) { -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON_ONCE(!(mfmsr() & MSR_EE)); -#endif + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(!(mfmsr() & MSR_EE)); return; } - /* - * We need to hard disable to get a trusted value from - * __check_irq_replay(). We also need to soft-disable - * again to avoid warnings in there due to the use of - * per-cpu variables. - */ + /* We need to hard disable to replay. */ if (!(irq_happened & PACA_IRQ_HARD_DIS)) { -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG - WARN_ON_ONCE(!(mfmsr() & MSR_EE)); -#endif + if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) + WARN_ON_ONCE(!(mfmsr() & MSR_EE)); __hard_irq_disable(); -#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG } else { /* * We should already be hard disabled here. We had bugs @@ -280,35 +384,26 @@ notrace void arch_local_irq_restore(unsigned long mask) * warn if we are wrong. Only do that when IRQ tracing * is enabled as mfmsr() can be costly. 
From 6cc0c16d82f889f0083f3608237189afb55b67be Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 26 Feb 2020 03:35:37 +1000 Subject: powerpc/64s: Implement interrupt exit logic in C Implement the bulk of interrupt return logic in C. The asm return code must handle a few cases: restoring full GPRs, and emulating stack store. The stack store emulation is significantly simplified: rather than creating a new return frame and switching to that before performing the store, it uses the PACA to keep a scratch register around to perform the store. The asm return code is moved into 64e for now. The new logic has made allowance for 64e, but I don't have a full environment that works well to test it, and even booting in emulated qemu is not great for stress testing. 64e shouldn't be too far off working with this, given a bit more testing and auditing of the logic. This is slightly faster on a POWER9 (page fault speed increases about 1.1%), probably due to reduced mtmsrd. mpe: Includes fixes from Nick for _TIF_EMULATE_STACK_STORE handling (including the fast_interrupt_return path), to remove trace_hardirqs_on(), and fixes the interrupt-return part of the MSR_VSX restore bug caught by tm-unavailable selftest. mpe: Incorporate fix from Nick: The return-to-kernel path has to replay any soft-pending interrupts if it is returning to a context that had interrupts soft-enabled. It has to do this carefully and avoid plain enabling interrupts if this is an irq context, which can cause multiple nesting of interrupts on the stack, and other unexpected issues. The code which avoided this case got the soft-mask state wrong, and marked interrupts as enabled before going around again to retry. This seems to be mostly harmless, except that with PREEMPT=y it calls preempt_schedule_irq with irqs apparently enabled and runs into a BUG in kernel/sched/core.c.
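[Editor's sketch, not part of the commit: the note above describes the return-to-kernel replay bug. The actual fix lives in interrupt_exit_kernel_prepare(), whose body falls outside the hunks shown below, so the shape of this sketch is an assumption and all names in it are invented; only the ordering it illustrates is taken from the text. Soft-pending interrupts must be replayed while the soft-mask state still reads "disabled", and the state may be marked enabled only once nothing is pending, so preempt_schedule_irq() can never observe irqs-on during the retry loop.]

    #include <stdbool.h>

    static bool ctx_irqs_enabled = true; /* interrupted context's soft state */
    static int pending = 2;              /* pretend two replays are needed  */
    static bool soft_irqs_enabled;       /* what irqs_disabled() would see  */

    static void replay_one(void) { pending--; }

    /* Model of the corrected return-to-kernel ordering. */
    static void exit_to_kernel_model(void)
    {
            if (!ctx_irqs_enabled)
                    return;             /* stay soft-disabled, nothing to do */
            while (pending > 0) {
                    /* The buggy version set soft_irqs_enabled = true here,
                     * before going around again; under PREEMPT=y that let
                     * preempt_schedule_irq() run with irqs apparently on. */
                    replay_one();
            }
            soft_irqs_enabled = true;   /* mark enabled only after replay */
    }

    int main(void) { exit_to_kernel_model(); return 0; }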
Signed-off-by: Nicholas Piggin Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200225173541.1549955-29-npiggin@gmail.com --- arch/powerpc/include/asm/asm-prototypes.h | 2 + arch/powerpc/include/asm/book3s/64/kup-radix.h | 10 + arch/powerpc/include/asm/hw_irq.h | 1 + arch/powerpc/include/asm/switch_to.h | 6 + arch/powerpc/kernel/entry_64.S | 487 ++++++------------------- arch/powerpc/kernel/exceptions-64e.S | 255 ++++++++++++- arch/powerpc/kernel/exceptions-64s.S | 119 +++--- arch/powerpc/kernel/irq.c | 36 +- arch/powerpc/kernel/process.c | 89 ++--- arch/powerpc/kernel/syscall_64.c | 171 ++++++++- arch/powerpc/kernel/vector.S | 2 +- 11 files changed, 652 insertions(+), 526 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index ab59a4904254..7d81e86a1e5d 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -99,6 +99,8 @@ void __init machine_init(u64 dt_ptr); #endif long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs); notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs); +notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr); +notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr); long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low); diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h index 71081d90f999..3bcef989a35d 100644 --- a/arch/powerpc/include/asm/book3s/64/kup-radix.h +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -60,6 +60,12 @@ #include #include +static inline void kuap_restore_amr(struct pt_regs *regs) +{ + if (mmu_has_feature(MMU_FTR_RADIX_KUAP)) + mtspr(SPRN_AMR, regs->kuap); +} + static inline void kuap_check_amr(void) { + if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && mmu_has_feature(MMU_FTR_RADIX_KUAP)) @@ -136,6 +142,10 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long address, bool is_write) "Bug: %s fault blocked by AMR!", is_write ?
"Write" : "Read"); } #else /* CONFIG_PPC_KUAP */ +static inline void kuap_restore_amr(struct pt_regs *regs) +{ +} + static inline void kuap_check_amr(void) { } diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 0e9a9598f91f..e0e71777961f 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -52,6 +52,7 @@ #ifndef __ASSEMBLY__ extern void replay_system_reset(void); +extern void replay_soft_interrupts(void); extern void timer_interrupt(struct pt_regs *); extern void timer_broadcast_interrupt(void); diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 476008bc3d08..b867b58b1093 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -23,7 +23,13 @@ extern void switch_booke_debug_regs(struct debug_reg *new_debug); extern int emulate_altivec(struct pt_regs *); +#ifdef CONFIG_PPC_BOOK3S_64 void restore_math(struct pt_regs *regs); +#else +static inline void restore_math(struct pt_regs *regs) +{ +} +#endif void restore_tm_state(struct pt_regs *regs); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 29949bbe857b..5d782acb86d4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -228,6 +229,7 @@ _GLOBAL(ret_from_kernel_thread) li r3,0 b .Lsyscall_exit +#ifdef CONFIG_PPC_BOOK3E /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) ld r11,_TRAP(r1) @@ -238,6 +240,7 @@ _GLOBAL(save_nvgprs) std r0,_TRAP(r1) blr _ASM_NOKPROBE_SYMBOL(save_nvgprs); +#endif #ifdef CONFIG_PPC_BOOK3S_64 @@ -301,7 +304,7 @@ flush_count_cache: * state of one is saved on its kernel stack. Then the state * of the other is restored from its kernel stack. The memory * management hardware is updated to the second process's state. - * Finally, we can return to the second process, via ret_from_except. + * Finally, we can return to the second process, via interrupt_return. * On entry, r3 points to the THREAD for the current task, r4 * points to the THREAD for the new task. * @@ -453,408 +456,152 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) addi r1,r1,SWITCH_FRAME_SIZE blr - .align 7 -_GLOBAL(ret_from_except) - ld r11,_TRAP(r1) - andi. r0,r11,1 - bne ret_from_except_lite - REST_NVGPRS(r1) - -_GLOBAL(ret_from_except_lite) +#ifdef CONFIG_PPC_BOOK3S /* - * Disable interrupts so that current_thread_info()->flags - * can't change between when we test it and when we return - * from the interrupt. - */ -#ifdef CONFIG_PPC_BOOK3E - wrteei 0 -#else - li r10,MSR_RI - mtmsrd r10,1 /* Update machine state */ -#endif /* CONFIG_PPC_BOOK3E */ + * If MSR EE/RI was never enabled, IRQs not reconciled, NVGPRs not + * touched, AMR not set, no exit work created, then this can be used. + */ + .balign IFETCH_ALIGN_BYTES + .globl fast_interrupt_return +fast_interrupt_return: +_ASM_NOKPROBE_SYMBOL(fast_interrupt_return) + ld r4,_MSR(r1) + andi. r0,r4,MSR_PR + bne .Lfast_user_interrupt_return + andi. r0,r4,MSR_RI + li r3,0 /* 0 return value, no EMULATE_STACK_STORE */ + bne+ .Lfast_kernel_interrupt_return + addi r3,r1,STACK_FRAME_OVERHEAD + bl unrecoverable_exception + b . /* should not get here */ - ld r9, PACA_THREAD_INFO(r13) - ld r3,_MSR(r1) -#ifdef CONFIG_PPC_BOOK3E - ld r10,PACACURRENT(r13) -#endif /* CONFIG_PPC_BOOK3E */ - ld r4,TI_FLAGS(r9) - andi. 
r3,r3,MSR_PR - beq resume_kernel -#ifdef CONFIG_PPC_BOOK3E - lwz r3,(THREAD+THREAD_DBCR0)(r10) -#endif /* CONFIG_PPC_BOOK3E */ + .balign IFETCH_ALIGN_BYTES + .globl interrupt_return +interrupt_return: +_ASM_NOKPROBE_SYMBOL(interrupt_return) + REST_NVGPRS(r1) - /* Check current_thread_info()->flags */ - andi. r0,r4,_TIF_USER_WORK_MASK - bne 1f -#ifdef CONFIG_PPC_BOOK3E - /* - * Check to see if the dbcr0 register is set up to debug. - * Use the internal debug mode bit to do this. - */ - andis. r0,r3,DBCR0_IDM@h - beq restore - mfmsr r0 - rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */ - mtmsr r0 - mtspr SPRN_DBCR0,r3 - li r10, -1 - mtspr SPRN_DBSR,r10 - b restore -#else - addi r3,r1,STACK_FRAME_OVERHEAD - bl restore_math - b restore -#endif -1: andi. r0,r4,_TIF_NEED_RESCHED - beq 2f - bl restore_interrupts - SCHEDULE_USER - b ret_from_except_lite -2: -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - andi. r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM - bne 3f /* only restore TM if nothing else to do */ + .balign IFETCH_ALIGN_BYTES + .globl interrupt_return_lite +interrupt_return_lite: +_ASM_NOKPROBE_SYMBOL(interrupt_return_lite) + ld r4,_MSR(r1) + andi. r0,r4,MSR_PR + beq .Lkernel_interrupt_return addi r3,r1,STACK_FRAME_OVERHEAD - bl restore_tm_state - b restore -3: -#endif - bl save_nvgprs - /* - * Use a non volatile GPR to save and restore our thread_info flags - * across the call to restore_interrupts. - */ - mr r30,r4 - bl restore_interrupts - mr r4,r30 - addi r3,r1,STACK_FRAME_OVERHEAD - bl do_notify_resume - b ret_from_except - -resume_kernel: - /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ - andis. r8,r4,_TIF_EMULATE_STACK_STORE@h - beq+ 1f + bl interrupt_exit_user_prepare + cmpdi r3,0 + bne- .Lrestore_nvgprs - addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ +.Lfast_user_interrupt_return: + ld r11,_NIP(r1) + ld r12,_MSR(r1) +BEGIN_FTR_SECTION + ld r10,_PPR(r1) + mtspr SPRN_PPR,r10 +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 - ld r3,GPR1(r1) - subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ - mr r4,r1 /* src: current exception frame */ - mr r1,r3 /* Reroute the trampoline frame to r1 */ +BEGIN_FTR_SECTION + stdcx. r0,0,r1 /* to clear the reservation */ +FTR_SECTION_ELSE + ldarx r0,0,r1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) - /* Copy from the original to the trampoline. */ - li r5,INT_FRAME_SIZE/8 /* size: INT_FRAME_SIZE */ - li r6,0 /* start offset: 0 */ - mtctr r5 -2: ldx r0,r6,r4 - stdx r0,r6,r3 - addi r6,r6,8 - bdnz 2b - - /* Do real store operation to complete stdu */ - ld r5,GPR1(r1) - std r8,0(r5) - - /* Clear _TIF_EMULATE_STACK_STORE flag */ - lis r11,_TIF_EMULATE_STACK_STORE@h - addi r5,r9,TI_FLAGS -0: ldarx r4,0,r5 - andc r4,r4,r11 - stdcx. r4,0,r5 - bne- 0b -1: - -#ifdef CONFIG_PREEMPTION - /* Check if we need to preempt */ - andi. r0,r4,_TIF_NEED_RESCHED - beq+ restore - /* Check that preempt_count() == 0 and interrupts are enabled */ - lwz r8,TI_PREEMPT(r9) - cmpwi cr0,r8,0 - bne restore - ld r0,SOFTE(r1) - andi. r0,r0,IRQS_DISABLED - bne restore + ld r3,_CCR(r1) + ld r4,_LINK(r1) + ld r5,_CTR(r1) + ld r6,_XER(r1) + li r0,0 - /* - * Here we are preempting the current task. We want to make - * sure we are soft-disabled first and reconcile irq state. 
- */ - RECONCILE_IRQ_STATE(r3,r4) - bl preempt_schedule_irq + REST_4GPRS(7, r1) + REST_2GPRS(11, r1) + REST_GPR(13, r1) - /* - * arch_local_irq_restore() from preempt_schedule_irq above may - * enable hard interrupt but we really should disable interrupts - * when we return from the interrupt, and so that we don't get - * interrupted after loading SRR0/1. - */ -#ifdef CONFIG_PPC_BOOK3E - wrteei 0 -#else - li r10,MSR_RI - mtmsrd r10,1 /* Update machine state */ -#endif /* CONFIG_PPC_BOOK3E */ -#endif /* CONFIG_PREEMPTION */ + mtcr r3 + mtlr r4 + mtctr r5 + mtspr SPRN_XER,r6 - .globl fast_exc_return_irq -fast_exc_return_irq: -restore: - /* - * This is the main kernel exit path. First we check if we - * are about to re-enable interrupts - */ - ld r5,SOFTE(r1) - lbz r6,PACAIRQSOFTMASK(r13) - andi. r5,r5,IRQS_DISABLED - bne .Lrestore_irq_off + REST_4GPRS(2, r1) + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + RFI_TO_USER + b . /* prevent speculative execution */ - /* We are enabling, were we already enabled ? Yes, just return */ - andi. r6,r6,IRQS_DISABLED - beq cr0,.Ldo_restore +.Lrestore_nvgprs: + REST_NVGPRS(r1) + b .Lfast_user_interrupt_return - /* - * We are about to soft-enable interrupts (we are hard disabled - * at this point). We check if there's anything that needs to - * be replayed first. - */ - lbz r0,PACAIRQHAPPENED(r13) - cmpwi cr0,r0,0 - bne- .Lrestore_check_irq_replay + .balign IFETCH_ALIGN_BYTES +.Lkernel_interrupt_return: + addi r3,r1,STACK_FRAME_OVERHEAD + bl interrupt_exit_kernel_prepare - /* - * Get here when nothing happened while soft-disabled, just - * soft-enable and move-on. We will hard-enable as a side - * effect of rfi - */ -.Lrestore_no_replay: - TRACE_ENABLE_INTS - li r0,IRQS_ENABLED - stb r0,PACAIRQSOFTMASK(r13); +.Lfast_kernel_interrupt_return: + cmpdi cr1,r3,0 + ld r11,_NIP(r1) + ld r12,_MSR(r1) + mtspr SPRN_SRR0,r11 + mtspr SPRN_SRR1,r12 - /* - * Final return path. BookE is handled in a different file - */ -.Ldo_restore: -#ifdef CONFIG_PPC_BOOK3E - b exception_return_book3e -#else - /* - * Clear the reservation. If we know the CPU tracks the address of - * the reservation then we can potentially save some cycles and use - * a larx. On POWER6 and POWER7 this is significantly faster. - */ BEGIN_FTR_SECTION stdcx. r0,0,r1 /* to clear the reservation */ FTR_SECTION_ELSE - ldarx r4,0,r1 + ldarx r0,0,r1 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) - /* - * Some code path such as load_up_fpu or altivec return directly - * here. They run entirely hard disabled and do not alter the - * interrupt state. They also don't use lwarx/stwcx. and thus - * are known not to leave dangling reservations. - */ - .globl fast_exception_return -fast_exception_return: - ld r3,_MSR(r1) + ld r3,_LINK(r1) ld r4,_CTR(r1) - ld r0,_LINK(r1) - mtctr r4 - mtlr r0 - ld r4,_XER(r1) - mtspr SPRN_XER,r4 - - kuap_check_amr r5, r6 - - REST_8GPRS(5, r1) - - andi. r0,r3,MSR_RI - beq- .Lunrecov_restore - - /* - * Clear RI before restoring r13. If we are returning to - * userspace and we take an exception after restoring r13, - * we end up corrupting the userspace r13 value. - */ - li r4,0 - mtmsrd r4,1 - -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* TM debug */ - std r3, PACATMSCRATCH(r13) /* Stash returned-to MSR */ -#endif - /* - * r13 is our per cpu area, only restore it if we are returning to - * userspace the value stored in the stack frame may belong to - * another CPU. - */ - andi. 
r0,r3,MSR_PR - beq 1f -BEGIN_FTR_SECTION - /* Restore PPR */ - ld r2,_PPR(r1) - mtspr SPRN_PPR,r2 -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - ACCOUNT_CPU_USER_EXIT(r13, r2, r4) - REST_GPR(13, r1) - - /* - * We don't need to restore AMR on the way back to userspace for KUAP. - * The value of AMR only matters while we're in the kernel. - */ - mtspr SPRN_SRR1,r3 - - ld r2,_CCR(r1) - mtcrf 0xFF,r2 - ld r2,_NIP(r1) - mtspr SPRN_SRR0,r2 - - ld r0,GPR0(r1) - ld r2,GPR2(r1) - ld r3,GPR3(r1) - ld r4,GPR4(r1) - ld r1,GPR1(r1) - RFI_TO_USER - b . /* prevent speculative execution */ + ld r5,_XER(r1) + ld r6,_CCR(r1) + li r0,0 -1: mtspr SPRN_SRR1,r3 + REST_4GPRS(7, r1) + REST_2GPRS(11, r1) - ld r2,_CCR(r1) - mtcrf 0xFF,r2 - ld r2,_NIP(r1) - mtspr SPRN_SRR0,r2 + mtlr r3 + mtctr r4 + mtspr SPRN_XER,r5 /* * Leaving a stale exception_marker on the stack can confuse * the reliable stack unwinder later on. Clear it. */ - li r2,0 - std r2,STACK_FRAME_OVERHEAD-16(r1) + std r0,STACK_FRAME_OVERHEAD-16(r1) - ld r0,GPR0(r1) - ld r2,GPR2(r1) - ld r3,GPR3(r1) + REST_4GPRS(2, r1) - kuap_restore_amr r4 - - ld r4,GPR4(r1) - ld r1,GPR1(r1) + bne- cr1,1f /* emulate stack store */ + mtcr r6 + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) RFI_TO_KERNEL b . /* prevent speculative execution */ -#endif /* CONFIG_PPC_BOOK3E */ - - /* - * We are returning to a context with interrupts soft disabled. - * - * However, we may also about to hard enable, so we need to - * make sure that in this case, we also clear PACA_IRQ_HARD_DIS - * or that bit can get out of sync and bad things will happen - */ -.Lrestore_irq_off: - ld r3,_MSR(r1) - lbz r7,PACAIRQHAPPENED(r13) - andi. r0,r3,MSR_EE - beq 1f - rlwinm r7,r7,0,~PACA_IRQ_HARD_DIS - stb r7,PACAIRQHAPPENED(r13) -1: -#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG) - /* The interrupt should not have soft enabled. */ - lbz r7,PACAIRQSOFTMASK(r13) -1: tdeqi r7,IRQS_ENABLED - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING -#endif - b .Ldo_restore - - /* - * Something did happen, check if a re-emit is needed - * (this also clears paca->irq_happened) - */ -.Lrestore_check_irq_replay: - /* XXX: We could implement a fast path here where we check - * for irq_happened being just 0x01, in which case we can - * clear it and return. That means that we would potentially - * miss a decrementer having wrapped all the way around. - * - * Still, this might be useful for things like hash_page - */ - bl __check_irq_replay - cmpwi cr0,r3,0 - beq .Lrestore_no_replay - - /* - * We need to re-emit an interrupt. We do so by re-using our - * existing exception frame. We first change the trap value, - * but we need to ensure we preserve the low nibble of it - */ - ld r4,_TRAP(r1) - clrldi r4,r4,60 - or r4,r4,r3 - std r4,_TRAP(r1) - - /* - * PACA_IRQ_HARD_DIS won't always be set here, so set it now - * to reconcile the IRQ state. Tracing is already accounted for. - */ - lbz r4,PACAIRQHAPPENED(r13) - ori r4,r4,PACA_IRQ_HARD_DIS - stb r4,PACAIRQHAPPENED(r13) - - /* - * Then find the right handler and call it. Interrupts are - * still soft-disabled and we keep them that way. 
- */ - cmpwi cr0,r3,0x500 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl do_IRQ - b ret_from_except -1: cmpwi cr0,r3,0xf00 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl performance_monitor_exception - b ret_from_except -1: cmpwi cr0,r3,0xe60 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl handle_hmi_exception - b ret_from_except -1: cmpwi cr0,r3,0x900 - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl timer_interrupt - b ret_from_except -#ifdef CONFIG_PPC_DOORBELL -1: -#ifdef CONFIG_PPC_BOOK3E - cmpwi cr0,r3,0x280 -#else - cmpwi cr0,r3,0xa00 -#endif /* CONFIG_PPC_BOOK3E */ - bne 1f - addi r3,r1,STACK_FRAME_OVERHEAD; - bl doorbell_exception -#endif /* CONFIG_PPC_DOORBELL */ -1: b ret_from_except /* What else to do here ? */ - -.Lunrecov_restore: - addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b .Lunrecov_restore - -_ASM_NOKPROBE_SYMBOL(ret_from_except); -_ASM_NOKPROBE_SYMBOL(ret_from_except_lite); -_ASM_NOKPROBE_SYMBOL(resume_kernel); -_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq); -_ASM_NOKPROBE_SYMBOL(restore); -_ASM_NOKPROBE_SYMBOL(fast_exception_return); +1: /* + * Emulate stack store with update. New r1 value was already calculated + * and updated in our interrupt regs by emulate_loadstore, but we can't + * store the previous value of r1 to the stack before re-loading our + * registers from it, otherwise they could be clobbered. Use + * PACA_EXGEN as temporary storage to hold the store data, as + * interrupts are disabled here so it won't be clobbered. + */ + mtcr r6 + std r9,PACA_EXGEN+0(r13) + addi r9,r1,INT_FRAME_SIZE /* get original r1 */ + REST_GPR(6, r1) + REST_GPR(0, r1) + REST_GPR(1, r1) + std r9,0(r1) /* perform store component of stdu */ + ld r9,PACA_EXGEN+0(r13) + RFI_TO_KERNEL + b . /* prevent speculative execution */ +#endif /* CONFIG_PPC_BOOK3S */ #ifdef CONFIG_PPC_RTAS /* diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 4efac5490216..d9ed79415100 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -24,6 +24,7 @@ #include #include #include +#include /* XXX This will ultimately add space for a special exception save * structure used to save things like SRR0/SRR1, SPRGs, MAS, etc... @@ -1041,17 +1042,161 @@ alignment_more: bl alignment_exception b ret_from_except -/* - * We branch here from entry_64.S for the last stage of the exception - * return code path. MSR:EE is expected to be off at that point - */ -_GLOBAL(exception_return_book3e) - b 1f + .align 7 +_GLOBAL(ret_from_except) + ld r11,_TRAP(r1) + andi. r0,r11,1 + bne ret_from_except_lite + REST_NVGPRS(r1) + +_GLOBAL(ret_from_except_lite) + /* + * Disable interrupts so that current_thread_info()->flags + * can't change between when we test it and when we return + * from the interrupt. + */ + wrteei 0 + + ld r9, PACA_THREAD_INFO(r13) + ld r3,_MSR(r1) + ld r10,PACACURRENT(r13) + ld r4,TI_FLAGS(r9) + andi. r3,r3,MSR_PR + beq resume_kernel + lwz r3,(THREAD+THREAD_DBCR0)(r10) + + /* Check current_thread_info()->flags */ + andi. r0,r4,_TIF_USER_WORK_MASK + bne 1f + /* + * Check to see if the dbcr0 register is set up to debug. + * Use the internal debug mode bit to do this. + */ + andis. r0,r3,DBCR0_IDM@h + beq restore + mfmsr r0 + rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */ + mtmsr r0 + mtspr SPRN_DBCR0,r3 + li r10, -1 + mtspr SPRN_DBSR,r10 + b restore +1: andi. 
r0,r4,_TIF_NEED_RESCHED + beq 2f + bl restore_interrupts + SCHEDULE_USER + b ret_from_except_lite +2: + bl save_nvgprs + /* + * Use a non volatile GPR to save and restore our thread_info flags + * across the call to restore_interrupts. + */ + mr r30,r4 + bl restore_interrupts + mr r4,r30 + addi r3,r1,STACK_FRAME_OVERHEAD + bl do_notify_resume + b ret_from_except + +resume_kernel: + /* check current_thread_info, _TIF_EMULATE_STACK_STORE */ + andis. r8,r4,_TIF_EMULATE_STACK_STORE@h + beq+ 1f + + addi r8,r1,INT_FRAME_SIZE /* Get the kprobed function entry */ + + ld r3,GPR1(r1) + subi r3,r3,INT_FRAME_SIZE /* dst: Allocate a trampoline exception frame */ + mr r4,r1 /* src: current exception frame */ + mr r1,r3 /* Reroute the trampoline frame to r1 */ + + /* Copy from the original to the trampoline. */ + li r5,INT_FRAME_SIZE/8 /* size: INT_FRAME_SIZE */ + li r6,0 /* start offset: 0 */ + mtctr r5 +2: ldx r0,r6,r4 + stdx r0,r6,r3 + addi r6,r6,8 + bdnz 2b + + /* Do real store operation to complete stdu */ + ld r5,GPR1(r1) + std r8,0(r5) + + /* Clear _TIF_EMULATE_STACK_STORE flag */ + lis r11,_TIF_EMULATE_STACK_STORE@h + addi r5,r9,TI_FLAGS +0: ldarx r4,0,r5 + andc r4,r4,r11 + stdcx. r4,0,r5 + bne- 0b +1: + +#ifdef CONFIG_PREEMPT + /* Check if we need to preempt */ + andi. r0,r4,_TIF_NEED_RESCHED + beq+ restore + /* Check that preempt_count() == 0 and interrupts are enabled */ + lwz r8,TI_PREEMPT(r9) + cmpwi cr0,r8,0 + bne restore + ld r0,SOFTE(r1) + andi. r0,r0,IRQS_DISABLED + bne restore + + /* + * Here we are preempting the current task. We want to make + * sure we are soft-disabled first and reconcile irq state. + */ + RECONCILE_IRQ_STATE(r3,r4) + bl preempt_schedule_irq + + /* + * arch_local_irq_restore() from preempt_schedule_irq above may + * enable hard interrupt but we really should disable interrupts + * when we return from the interrupt, and so that we don't get + * interrupted after loading SRR0/1. + */ + wrteei 0 +#endif /* CONFIG_PREEMPT */ + +restore: + /* + * This is the main kernel exit path. First we check if we + * are about to re-enable interrupts + */ + ld r5,SOFTE(r1) + lbz r6,PACAIRQSOFTMASK(r13) + andi. r5,r5,IRQS_DISABLED + bne .Lrestore_irq_off + + /* We are enabling, were we already enabled ? Yes, just return */ + andi. r6,r6,IRQS_DISABLED + beq cr0,fast_exception_return + + /* + * We are about to soft-enable interrupts (we are hard disabled + * at this point). We check if there's anything that needs to + * be replayed first. + */ + lbz r0,PACAIRQHAPPENED(r13) + cmpwi cr0,r0,0 + bne- .Lrestore_check_irq_replay + + /* + * Get here when nothing happened while soft-disabled, just + * soft-enable and move-on. We will hard-enable as a side + * effect of rfi + */ +.Lrestore_no_replay: + TRACE_ENABLE_INTS + li r0,IRQS_ENABLED + stb r0,PACAIRQSOFTMASK(r13); /* This is the return from load_up_fpu fast path which could do with * less GPR restores in fact, but for now we have a single return path */ - .globl fast_exception_return fast_exception_return: wrteei 0 1: mr r0,r13 @@ -1092,6 +1237,102 @@ fast_exception_return: mfspr r13,SPRN_SPRG_GEN_SCRATCH rfi + /* + * We are returning to a context with interrupts soft disabled. + * + * However, we may also about to hard enable, so we need to + * make sure that in this case, we also clear PACA_IRQ_HARD_DIS + * or that bit can get out of sync and bad things will happen + */ +.Lrestore_irq_off: + ld r3,_MSR(r1) + lbz r7,PACAIRQHAPPENED(r13) + andi. 
r0,r3,MSR_EE + beq 1f + rlwinm r7,r7,0,~PACA_IRQ_HARD_DIS + stb r7,PACAIRQHAPPENED(r13) +1: +#if defined(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && defined(CONFIG_BUG) + /* The interrupt should not have soft enabled. */ + lbz r7,PACAIRQSOFTMASK(r13) +1: tdeqi r7,IRQS_ENABLED + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING +#endif + b fast_exception_return + + /* + * Something did happen, check if a re-emit is needed + * (this also clears paca->irq_happened) + */ +.Lrestore_check_irq_replay: + /* XXX: We could implement a fast path here where we check + * for irq_happened being just 0x01, in which case we can + * clear it and return. That means that we would potentially + * miss a decrementer having wrapped all the way around. + * + * Still, this might be useful for things like hash_page + */ + bl __check_irq_replay + cmpwi cr0,r3,0 + beq .Lrestore_no_replay + + /* + * We need to re-emit an interrupt. We do so by re-using our + * existing exception frame. We first change the trap value, + * but we need to ensure we preserve the low nibble of it + */ + ld r4,_TRAP(r1) + clrldi r4,r4,60 + or r4,r4,r3 + std r4,_TRAP(r1) + + /* + * PACA_IRQ_HARD_DIS won't always be set here, so set it now + * to reconcile the IRQ state. Tracing is already accounted for. + */ + lbz r4,PACAIRQHAPPENED(r13) + ori r4,r4,PACA_IRQ_HARD_DIS + stb r4,PACAIRQHAPPENED(r13) + + /* + * Then find the right handler and call it. Interrupts are + * still soft-disabled and we keep them that way. + */ + cmpwi cr0,r3,0x500 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl do_IRQ + b ret_from_except +1: cmpwi cr0,r3,0xf00 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl performance_monitor_exception + b ret_from_except +1: cmpwi cr0,r3,0xe60 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl handle_hmi_exception + b ret_from_except +1: cmpwi cr0,r3,0x900 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl timer_interrupt + b ret_from_except +#ifdef CONFIG_PPC_DOORBELL +1: + cmpwi cr0,r3,0x280 + bne 1f + addi r3,r1,STACK_FRAME_OVERHEAD; + bl doorbell_exception +#endif /* CONFIG_PPC_DOORBELL */ +1: b ret_from_except /* What else to do here ? */ + +_ASM_NOKPROBE_SYMBOL(ret_from_except); +_ASM_NOKPROBE_SYMBOL(ret_from_except_lite); +_ASM_NOKPROBE_SYMBOL(resume_kernel); +_ASM_NOKPROBE_SYMBOL(restore); +_ASM_NOKPROBE_SYMBOL(fast_exception_return); + /* * Trampolines used when spotting a bad kernel stack pointer in * the exception entry code. diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d6536a7c2a01..11244031adc5 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -589,6 +589,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r10,GPR12(r1) std r11,GPR13(r1) + SAVE_NVGPRS(r1) + .if IDAR .if IISIDE ld r10,_NIP(r1) @@ -625,7 +627,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) mfspr r11,SPRN_XER /* save XER in stackframe */ std r10,SOFTE(r1) std r11,_XER(r1) - li r9,(IVEC)+1 + li r9,IVEC std r9,_TRAP(r1) /* set trap number */ li r10,0 ld r11,exception_marker@toc(r2) @@ -932,7 +934,6 @@ EXC_COMMON_BEGIN(system_reset_common) ld r1,PACA_NMI_EMERG_SP(r13) subi r1,r1,INT_FRAME_SIZE __GEN_COMMON_BODY system_reset - bl save_nvgprs /* * Set IRQS_ALL_DISABLED unconditionally so irqs_disabled() does * the right thing. 
We do not want to reconcile because that goes @@ -1115,7 +1116,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) li r10,MSR_RI mtmsrd r10,1 - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_early std r3,RESULT(r1) /* Save result */ @@ -1208,10 +1208,9 @@ EXC_COMMON_BEGIN(machine_check_common) /* Enable MSR_RI when finished with PACA_EXMC */ li r10,MSR_RI mtmsrd r10,1 - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl machine_check_exception - b ret_from_except + b interrupt_return GEN_KVM machine_check @@ -1378,20 +1377,19 @@ BEGIN_MMU_FTR_SECTION bl do_slb_fault cmpdi r3,0 bne- 1f - b fast_exception_return + b fast_interrupt_return 1: /* Error case */ MMU_FTR_SECTION_ELSE /* Radix case, access is outside page table range */ li r3,-EFAULT ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) - bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) ld r4,_DAR(r1) ld r5,RESULT(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_bad_slb_fault - b ret_from_except + b interrupt_return GEN_KVM data_access_slb @@ -1471,20 +1469,19 @@ BEGIN_MMU_FTR_SECTION bl do_slb_fault cmpdi r3,0 bne- 1f - b fast_exception_return + b fast_interrupt_return 1: /* Error case */ MMU_FTR_SECTION_ELSE /* Radix case, access is outside page table range */ li r3,-EFAULT ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) - bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) ld r4,_DAR(r1) ld r5,RESULT(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_bad_slb_fault - b ret_from_except + b interrupt_return GEN_KVM instruction_access_slb @@ -1532,7 +1529,7 @@ EXC_COMMON_BEGIN(hardware_interrupt_common) RUNLATCH_ON addi r3,r1,STACK_FRAME_OVERHEAD bl do_IRQ - b ret_from_except_lite + b interrupt_return_lite GEN_KVM hardware_interrupt @@ -1558,10 +1555,9 @@ EXC_VIRT_BEGIN(alignment, 0x4600, 0x100) EXC_VIRT_END(alignment, 0x4600, 0x100) EXC_COMMON_BEGIN(alignment_common) GEN_COMMON alignment - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl alignment_exception - b ret_from_except + b interrupt_return GEN_KVM alignment @@ -1622,10 +1618,9 @@ EXC_COMMON_BEGIN(program_check_common) __ISTACK(program_check)=1 __GEN_COMMON_BODY program_check 3: - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl program_check_exception - b ret_from_except + b interrupt_return GEN_KVM program_check @@ -1656,7 +1651,6 @@ EXC_VIRT_END(fp_unavailable, 0x4800, 0x100) EXC_COMMON_BEGIN(fp_unavailable_common) GEN_COMMON fp_unavailable bne 1f /* if from user, just load it up */ - bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_fp_unavailable_exception @@ -1673,14 +1667,13 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif bl load_up_fpu - b fast_exception_return + b fast_interrupt_return #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2: /* User process was in a transaction */ - bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl fp_unavailable_tm - b ret_from_except + b interrupt_return #endif GEN_KVM fp_unavailable @@ -1723,7 +1716,7 @@ EXC_COMMON_BEGIN(decrementer_common) RUNLATCH_ON addi r3,r1,STACK_FRAME_OVERHEAD bl timer_interrupt - b ret_from_except_lite + b interrupt_return_lite GEN_KVM decrementer @@ -1814,7 +1807,7 @@ EXC_COMMON_BEGIN(doorbell_super_common) #else bl unknown_exception #endif - b ret_from_except_lite + b interrupt_return_lite GEN_KVM doorbell_super @@ -1986,10 +1979,9 @@ EXC_VIRT_BEGIN(single_step, 0x4d00, 0x100) EXC_VIRT_END(single_step, 0x4d00, 0x100) EXC_COMMON_BEGIN(single_step_common) GEN_COMMON single_step - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD 
bl single_step_exception - b ret_from_except + b interrupt_return GEN_KVM single_step @@ -2024,7 +2016,6 @@ EXC_VIRT_BEGIN(h_data_storage, 0x4e00, 0x20) EXC_VIRT_END(h_data_storage, 0x4e00, 0x20) EXC_COMMON_BEGIN(h_data_storage_common) GEN_COMMON h_data_storage - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD BEGIN_MMU_FTR_SECTION ld r4,_DAR(r1) @@ -2033,7 +2024,7 @@ BEGIN_MMU_FTR_SECTION MMU_FTR_SECTION_ELSE bl unknown_exception ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_TYPE_RADIX) - b ret_from_except + b interrupt_return GEN_KVM h_data_storage @@ -2058,10 +2049,9 @@ EXC_VIRT_BEGIN(h_instr_storage, 0x4e20, 0x20) EXC_VIRT_END(h_instr_storage, 0x4e20, 0x20) EXC_COMMON_BEGIN(h_instr_storage_common) GEN_COMMON h_instr_storage - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl unknown_exception - b ret_from_except + b interrupt_return GEN_KVM h_instr_storage @@ -2084,10 +2074,9 @@ EXC_VIRT_BEGIN(emulation_assist, 0x4e40, 0x20) EXC_VIRT_END(emulation_assist, 0x4e40, 0x20) EXC_COMMON_BEGIN(emulation_assist_common) GEN_COMMON emulation_assist - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl emulation_assist_interrupt - b ret_from_except + b interrupt_return GEN_KVM emulation_assist @@ -2169,10 +2158,9 @@ EXC_COMMON_BEGIN(hmi_exception_common) GEN_COMMON hmi_exception FINISH_NAP RUNLATCH_ON - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl handle_hmi_exception - b ret_from_except + b interrupt_return GEN_KVM hmi_exception @@ -2206,7 +2194,7 @@ EXC_COMMON_BEGIN(h_doorbell_common) #else bl unknown_exception #endif - b ret_from_except_lite + b interrupt_return_lite GEN_KVM h_doorbell @@ -2236,7 +2224,7 @@ EXC_COMMON_BEGIN(h_virt_irq_common) RUNLATCH_ON addi r3,r1,STACK_FRAME_OVERHEAD bl do_IRQ - b ret_from_except_lite + b interrupt_return_lite GEN_KVM h_virt_irq @@ -2283,7 +2271,7 @@ EXC_COMMON_BEGIN(performance_monitor_common) RUNLATCH_ON addi r3,r1,STACK_FRAME_OVERHEAD bl performance_monitor_exception - b ret_from_except_lite + b interrupt_return_lite GEN_KVM performance_monitor @@ -2323,23 +2311,21 @@ BEGIN_FTR_SECTION END_FTR_SECTION_NESTED(CPU_FTR_TM, CPU_FTR_TM, 69) #endif bl load_up_altivec - b fast_exception_return + b fast_interrupt_return #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2: /* User process was in a transaction */ - bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl altivec_unavailable_tm - b ret_from_except + b interrupt_return #endif 1: END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif - bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl altivec_unavailable_exception - b ret_from_except + b interrupt_return GEN_KVM altivec_unavailable @@ -2381,20 +2367,18 @@ BEGIN_FTR_SECTION b load_up_vsx #ifdef CONFIG_PPC_TRANSACTIONAL_MEM 2: /* User process was in a transaction */ - bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl vsx_unavailable_tm - b ret_from_except + b interrupt_return #endif 1: END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD bl vsx_unavailable_exception - b ret_from_except + b interrupt_return GEN_KVM vsx_unavailable @@ -2421,10 +2405,9 @@ EXC_VIRT_BEGIN(facility_unavailable, 0x4f60, 0x20) EXC_VIRT_END(facility_unavailable, 0x4f60, 0x20) EXC_COMMON_BEGIN(facility_unavailable_common) GEN_COMMON facility_unavailable - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl facility_unavailable_exception - b ret_from_except + b interrupt_return GEN_KVM facility_unavailable @@ -2451,10 +2434,9 @@ 
EXC_VIRT_BEGIN(h_facility_unavailable, 0x4f80, 0x20) EXC_VIRT_END(h_facility_unavailable, 0x4f80, 0x20) EXC_COMMON_BEGIN(h_facility_unavailable_common) GEN_COMMON h_facility_unavailable - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl facility_unavailable_exception - b ret_from_except + b interrupt_return GEN_KVM h_facility_unavailable @@ -2485,10 +2467,9 @@ EXC_REAL_END(cbe_system_error, 0x1200, 0x100) EXC_VIRT_NONE(0x5200, 0x100) EXC_COMMON_BEGIN(cbe_system_error_common) GEN_COMMON cbe_system_error - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl cbe_system_error_exception - b ret_from_except + b interrupt_return GEN_KVM cbe_system_error @@ -2514,10 +2495,9 @@ EXC_VIRT_BEGIN(instruction_breakpoint, 0x5300, 0x100) EXC_VIRT_END(instruction_breakpoint, 0x5300, 0x100) EXC_COMMON_BEGIN(instruction_breakpoint_common) GEN_COMMON instruction_breakpoint - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl instruction_breakpoint_exception - b ret_from_except + b interrupt_return GEN_KVM instruction_breakpoint @@ -2637,10 +2617,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) EXC_COMMON_BEGIN(denorm_exception_common) GEN_COMMON denorm_exception - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl unknown_exception - b ret_from_except + b interrupt_return GEN_KVM denorm_exception @@ -2659,10 +2638,9 @@ EXC_REAL_END(cbe_maintenance, 0x1600, 0x100) EXC_VIRT_NONE(0x5600, 0x100) EXC_COMMON_BEGIN(cbe_maintenance_common) GEN_COMMON cbe_maintenance - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl cbe_maintenance_exception - b ret_from_except + b interrupt_return GEN_KVM cbe_maintenance @@ -2687,14 +2665,13 @@ EXC_VIRT_BEGIN(altivec_assist, 0x5700, 0x100) EXC_VIRT_END(altivec_assist, 0x5700, 0x100) EXC_COMMON_BEGIN(altivec_assist_common) GEN_COMMON altivec_assist - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_ALTIVEC bl altivec_assist_exception #else bl unknown_exception #endif - b ret_from_except + b interrupt_return GEN_KVM altivec_assist @@ -2713,10 +2690,9 @@ EXC_REAL_END(cbe_thermal, 0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) EXC_COMMON_BEGIN(cbe_thermal_common) GEN_COMMON cbe_thermal - bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl cbe_thermal_exception - b ret_from_except + b interrupt_return GEN_KVM cbe_thermal @@ -2749,7 +2725,6 @@ EXC_COMMON_BEGIN(soft_nmi_common) ld r1,PACAEMERGSP(r13) subi r1,r1,INT_FRAME_SIZE __GEN_COMMON_BODY soft_nmi - bl save_nvgprs /* * Set IRQS_ALL_DISABLED and save PACAIRQHAPPENED (see @@ -3082,7 +3057,7 @@ do_hash_page: cmpdi r3,0 /* see if __hash_page succeeded */ /* Success */ - beq fast_exc_return_irq /* Return from exception on success */ + beq interrupt_return_lite /* Return from exception on success */ /* Error */ blt- 13f @@ -3099,17 +3074,15 @@ handle_page_fault: addi r3,r1,STACK_FRAME_OVERHEAD bl do_page_fault cmpdi r3,0 - beq+ ret_from_except_lite - bl save_nvgprs + beq+ interrupt_return_lite mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD ld r4,_DAR(r1) bl bad_page_fault - b ret_from_except + b interrupt_return /* We have a data breakpoint exception - handle it */ handle_dabr_fault: - bl save_nvgprs ld r4,_DAR(r1) ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD @@ -3117,21 +3090,20 @@ handle_dabr_fault: /* * do_break() may have changed the NV GPRS while handling a breakpoint. * If so, we need to restore them with their updated values. Don't use - * ret_from_except_lite here. + * interrupt_return_lite here. 
*/ - b ret_from_except + b interrupt_return #ifdef CONFIG_PPC_BOOK3S_64 /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ -13: bl save_nvgprs - mr r5,r3 +13: mr r5,r3 addi r3,r1,STACK_FRAME_OVERHEAD ld r4,_DAR(r1) bl low_hash_fault - b ret_from_except + b interrupt_return #endif /* @@ -3141,11 +3113,10 @@ handle_dabr_fault: * were soft-disabled. We want to invoke the exception handler for * the access, or panic if there isn't a handler. */ -77: bl save_nvgprs - addi r3,r1,STACK_FRAME_OVERHEAD +77: addi r3,r1,STACK_FRAME_OVERHEAD li r5,SIGSEGV bl bad_page_fault - b ret_from_except + b interrupt_return /* * When doorbell is triggered from system reset wakeup, the message is diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 2e5dca87b936..a25ed47087ee 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -110,6 +110,8 @@ static inline notrace int decrementer_check_overflow(void) return now >= *next_tb; } +#ifdef CONFIG_PPC_BOOK3E + /* This is called whenever we are re-enabling interrupts * and returns either 0 (nothing to do) or 500/900/280/a00/e80 if * there's an EE, DEC or DBELL to generate. @@ -169,41 +171,16 @@ notrace unsigned int __check_irq_replay(void) } } - /* - * Force the delivery of pending soft-disabled interrupts on PS3. - * Any HV call will have this side effect. - */ - if (firmware_has_feature(FW_FEATURE_PS3_LV1)) { - u64 tmp, tmp2; - lv1_get_version_info(&tmp, &tmp2); - } - - /* - * Check if an hypervisor Maintenance interrupt happened. - * This is a higher priority interrupt than the others, so - * replay it first. - */ - if (happened & PACA_IRQ_HMI) { - local_paca->irq_happened &= ~PACA_IRQ_HMI; - return 0xe60; - } - if (happened & PACA_IRQ_DEC) { local_paca->irq_happened &= ~PACA_IRQ_DEC; return 0x900; } - if (happened & PACA_IRQ_PMI) { - local_paca->irq_happened &= ~PACA_IRQ_PMI; - return 0xf00; - } - if (happened & PACA_IRQ_EE) { local_paca->irq_happened &= ~PACA_IRQ_EE; return 0x500; } -#ifdef CONFIG_PPC_BOOK3E /* * Check if an EPR external interrupt happened this bit is typically * set if we need to handle another "edge" interrupt from within the @@ -218,20 +195,15 @@ notrace unsigned int __check_irq_replay(void) local_paca->irq_happened &= ~PACA_IRQ_DBELL; return 0x280; } -#else - if (happened & PACA_IRQ_DBELL) { - local_paca->irq_happened &= ~PACA_IRQ_DBELL; - return 0xa00; - } -#endif /* CONFIG_PPC_BOOK3E */ /* There should be nothing left ! 
*/ BUG_ON(local_paca->irq_happened != 0); return 0; } +#endif /* CONFIG_PPC_BOOK3E */ -static void replay_soft_interrupts(void) +void replay_soft_interrupts(void) { /* * We use local_paca rather than get_paca() to avoid all diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 009833f928bf..9c21288f8645 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -236,23 +236,9 @@ void enable_kernel_fp(void) } } EXPORT_SYMBOL(enable_kernel_fp); - -static int restore_fp(struct task_struct *tsk) -{ - if (tsk->thread.load_fp) { - load_fp_state(¤t->thread.fp_state); - current->thread.load_fp++; - return 1; - } - return 0; -} -#else -static int restore_fp(struct task_struct *tsk) { return 0; } #endif /* CONFIG_PPC_FPU */ #ifdef CONFIG_ALTIVEC -#define loadvec(thr) ((thr).load_vec) - static void __giveup_altivec(struct task_struct *tsk) { unsigned long msr; @@ -318,21 +304,6 @@ void flush_altivec_to_thread(struct task_struct *tsk) } } EXPORT_SYMBOL_GPL(flush_altivec_to_thread); - -static int restore_altivec(struct task_struct *tsk) -{ - if (cpu_has_feature(CPU_FTR_ALTIVEC) && (tsk->thread.load_vec)) { - load_vr_state(&tsk->thread.vr_state); - tsk->thread.used_vr = 1; - tsk->thread.load_vec++; - - return 1; - } - return 0; -} -#else -#define loadvec(thr) 0 -static inline int restore_altivec(struct task_struct *tsk) { return 0; } #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_VSX @@ -400,18 +371,6 @@ void flush_vsx_to_thread(struct task_struct *tsk) } } EXPORT_SYMBOL_GPL(flush_vsx_to_thread); - -static int restore_vsx(struct task_struct *tsk) -{ - if (cpu_has_feature(CPU_FTR_VSX)) { - tsk->thread.used_vsr = 1; - return 1; - } - - return 0; -} -#else -static inline int restore_vsx(struct task_struct *tsk) { return 0; } #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE @@ -511,6 +470,53 @@ void giveup_all(struct task_struct *tsk) } EXPORT_SYMBOL(giveup_all); +#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_FPU +static int restore_fp(struct task_struct *tsk) +{ + if (tsk->thread.load_fp) { + load_fp_state(¤t->thread.fp_state); + current->thread.load_fp++; + return 1; + } + return 0; +} +#else +static int restore_fp(struct task_struct *tsk) { return 0; } +#endif /* CONFIG_PPC_FPU */ + +#ifdef CONFIG_ALTIVEC +#define loadvec(thr) ((thr).load_vec) +static int restore_altivec(struct task_struct *tsk) +{ + if (cpu_has_feature(CPU_FTR_ALTIVEC) && (tsk->thread.load_vec)) { + load_vr_state(&tsk->thread.vr_state); + tsk->thread.used_vr = 1; + tsk->thread.load_vec++; + + return 1; + } + return 0; +} +#else +#define loadvec(thr) 0 +static inline int restore_altivec(struct task_struct *tsk) { return 0; } +#endif /* CONFIG_ALTIVEC */ + +#ifdef CONFIG_VSX +static int restore_vsx(struct task_struct *tsk) +{ + if (cpu_has_feature(CPU_FTR_VSX)) { + tsk->thread.used_vsr = 1; + return 1; + } + + return 0; +} +#else +static inline int restore_vsx(struct task_struct *tsk) { return 0; } +#endif /* CONFIG_VSX */ + /* * The exception exit path calls restore_math() with interrupts hard disabled * but the soft irq state not "reconciled". 
ftrace code that calls @@ -551,6 +557,7 @@ void notrace restore_math(struct pt_regs *regs) regs->msr = msr; } +#endif static void save_all(struct task_struct *tsk) { diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c index 75be20fdb270..a986effee1e0 100644 --- a/arch/powerpc/kernel/syscall_64.c +++ b/arch/powerpc/kernel/syscall_64.c @@ -24,7 +24,11 @@ notrace long system_call_exception(long r3, long r4, long r5, long r6, long r7, unsigned long ti_flags; syscall_fn f; + if (IS_ENABLED(CONFIG_PPC_BOOK3S)) + BUG_ON(!(regs->msr & MSR_RI)); BUG_ON(!(regs->msr & MSR_PR)); + BUG_ON(!FULL_REGS(regs)); + BUG_ON(regs->softe != IRQS_ENABLED); account_cpu_user_entry(); @@ -196,7 +200,7 @@ again: trace_hardirqs_off(); local_paca->irq_happened |= PACA_IRQ_HARD_DIS; local_irq_enable(); - /* Took an interrupt which may have more exit work to do. */ + /* Took an interrupt, may have more exit work to do. */ goto again; } local_paca->irq_happened = 0; @@ -212,3 +216,168 @@ again: return ret; } + +#ifdef CONFIG_PPC_BOOK3S /* BOOK3E not yet using this */ +notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr) +{ +#ifdef CONFIG_PPC_BOOK3E + struct thread_struct *ts = ¤t->thread; +#endif + unsigned long *ti_flagsp = ¤t_thread_info()->flags; + unsigned long ti_flags; + unsigned long flags; + unsigned long ret = 0; + + if (IS_ENABLED(CONFIG_PPC_BOOK3S)) + BUG_ON(!(regs->msr & MSR_RI)); + BUG_ON(!(regs->msr & MSR_PR)); + BUG_ON(!FULL_REGS(regs)); + BUG_ON(regs->softe != IRQS_ENABLED); + + local_irq_save(flags); + +again: + ti_flags = READ_ONCE(*ti_flagsp); + while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { + local_irq_enable(); /* returning to user: may enable */ + if (ti_flags & _TIF_NEED_RESCHED) { + schedule(); + } else { + if (ti_flags & _TIF_SIGPENDING) + ret |= _TIF_RESTOREALL; + do_notify_resume(regs, ti_flags); + } + local_irq_disable(); + ti_flags = READ_ONCE(*ti_flagsp); + } + + if (IS_ENABLED(CONFIG_PPC_BOOK3S) && IS_ENABLED(CONFIG_PPC_FPU)) { + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && + unlikely((ti_flags & _TIF_RESTORE_TM))) { + restore_tm_state(regs); + } else { + unsigned long mathflags = MSR_FP; + + if (cpu_has_feature(CPU_FTR_VSX)) + mathflags |= MSR_VEC | MSR_VSX; + else if (cpu_has_feature(CPU_FTR_ALTIVEC)) + mathflags |= MSR_VEC; + + if ((regs->msr & mathflags) != mathflags) + restore_math(regs); + } + } + + trace_hardirqs_on(); + __hard_EE_RI_disable(); + if (unlikely(lazy_irq_pending())) { + __hard_RI_enable(); + trace_hardirqs_off(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + local_irq_enable(); + local_irq_disable(); + /* Took an interrupt, may have more exit work to do. */ + goto again; + } + local_paca->irq_happened = 0; + irq_soft_mask_set(IRQS_ENABLED); + +#ifdef CONFIG_PPC_BOOK3E + if (unlikely(ts->debug.dbcr0 & DBCR0_IDM)) { + /* + * Check to see if the dbcr0 register is set up to debug. + * Use the internal debug mode bit to do this. 
+ */ + mtmsr(mfmsr() & ~MSR_DE); + mtspr(SPRN_DBCR0, ts->debug.dbcr0); + mtspr(SPRN_DBSR, -1); + } +#endif + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + local_paca->tm_scratch = regs->msr; +#endif + + kuap_check_amr(); + + account_cpu_user_exit(); + + return ret; +} + +void unrecoverable_exception(struct pt_regs *regs); +void preempt_schedule_irq(void); + +notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr) +{ + unsigned long *ti_flagsp = ¤t_thread_info()->flags; + unsigned long flags; + unsigned long ret = 0; + + if (IS_ENABLED(CONFIG_PPC_BOOK3S) && unlikely(!(regs->msr & MSR_RI))) + unrecoverable_exception(regs); + BUG_ON(regs->msr & MSR_PR); + BUG_ON(!FULL_REGS(regs)); + + if (unlikely(*ti_flagsp & _TIF_EMULATE_STACK_STORE)) { + clear_bits(_TIF_EMULATE_STACK_STORE, ti_flagsp); + ret = 1; + } + + local_irq_save(flags); + + if (regs->softe == IRQS_ENABLED) { + /* Returning to a kernel context with local irqs enabled. */ + WARN_ON_ONCE(!(regs->msr & MSR_EE)); +again: + if (IS_ENABLED(CONFIG_PREEMPT)) { + /* Return to preemptible kernel context */ + if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED)) { + if (preempt_count() == 0) + preempt_schedule_irq(); + } + } + + trace_hardirqs_on(); + __hard_EE_RI_disable(); + if (unlikely(lazy_irq_pending())) { + __hard_RI_enable(); + irq_soft_mask_set(IRQS_ALL_DISABLED); + trace_hardirqs_off(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + /* + * Can't local_irq_restore to replay if we were in + * interrupt context. Must replay directly. + */ + if (irqs_disabled_flags(flags)) { + replay_soft_interrupts(); + } else { + local_irq_restore(flags); + local_irq_save(flags); + } + /* Took an interrupt, may have more exit work to do. */ + goto again; + } + local_paca->irq_happened = 0; + irq_soft_mask_set(IRQS_ENABLED); + } else { + /* Returning to a kernel context with local irqs disabled. */ + __hard_EE_RI_disable(); + if (regs->msr & MSR_EE) + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + } + + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + local_paca->tm_scratch = regs->msr; +#endif + + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ + kuap_restore_amr(regs); + + return ret; +} +#endif diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index 25c14a0981bf..d20c5e79e03c 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -134,7 +134,7 @@ _GLOBAL(load_up_vsx) /* enable use of VSX after return */ oris r12,r12,MSR_VSX@h std r12,_MSR(r1) - b fast_exception_return + b fast_interrupt_return #endif /* CONFIG_VSX */ -- cgit v1.2.3 From f1763e623c69bcec2c1e739e990058de41d45030 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 28 Feb 2020 00:14:39 +0000 Subject: powerpc/ptrace: drop unnecessary #ifdefs CONFIG_PPC64 Drop a bunch of #ifdefs CONFIG_PPC64 that are not vital. 
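For example, task_user_regset_view() (in the diff below) goes from a
preprocessor guard:

    #ifdef CONFIG_PPC64
        if (test_tsk_thread_flag(task, TIF_32BIT))
            return &user_ppc_compat_view;
    #endif
        return &user_ppc_native_view;

to a compile-time-constant test that the compiler folds away while still
parsing both branches:

    if (IS_ENABLED(CONFIG_PPC64) && test_tsk_thread_flag(task, TIF_32BIT))
        return &user_ppc_compat_view;
    return &user_ppc_native_view;

Every symbol in the now always-parsed code must exist on both PPC32 and
PPC64, which is why PT_SOFTE gains a PPC32 definition below.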
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/af38b87a7e1e3efe4f9b664eaeb029e6e7d69fdb.1582848567.git.christophe.leroy@c-s.fr --- arch/powerpc/include/asm/ptrace.h | 2 ++ arch/powerpc/kernel/ptrace/ptrace.c | 18 +++--------------- 2 files changed, 5 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 082a40153b94..e0195e6b892b 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -279,6 +279,8 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, #endif /* __ASSEMBLY__ */ #ifndef __powerpc64__ +/* We need PT_SOFTE defined at all time to avoid #ifdefs */ +#define PT_SOFTE PT_MQ #else /* __powerpc64__ */ #define PT_FPSCR32 (PT_FPR0 + 2*32 + 1) /* each FP reg occupies 2 32-bit userspace slots */ #define PT_VR0_32 164 /* each Vector reg occupies 4 slots in 32-bit */ diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 7ed54dbb2d7e..3dd94c296ac7 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -274,17 +274,15 @@ int ptrace_get_reg(struct task_struct *task, int regno, unsigned long *data) if (regno == PT_DSCR) return get_user_dscr(task, data); -#ifdef CONFIG_PPC64 /* * softe copies paca->irq_soft_mask variable state. Since irq_soft_mask is * no more used as a flag, lets force usr to alway see the softe value as 1 * which means interrupts are not soft disabled. */ - if (regno == PT_SOFTE) { + if (IS_ENABLED(CONFIG_PPC64) && regno == PT_SOFTE) { *data = 1; return 0; } -#endif regs_max = sizeof(struct user_pt_regs) / sizeof(unsigned long); if (regno < regs_max) { @@ -1998,7 +1996,6 @@ static const struct user_regset_view user_ppc_native_view = { .regsets = native_regsets, .n = ARRAY_SIZE(native_regsets) }; -#ifdef CONFIG_PPC64 #include static int gpr32_get_common(struct task_struct *target, @@ -2272,14 +2269,11 @@ static const struct user_regset_view user_ppc_compat_view = { .name = "ppc", .e_machine = EM_PPC, .ei_osabi = ELF_OSABI, .regsets = compat_regsets, .n = ARRAY_SIZE(compat_regsets) }; -#endif /* CONFIG_PPC64 */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { -#ifdef CONFIG_PPC64 - if (test_tsk_thread_flag(task, TIF_32BIT)) + if (IS_ENABLED(CONFIG_PPC64) && test_tsk_thread_flag(task, TIF_32BIT)) return &user_ppc_compat_view; -#endif return &user_ppc_native_view; } @@ -3063,11 +3057,7 @@ long arch_ptrace(struct task_struct *child, long request, else dbginfo.num_data_bps = 0; dbginfo.num_condition_regs = 0; -#ifdef CONFIG_PPC64 - dbginfo.data_bp_alignment = 8; -#else - dbginfo.data_bp_alignment = 4; -#endif + dbginfo.data_bp_alignment = sizeof(long); dbginfo.sizeof_condition = 0; #ifdef CONFIG_HAVE_HW_BREAKPOINT dbginfo.features = PPC_DEBUG_FEATURE_DATA_BP_RANGE; @@ -3304,12 +3294,10 @@ long do_syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->gpr[0]); -#ifdef CONFIG_PPC64 if (!is_32bit_task()) audit_syscall_entry(regs->gpr[0], regs->gpr[3], regs->gpr[4], regs->gpr[5], regs->gpr[6]); else -#endif audit_syscall_entry(regs->gpr[0], regs->gpr[3] & 0xffffffff, regs->gpr[4] & 0xffffffff, -- cgit v1.2.3 From b77afad84e1eedca03658ae1478ce5b8ed5aa18c Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 25 Nov 2019 11:20:33 +0200 Subject: powerpc/32: drop unused ISA_DMA_THRESHOLD The 
ISA_DMA_THRESHOLD variable is set by several platforms but never referenced. Remove it. Signed-off-by: Mike Rapoport Reviewed-by: Christoph Hellwig Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20191125092033.20014-1-rppt@kernel.org --- arch/powerpc/include/asm/dma.h | 3 +-- arch/powerpc/kernel/setup_32.c | 1 - arch/powerpc/platforms/44x/warp.c | 3 --- arch/powerpc/platforms/52xx/efika.c | 1 - arch/powerpc/platforms/amigaone/setup.c | 1 - arch/powerpc/platforms/chrp/setup.c | 1 - arch/powerpc/platforms/powermac/setup.c | 1 - 7 files changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/dma.h b/arch/powerpc/include/asm/dma.h index 1b4f0254868f..6161a9596196 100644 --- a/arch/powerpc/include/asm/dma.h +++ b/arch/powerpc/include/asm/dma.h @@ -151,10 +151,9 @@ #define DMA2_EXT_REG 0x4D6 #ifndef __powerpc64__ - /* in arch/ppc/kernel/setup.c -- Cort */ + /* in arch/powerpc/kernel/setup_32.c -- Cort */ extern unsigned int DMA_MODE_WRITE; extern unsigned int DMA_MODE_READ; - extern unsigned long ISA_DMA_THRESHOLD; #else #define DMA_MODE_READ 0x44 /* I/O to memory, no autoinit, increment, single mode */ #define DMA_MODE_WRITE 0x48 /* memory to I/O, no autoinit, increment, single mode */ diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 5b49b26eb154..305ca89d856f 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -58,7 +58,6 @@ EXPORT_SYMBOL_GPL(boot_cpuid_phys); int smp_hw_index[NR_CPUS]; EXPORT_SYMBOL(smp_hw_index); -unsigned long ISA_DMA_THRESHOLD; unsigned int DMA_MODE_READ; unsigned int DMA_MODE_WRITE; diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index 6620b64e4963..665f18e37efb 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -43,9 +43,6 @@ static int __init warp_probe(void) if (!of_machine_is_compatible("pika,warp")) return 0; - /* For arch_dma_alloc */ - ISA_DMA_THRESHOLD = ~0L; - return 1; } diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c index 61538869e88a..4514a6f7458a 100644 --- a/arch/powerpc/platforms/52xx/efika.c +++ b/arch/powerpc/platforms/52xx/efika.c @@ -205,7 +205,6 @@ static int __init efika_probe(void) if (strcmp(model, "EFIKA5K2")) return 0; - ISA_DMA_THRESHOLD = ~0L; DMA_MODE_READ = 0x44; DMA_MODE_WRITE = 0x48; diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c index ea5e45e32683..f5d0bf999759 100644 --- a/arch/powerpc/platforms/amigaone/setup.c +++ b/arch/powerpc/platforms/amigaone/setup.c @@ -146,7 +146,6 @@ static int __init amigaone_probe(void) */ cur_cpu_spec->cpu_features &= ~CPU_FTR_NEED_COHERENT; - ISA_DMA_THRESHOLD = 0x00ffffff; DMA_MODE_READ = 0x44; DMA_MODE_WRITE = 0x48; diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index 8328cd5817b0..65a7e01a8f7d 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -569,7 +569,6 @@ static int __init chrp_probe(void) if (strcmp(dtype, "chrp")) return 0; - ISA_DMA_THRESHOLD = ~0L; DMA_MODE_READ = 0x44; DMA_MODE_WRITE = 0x48; diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index c6d5333729ed..95fb4feb6ccc 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -586,7 +586,6 @@ static int __init pmac_probe(void) #ifdef CONFIG_PPC32 /* isa_io_base gets set 
in pmac_pci_init */ - ISA_DMA_THRESHOLD = ~0L; DMA_MODE_READ = 1; DMA_MODE_WRITE = 2; #endif /* CONFIG_PPC32 */ -- cgit v1.2.3 From c17eb4dca5a353a9dbbb8ad6934fe57af7165e91 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Mon, 30 Mar 2020 10:03:56 +0200 Subject: powerpc: Make setjmp/longjmp signature standard Declaring setjmp()/longjmp() as taking longs makes the signature non-standard, and makes clang complain. In the past, this has been worked around by adding -ffreestanding to the compile flags. The implementation looks like it only ever propagates the value (in longjmp) or sets it to 1 (in setjmp), and we only call longjmp with integer parameters. This allows removing -ffreestanding from the compilation flags. Fixes: c9029ef9c957 ("powerpc: Avoid clang warnings around setjmp and longjmp") Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Clement Courbet Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200330080400.124803-1-courbet@google.com --- arch/powerpc/include/asm/setjmp.h | 6 ++++-- arch/powerpc/kexec/Makefile | 3 --- arch/powerpc/xmon/Makefile | 3 --- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/include/asm') diff --git a/arch/powerpc/include/asm/setjmp.h b/arch/powerpc/include/asm/setjmp.h index e9f81bb3f83b..f798e80e4106 100644 --- a/arch/powerpc/include/asm/setjmp.h +++ b/arch/powerpc/include/asm/setjmp.h @@ -7,7 +7,9 @@ #define JMP_BUF_LEN 23 -extern long setjmp(long *) __attribute__((returns_twice)); -extern void longjmp(long *, long) __attribute__((noreturn)); +typedef long jmp_buf[JMP_BUF_LEN]; + +extern int setjmp(jmp_buf env) __attribute__((returns_twice)); +extern void longjmp(jmp_buf env, int val) __attribute__((noreturn)); #endif /* _ASM_POWERPC_SETJMP_H */ diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile index 378f6108a414..86380c69f5ce 100644 --- a/arch/powerpc/kexec/Makefile +++ b/arch/powerpc/kexec/Makefile @@ -3,9 +3,6 @@ # Makefile for the linux kernel. # -# Avoid clang warnings around longjmp/setjmp declarations -CFLAGS_crash.o += -ffreestanding - obj-y += core.o crash.o core_$(BITS).o obj-$(CONFIG_PPC32) += relocate_32.o diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index c3842dbeb1b7..6f9cccea54f3 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -1,9 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for xmon -# Avoid clang warnings around longjmp/setjmp declarations -subdir-ccflags-y := -ffreestanding - GCOV_PROFILE := n KCOV_INSTRUMENT := n UBSAN_SANITIZE := n -- cgit v1.2.3
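With the standard prototypes above, kernel callers can use the usual C
idiom. A minimal sketch, in the spirit of how xmon recovers from faulting
memory accesses (the names here are hypothetical, not from the tree):

    #include <linux/errno.h>
    #include <asm/setjmp.h>

    static jmp_buf recovery_buf;

    static int probe_address(unsigned long *addr, unsigned long *val)
    {
        if (setjmp(recovery_buf) == 0) {
            *val = *addr;   /* may fault */
            return 0;
        }
        /* a fault handler calling longjmp(recovery_buf, 1) lands here */
        return -EFAULT;
    }

setjmp() returns 0 on the initial call; a later longjmp(env, val) returns
control to that point with setjmp() appearing to return val (forced to 1
when val is 0, per the implementation behaviour noted in the message above).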